diff --git a/.github/scripts/translate_docs.py b/.github/scripts/translate_docs.py
index c5af35fd6..f9879d4d1 100644
--- a/.github/scripts/translate_docs.py
+++ b/.github/scripts/translate_docs.py
@@ -12,7 +12,6 @@
import os
import subprocess
-import sys
from pathlib import Path
from anthropic import Anthropic
@@ -20,6 +19,15 @@
EN_DIR = "docs/src/en"
ES_DIR = "docs/src/es"
+# Source files outside EN_DIR whose Spanish translation lives in ES_DIR. The
+# English docs page for these is an mdbook {{#include}} of the source file, so
+# the source file is the single source of truth and drives its es/ translation.
+EXTERNAL_SOURCES = {"CONTRIBUTING.md": f"{ES_DIR}/contributing.md"}
+
+# English doc files that must not be translated directly — e.g. pages that are
+# just an mdbook {{#include}} of a source handled via EXTERNAL_SOURCES above.
+SKIP_EN_FILES = {f"{EN_DIR}/contributing.md"}
+
# Files to use as style/terminology reference (picked for breadth of patterns)
REFERENCE_FILES = ["archs4.md", "blast.md", "info.md"]
@@ -80,31 +88,46 @@
"""
+def es_target(filepath):
+ """Map an English/source doc path to its Spanish counterpart path."""
+ if filepath in EXTERNAL_SOURCES:
+ return EXTERNAL_SOURCES[filepath]
+ return filepath.replace(EN_DIR, ES_DIR, 1)
+
+
def get_changed_files(before_sha, after_sha):
- """Return dict of added/modified/deleted English doc files."""
+ """Return dict of added/modified/deleted documentation source files.
+
+ Watches the English docs directory plus any external source files
+ (e.g. the root CONTRIBUTING.md, which the English docs page includes).
+ """
+ watched = [EN_DIR, *EXTERNAL_SOURCES]
# Check if before_sha is a valid commit
- is_valid = subprocess.run(
- ["git", "cat-file", "-t", before_sha],
- capture_output=True,
- text=True,
- ).returncode == 0
+ is_valid = (
+ subprocess.run(
+ ["git", "cat-file", "-t", before_sha],
+ capture_output=True,
+ text=True,
+ ).returncode
+ == 0
+ )
if not is_valid:
# Initial push or invalid ref — treat all current files as new
result = subprocess.run(
- ["git", "ls-tree", "-r", "--name-only", after_sha, "--", EN_DIR],
+ ["git", "ls-tree", "-r", "--name-only", after_sha, "--", *watched],
capture_output=True,
text=True,
check=True,
)
return {
- "added": [f for f in result.stdout.strip().split("\n") if f],
+ "added": [f for f in result.stdout.strip().split("\n") if f and f not in SKIP_EN_FILES],
"modified": [],
"deleted": [],
}
result = subprocess.run(
- ["git", "diff", "--name-status", before_sha, after_sha, "--", EN_DIR],
+ ["git", "diff", "--name-status", before_sha, after_sha, "--", *watched],
capture_output=True,
text=True,
check=True,
@@ -125,6 +148,10 @@ def get_changed_files(before_sha, after_sha):
elif status == "R":
files["deleted"].append(parts[1])
files["added"].append(parts[2])
+
+ # Drop English pages that must not be translated directly (handled elsewhere).
+ for key in files:
+ files[key] = [f for f in files[key] if f not in SKIP_EN_FILES]
return files
@@ -151,9 +178,7 @@ def load_reference_files():
def build_reference_block(references):
"""Format reference files into a single text block."""
- return "\n\n---\n\n".join(
- f"=== {name} ===\n{content}" for name, content in references.items()
- )
+ return "\n\n---\n\n".join(f"=== {name} ===\n{content}" for name, content in references.items())
def clean_model_output(text):
@@ -231,6 +256,7 @@ def translate_diff(client, diff_text, en_content, es_content, filename, ref_bloc
def main():
+ """Translate English docs changed between two commits into Spanish."""
before_sha = os.environ.get("BEFORE_SHA", "").strip()
after_sha = os.environ.get("AFTER_SHA", "HEAD").strip()
@@ -259,7 +285,7 @@ def main():
# --- Deletions ---
for filepath in changed["deleted"]:
- es_path = filepath.replace(EN_DIR, ES_DIR, 1)
+ es_path = es_target(filepath)
if Path(es_path).exists():
Path(es_path).unlink()
print(f"Deleted: {es_path}")
@@ -270,7 +296,7 @@ def main():
filename = Path(filepath).name
print(f"Translating new file: {filename} ...")
translated = translate_new_file(client, en_content, filename, ref_block)
- es_path = filepath.replace(EN_DIR, ES_DIR, 1)
+ es_path = es_target(filepath)
Path(es_path).parent.mkdir(parents=True, exist_ok=True)
Path(es_path).write_text(translated)
print(f" -> Created: {es_path}")
@@ -278,7 +304,7 @@ def main():
# --- Modified files ---
for filepath in changed["modified"]:
filename = Path(filepath).name
- es_path = filepath.replace(EN_DIR, ES_DIR, 1)
+ es_path = es_target(filepath)
en_content = Path(filepath).read_text()
if not Path(es_path).exists():
@@ -292,9 +318,7 @@ def main():
continue
es_content = Path(es_path).read_text()
print(f"Applying edits to {filename} ...")
- translated = translate_diff(
- client, diff_text, en_content, es_content, filename, ref_block
- )
+ translated = translate_diff(client, diff_text, en_content, es_content, filename, ref_block)
Path(es_path).parent.mkdir(parents=True, exist_ok=True)
Path(es_path).write_text(translated)
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 000000000..c8a62834b
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,25 @@
+name: Check Build
+
+on:
+ push:
+ branches: [main]
+ pull_request:
+ branches: [main]
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ package:
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: Install uv
+ uses: astral-sh/setup-uv@v7
+ - name: Build package
+ run: uv build
+ - name: Check package
+ run: uvx twine check --strict dist/*.whl
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 461b3dae0..6759d5e9f 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,37 +1,72 @@
name: CI - tests
on:
- # Scheduled runs twice weekly.
- # These runs execute tests on both Python versions, save the pytest output to a file,
- # upload the file as an artifact, and commit the 3.12 report back to main.
+ # Scheduled runs twice weekly: save the pytest output to a file, upload it as
+ # an artifact, and commit the 3.12 report back to the branch.
schedule:
- cron: "0 16 * * 1,4"
- # Push runs only when package code or tests change.
- # These runs execute tests normally and fail immediately on test failure.
+ # Run post-merge on the integration branches only — pushes to a PR's feature
+ # branch are already covered by the pull_request event below, so scoping push
+ # to main/dev avoids running the suite twice for the same commit.
push:
+ branches: [main, dev]
paths:
- "gget/**"
- "tests/**"
- # Avoid recursively triggering on committed pytest result files.
+ - "pyproject.toml"
+ # Avoid recursively triggering on the bot-committed pytest result files.
- "!tests/pytest_results_py*.txt"
+ # Run on every pull request into the integration branches.
+ pull_request:
+ branches: [main, dev]
+ paths:
+ - "gget/**"
+ - "tests/**"
+ - "pyproject.toml"
# Manual runs behave like scheduled runs:
- # save output, upload artifact, and optionally commit report back to main.
+ # save output, upload artifact, and commit the report back to the branch.
workflow_dispatch:
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
permissions:
- contents: write
+ contents: read
jobs:
- build:
- name: Test on Python ${{ matrix.python }}
- runs-on: ubuntu-22.04
+ # Derive the test matrix from pyproject.toml ([tool.hatch.envs.hatch-test]),
+ # so the tested environments are defined in a single place and stay identical
+ # locally (`hatch test`) and in CI.
+ get-environments:
+ runs-on: ubuntu-latest
+ outputs:
+ envs: ${{ steps.get-envs.outputs.envs }}
+ steps:
+ - uses: actions/checkout@v4
+ - name: Install uv
+ uses: astral-sh/setup-uv@v7
+ - name: Get test environments from hatch
+ id: get-envs
+ run: |
+ ENVS_JSON=$(uvx hatch env show --json | jq -c 'to_entries
+ | map(select(.key | startswith("hatch-test")) | { name: .key, python: .value.python })')
+ echo "envs=${ENVS_JSON}" | tee "$GITHUB_OUTPUT"
+
+ test:
+ needs: get-environments
+ name: ${{ matrix.env.name }}
+ runs-on: ubuntu-latest
+ permissions:
+ contents: write # commit pytest report back on scheduled/manual runs
+ id-token: write # codecov OIDC
strategy:
fail-fast: false
matrix:
- python: ["3.11", "3.12"]
+ env: ${{ fromJSON(needs.get-environments.outputs.envs) }}
steps:
- name: Checkout branch
@@ -39,64 +74,74 @@ jobs:
with:
fetch-depth: 0
- - name: Setup python
- uses: actions/setup-python@v5
+ - name: Install uv
+ uses: astral-sh/setup-uv@v7
with:
- python-version: ${{ matrix.python }}
-
- - name: Install dependencies
- run: |
- python -m pip install -r requirements.txt
- python -m pip install -r dev-requirements.txt
-
- # Push behavior:
- # run pytest normally and let this step fail the job immediately if tests fail.
- - name: Run tests on push
- if: github.event_name == 'push'
- run: |
- pytest -ra -v --tb=long --durations=10 \
- --cov=gget --cov-report=term-missing tests
-
- # Scheduled/manual behavior:
- # run pytest, save full output to a file, and capture the real pytest exit code.
- #
- # Important:
- # - GitHub bash shells may stop on non-zero commands before later lines run.
- # - We temporarily disable errexit with "set +e" so failed tests do not prevent
- # us from recording PIPESTATUS[0] and writing it to GITHUB_OUTPUT.
- # - continue-on-error keeps later artifact/commit steps running.
- - name: Run tests and save output for scheduled/manual runs
+ python-version: ${{ matrix.env.python }}
+
+ # Builds the environment (project + test dependency-group, plus the
+ # cellxgene extra only where pyproject says it is available).
+ - name: Create hatch test environment
+ run: uvx hatch env create ${{ matrix.env.name }}
+
+ # Push/PR: run tests and fail the job immediately on test failure.
+ - name: Run tests (push / pull_request)
+ if: github.event_name == 'push' || github.event_name == 'pull_request'
+ env:
+ MPLBACKEND: agg
+ run: uvx hatch run ${{ matrix.env.name }}:run-cov -ra -v --durations=10
+
+ # Scheduled/manual: save full output to a file and capture the real pytest
+ # exit code. "set +e" keeps a test failure from preventing the exit-code /
+ # artifact / report-commit handling below; continue-on-error does the same
+ # at the step level.
+ - name: Run tests and save output (schedule / workflow_dispatch)
id: pytest_saved
if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
continue-on-error: true
shell: bash
+ env:
+ MPLBACKEND: agg
run: |
set -o pipefail
- OUT="tests/pytest_results_py${{ matrix.python }}.txt"
- echo "Pytest results (Python ${{ matrix.python }}) - $(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$OUT"
+ OUT="tests/pytest_results_py${{ matrix.env.python }}.txt"
+ echo "Pytest results (Python ${{ matrix.env.python }}) - $(date -u +"%Y-%m-%dT%H:%M:%SZ")" > "$OUT"
echo "" >> "$OUT"
set +e
- pytest -ra -v --tb=long --durations=10 \
- --cov=gget --cov-report=term-missing tests 2>&1 | tee -a "$OUT"
+ uvx hatch run ${{ matrix.env.name }}:run-cov -ra -v --durations=10 2>&1 | tee -a "$OUT"
code=${PIPESTATUS[0]}
set -e
echo "exit_code=$code" >> "$GITHUB_OUTPUT"
echo "pytest exit code: $code"
-
- # Do not fail here; a later step fails the job after artifacts/report handling.
exit 0
+ # Coverage reporting (generation and upload) is best-effort: a failure in
+ # these steps must not mask the test result, which other steps handle.
+ - name: Generate coverage report
+ if: always()
+ continue-on-error: true
+ run: |
+ test -f .coverage || uvx hatch run ${{ matrix.env.name }}:cov-combine
+ uvx hatch run ${{ matrix.env.name }}:coverage xml
+
+ - name: Upload coverage to Codecov
+ if: always()
+ uses: codecov/codecov-action@v6
+ with:
+ use_oidc: true
+ fail_ci_if_error: false
+
# Upload the saved pytest report as an artifact.
- # Only do this once (3.12) to avoid duplicate artifacts from the matrix.
+ # Done only for the 3.12 entry to avoid duplicate artifacts from the matrix.
- name: Upload pytest results artifact
- if: always() && matrix.python == '3.12' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
+ if: always() && matrix.env.python == '3.12' && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
uses: actions/upload-artifact@v4
with:
- name: pytest-results-py${{ matrix.python }}
- path: tests/pytest_results_py${{ matrix.python }}.txt
+ name: pytest-results-py${{ matrix.env.python }}
+ path: tests/pytest_results_py${{ matrix.env.python }}.txt
# Commit the saved pytest report back to the repository.
# Safety guards:
@@ -107,27 +152,27 @@ jobs:
- name: Commit and push pytest results
if: >
always() &&
- matrix.python == '3.12' &&
+ matrix.env.python == '3.12' &&
(github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
shell: bash
run: |
set -euo pipefail
-
+
BRANCH="${GITHUB_REF#refs/heads/}"
echo "Current branch: $BRANCH"
-
+
git config user.name "github-actions[bot]"
git config user.email "github-actions[bot]@users.noreply.github.com"
-
+
git add tests/pytest_results_py*.txt
-
+
if git diff --cached --quiet; then
echo "No changes to commit."
exit 0
fi
-
+
git commit -m "CI: update pytest results ($BRANCH)"
-
+
for attempt in 1 2 3 4 5; do
echo "Push attempt $attempt..."
git pull --rebase --autostash origin "$BRANCH" || true
@@ -136,13 +181,12 @@ jobs:
fi
sleep $((attempt * 5))
done
-
+
echo "Push failed after retries."
exit 1
# After scheduled/manual runs, explicitly fail the job if pytest failed.
- # This step is separate so that artifact upload and report commit can still happen
- # even when tests fail.
+ # Kept as a separate step so artifact upload and report commit still happen on failure.
- name: Fail job if pytest failed
if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')
shell: bash
@@ -150,7 +194,6 @@ jobs:
code="${{ steps.pytest_saved.outputs.exit_code }}"
echo "Captured pytest exit code: ${code:-}"
- # Missing output means something went wrong before exit code capture.
if [ -z "${code:-}" ]; then
echo "pytest exit code was not captured"
exit 1
@@ -159,3 +202,17 @@ jobs:
if [ "$code" != "0" ]; then
exit "$code"
fi
+
+ # Single gate job so branch protection can require one stable check name
+ # instead of every matrix entry. See https://github.com/re-actors/alls-green.
+ check:
+ name: Tests pass
+ if: always()
+ needs:
+ - get-environments
+ - test
+ runs-on: ubuntu-latest
+ steps:
+ - uses: re-actors/alls-green@release/v1
+ with:
+ jobs: ${{ toJSON(needs) }}
diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml
index 220a8f49d..46da29aba 100644
--- a/.github/workflows/deploy.yml
+++ b/.github/workflows/deploy.yml
@@ -3,7 +3,7 @@ on:
push:
branches:
- main
-
+
paths:
- 'docs/**'
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
new file mode 100644
index 000000000..89decb39c
--- /dev/null
+++ b/.github/workflows/release.yml
@@ -0,0 +1,26 @@
+name: Release
+
+on:
+ release:
+ types: [published]
+
+# Use "trusted publishing", see https://docs.pypi.org/trusted-publishers/
+jobs:
+ release:
+ name: Upload release to PyPI
+ runs-on: ubuntu-latest
+ environment:
+ name: pypi
+ url: https://pypi.org/p/gget
+ permissions:
+ id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
+ steps:
+ - uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: Install uv
+ uses: astral-sh/setup-uv@v7
+ - name: Build package
+ run: uv build
+ - name: Publish package distributions to PyPI
+ uses: pypa/gh-action-pypi-publish@release/v1
diff --git a/.github/workflows/traffic.yml b/.github/workflows/traffic.yml
index 29da6d30b..3d578671c 100644
--- a/.github/workflows/traffic.yml
+++ b/.github/workflows/traffic.yml
@@ -1,9 +1,9 @@
name: Repo Traffic Back Up
on:
- schedule:
+ schedule:
# Runs every week
- cron: "0 0 */7 * *"
-
+
jobs:
# This workflow stores repository traffic and clones past the default 2 week period
traffic:
@@ -15,14 +15,14 @@ jobs:
- uses: actions/checkout@v2
with:
ref: "traffic"
-
+
# Calculates traffic and clones and stores them in a CSV file
# This workflow is based on https://github.com/marketplace/actions/repository-traffic
- - name: GitHub traffic
+ - name: GitHub traffic
uses: sangonzal/repository-traffic-action@v.0.1.6
env:
- TRAFFIC_ACTION_TOKEN: ${{ secrets.TRAFFIC_ACTION_TOKEN }}
-
+ TRAFFIC_ACTION_TOKEN: ${{ secrets.TRAFFIC_ACTION_TOKEN }}
+
# Commits files to traffic branch
- name: Commit changes
uses: EndBug/add-and-commit@v4
diff --git a/.github/workflows/translate_docs.yml b/.github/workflows/translate_docs.yml
index 583639ec6..656d9277c 100644
--- a/.github/workflows/translate_docs.yml
+++ b/.github/workflows/translate_docs.yml
@@ -5,6 +5,9 @@ on:
branches: [main]
paths:
- 'docs/src/en/**'
+ # The English contributing page is an mdbook include of this file, so a
+ # change here must regenerate docs/src/es/contributing.md (see translate_docs.py).
+ - 'CONTRIBUTING.md'
workflow_dispatch:
# Only one translation run at a time; new pushes cancel in-progress runs.
diff --git a/.gitignore b/.gitignore
index 1e8e1f52b..c4bfe26d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -144,3 +144,5 @@ dmypy.json
# VSCode settings
.vscode/
+# uv lockfile: gget is a library, so dependencies are resolved fresh rather than pinned; hatch manages the CI envs itself
+uv.lock
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 000000000..31447cbf5
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,49 @@
+fail_fast: false
+default_language_version:
+ python: python3
+default_stages:
+ - pre-commit
+ - pre-push
+minimum_pre_commit_version: 2.16.0
+# Never reformat vendored binaries, bundled package data, test reference
+# fixtures, or the auto-generated CI report — these are content/data whose
+# exact bytes matter (CRLF in .pdb fixtures, exact-match test inputs, ...).
+exclude: |
+ (?x)^(
+ gget/bins/
+ | gget/constants/
+ | tests/fixtures/
+ | tests/pytest_results_py.*\.txt
+ )
+repos:
+ - repo: https://github.com/biomejs/pre-commit
+ rev: v2.4.16
+ hooks:
+ - id: biome-format
+ - repo: https://github.com/tox-dev/pyproject-fmt
+ rev: v2.23.0
+ hooks:
+ - id: pyproject-fmt
+ - repo: https://github.com/astral-sh/ruff-pre-commit
+ rev: v0.15.15
+ hooks:
+ - id: ruff-check
+ types_or: [python, pyi, jupyter]
+ args: [--fix, --exit-non-zero-on-fix]
+ - id: ruff-format
+ types_or: [python, pyi, jupyter]
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v6.0.0
+ hooks:
+ - id: detect-private-key
+ - id: check-ast
+ - id: end-of-file-fixer
+ - id: mixed-line-ending
+ args: [--fix=lf]
+ - id: trailing-whitespace
+ # Preserve Markdown hard line breaks (trailing double-space) in docs.
+ args: [--markdown-linebreak-ext=md]
+ - id: check-case-conflict
+ # Check for leftover merge-conflict markers (these can be introduced by a template sync)
+ - id: check-merge-conflict
+ args: [--assume-in-merge]
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 01fe0ef36..feb45069f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -40,10 +40,10 @@ Commit the changes once you are happy with them.
1. Review the content for technical accuracy.
2. Copy-edit the changes/comments for grammar, spelling, and adherence to the general style of existing gget code.
-3. Format your code using [black](https://black.readthedocs.io/en/stable/getting_started.html).
+3. Format and lint your code with the [pre-commit](https://pre-commit.com/) hooks (powered by [ruff](https://docs.astral.sh/ruff/)), using either `pre-commit` itself or its drop-in replacement [prek](https://github.com/j178/prek). Install the hooks once with `prek install` (or `pre-commit install`) so they run automatically on every commit, or run them on demand with `prek run --all-files` (or `pre-commit run --all-files`).
4. Make sure the unit tests pass:
- - Developer dependencies can be installed with `pip install -r dev-requirements.txt`
- - Run existing unit tests from the gget repository root with `coverage run -m pytest -ra -v tests && coverage report --omit=main.py,tests*`
+ - The tested environments are defined in `pyproject.toml` under `[tool.hatch.envs.hatch-test]` (the single source of truth used by CI). Run the full matrix with `uvx hatch test`.
+ - For a quick single-environment run, install the test dependencies with `uv sync --group test` and run `uv run pytest -ra -v --cov=gget --cov-report=term-missing tests`. To also exercise the `gget cellxgene` module, install its extra (`uv sync --group test --extra cellxgene`) on Python 3.12/3.13 — its dependency has no wheels for newer Python versions yet, and that test skips itself when the dependency is absent.
5. Add new unit tests if applicable:
- Arguments and expected results are stored in json files in ./tests/fixtures/
- Unit tests can be added to ./tests/test_*.py and will be automatically detected
@@ -59,7 +59,7 @@ If you have any questions, feel free to start a [discussion](https://github.com/
When you're finished with the changes, [create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request), also known as a PR.
-‼️ Please make all PRs against the `dev` branch of the gget repository.
+‼️ Please make all PRs against the `dev` branch of the gget repository.
- Don't forget to [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) if you are solving one.
- Enable the checkbox to [allow maintainer edits](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/allowing-changes-to-a-pull-request-branch-created-from-a-fork) so the branch can be updated for a merge.
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 85c2d56eb..000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,4 +0,0 @@
-include LICENSE
-include requirements.txt
-recursive-include gget/bins *
-recursive-include gget/constants *
\ No newline at end of file
diff --git a/README.md b/README.md
index 945026107..119f6af0e 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,10 @@
`gget` is part of the [scverse®](https://scverse.org) project and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). If you like `gget` and want to support our mission, please consider making a tax-deductible [donation](https://opencollective.com/scverse/projects/scverse-gget/donate?interval=oneTime&amount=20&contributeAs=me).
-
+

-
-If you use `gget` in a publication, please [cite*](https://pachterlab.github.io/gget/en/cite.html):
+
+If you use `gget` in a publication, please [cite*](https://pachterlab.github.io/gget/en/cite.html):
```
Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836
```
@@ -42,7 +42,7 @@ For use in Jupyter Lab / Google Colab:
# Python
import gget
```
-# [🔗 Manual](https://pachterlab.github.io/gget)
+# [🔗 Manual](https://pachterlab.github.io/gget)
# 🪄 Quick start guide
Command line:
diff --git a/dev-requirements.txt b/dev-requirements.txt
deleted file mode 100644
index d3f679228..000000000
--- a/dev-requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-coverage>=5.1
-pytest>=7.0.0
-pytest-cov>=6.2.1
-openai<=0.28.1
-cellxgene-census
-parameterized==0.9.0
-bravado==11.0.3
diff --git a/docs/src/SUMMARY.md b/docs/src/SUMMARY.md
index 9ccbe4bed..ca34b5f9c 100644
--- a/docs/src/SUMMARY.md
+++ b/docs/src/SUMMARY.md
@@ -9,7 +9,7 @@
* [Quick Start Guide](en/quick_start_guide.md)
# Manual
-* [gget 8cube](en/8cube.md)
+* [gget 8cube](en/8cube.md)
* [gget alphafold](en/alphafold.md)
* [gget archs4](en/archs4.md)
* [gget bgee](en/bgee.md)
@@ -25,12 +25,12 @@
* [gget info](en/info.md)
* [gget muscle](en/muscle.md)
* [gget mutate](en/mutate.md)
-* [gget opentargets](en/opentargets.md)
+* [gget opentargets](en/opentargets.md)
* [gget pdb](en/pdb.md)
* [gget ref](en/ref.md)
* [gget search](en/search.md)
* [gget setup](en/setup.md)
-* [gget seq](en/seq.md)
+* [gget seq](en/seq.md)
* [gget virus](en/virus.md)
---
@@ -44,14 +44,14 @@
# Español
* [Introdución](es/introduction.md)
* [¡Lo más reciente!](es/updates.md)
-* [Dependientes y Noticias](es/dependents.md)
+* [Dependientes y Noticias](es/dependents.md)
# Guía del usario
* [Instalación](es/installation.md)
* [Guía de inicio rápido](es/quick_start_guide.md)
# Manuál
-* [gget 8cube](es/8cube.md)
+* [gget 8cube](es/8cube.md)
* [gget alphafold](es/alphafold.md)
* [gget archs4](es/archs4.md)
* [gget bgee](es/bgee.md)
@@ -67,7 +67,7 @@
* [gget info](es/info.md)
* [gget muscle](es/muscle.md)
* [gget mutate](es/mutate.md)
-* [gget opentargets](es/opentargets.md)
+* [gget opentargets](es/opentargets.md)
* [gget pdb](es/pdb.md)
* [gget ref](es/ref.md)
* [gget search](es/search.md)
@@ -80,4 +80,3 @@
* [Guía de contribución](es/contributing.md)
* [Codigo de conducto](es/code_of_conduct.md)
* [Cómo citar](es/cite.md)
-
diff --git a/docs/src/en/8cube.md b/docs/src/en/8cube.md
index b7dbf9e32..684ec7512 100644
--- a/docs/src/en/8cube.md
+++ b/docs/src/en/8cube.md
@@ -36,7 +36,7 @@ Gene symbols or Ensembl gene IDs. Multiple genes allowed.
**Optional arguments**
`-csv` `--csv`
-Returns CSV instead of JSON (command-line only).
+Returns CSV instead of JSON (command-line only).
Python: Use `json=False` (default DataFrame) or `json=True` for JSON.
`-o` `--out`
diff --git a/docs/src/en/alphafold.md b/docs/src/en/alphafold.md
index 27c26403f..b647e8936 100644
--- a/docs/src/en/alphafold.md
+++ b/docs/src/en/alphafold.md
@@ -13,10 +13,10 @@ Before using `gget alphafold` for the first time:
`conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0`
For Python version 3.11:
`conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0`
-
+
Recommendation: Follow with `conda update -qy conda` to update conda to the latest version afterwards.
-
-3. Run `gget setup alphafold` / `gget.setup("alphafold")` once (also see [`gget setup`](setup.md)). Running `gget setup alphafold` / `gget.setup("alphafold")` will download and install the latest version of AlphaFold2 hosted on the [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). You can rerun this command any time to update the software after a new AlphaFold release.
+
+3. Run `gget setup alphafold` / `gget.setup("alphafold")` once (also see [`gget setup`](setup.md)). Running `gget setup alphafold` / `gget.setup("alphafold")` will download and install the latest version of AlphaFold2 hosted on the [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). You can rerun this command any time to update the software after a new AlphaFold release.
**Positional argument**
`sequence`
@@ -27,27 +27,27 @@ Amino acid sequence (str), or list of sequences (*gget alphafold will automatica
The multimer model will continue recycling until the predictions stop changing, up to the limit set here. Default: 3.
For higher accuracy, at the potential cost of longer inference times, set this to 20.
-`-o` `--out`
+`-o` `--out`
Path to folder to save prediction results in (str). Default: "./[date_time]_gget_alphafold_prediction".
-
-**Flags**
+
+**Flags**
`-mfm` `--multimer_for_monomer`
Use multimer model for a monomer.
-`-r` `--relax`
-AMBER relax the best model.
+`-r` `--relax`
+AMBER relax the best model.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
`plot`
Python only. `plot=True` provides an interactive, 3D graphical overview of the predicted structure and alignment quality using [py3Dmol](https://pypi.org/project/py3Dmol/) and [matplotlib](https://matplotlib.org/) (default: True).
`show_sidechains`
Python only. `show_sidechains=True` includes side chains in the plot (default: True).
-
-
+
+
### Example
```bash
# Generate new prediction from amino acid sequence
@@ -83,7 +83,7 @@ gget.pdb("2K42", save=True)
### [🔗 gget alphafold FAQ](https://github.com/pachterlab/gget/discussions/39)
# References
-If you use `gget alphafold` in a publication, please cite the following articles:
+If you use `gget alphafold` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/archs4.md b/docs/src/en/archs4.md
index 373b1b7e2..23371433e 100644
--- a/docs/src/en/archs4.md
+++ b/docs/src/en/archs4.md
@@ -17,15 +17,15 @@ Alternatively: use flag `--ensembl` to input an Ensembl gene IDs, e.g. ENSG00000
'tissue' returns a tissue expression atlas calculated from human or mouse samples (as defined by 'species') in [ARCHS4](https://maayanlab.cloud/archs4/).
`-s` `--species`
-'human' (default) or 'mouse'.
+'human' (default) or 'mouse'.
Defines whether to use human or mouse samples from [ARCHS4](https://maayanlab.cloud/archs4/).
(Only for tissue expression atlas.)
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
Python: `save=True` will save the output in the current working directory.
-
-**Flags**
+
+**Flags**
`-e` `--ensembl`
Add this flag if `gene` is given as an Ensembl gene ID.
@@ -33,11 +33,11 @@ Add this flag if `gene` is given as an Ensembl gene ID.
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
-
-
+Python: Use `verbose=False` to prevent progress information from being displayed.
+
+
### Examples
```bash
gget archs4 ACE2
@@ -49,10 +49,10 @@ gget.archs4("ACE2")
→ Returns the 100 most correlated genes to ACE2:
| gene_symbol | pearson_correlation |
-| -------------- |-------------------------|
-| SLC5A1 | 0.579634 |
-| CYP2C18 | 0.576577 |
-| . . . | . . . |
+| -------------- |-------------------------|
+| SLC5A1 | 0.579634 |
+| CYP2C18 | 0.576577 |
+| . . . | . . . |
@@ -66,9 +66,9 @@ gget.archs4("ACE2", which="tissue")
→ Returns the tissue expression of ACE2 (by default, human data is used):
| id | min | q1 | median | q3 | max |
-| ------ |--------| ------ |--------| ------ |--------|
+| ------ |--------| ------ |--------| ------ |--------|
| System.Urogenital/Reproductive System.Kidney.RENAL CORTEX | 0.113644 | 8.274060 | 9.695840 | 10.51670 | 11.21970 |
-| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 |
+| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 |
| . . . | . . . | . . . | . . . | . . . | . . . |
@@ -80,7 +80,7 @@ Check out [this tutorial](https://davetang.org/muse/2023/05/16/check-where-a-gen
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget archs4` in a publication, please cite the following articles:
+If you use `gget archs4` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/bgee.md b/docs/src/en/bgee.md
index 6ea2b7833..7a49ee457 100644
--- a/docs/src/en/bgee.md
+++ b/docs/src/en/bgee.md
@@ -2,7 +2,7 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget bgee 🐝
-Fetch orthology and gene expression data from [Bgee](https://www.bgee.org/) using Ensembl IDs.
+Fetch orthology and gene expression data from [Bgee](https://www.bgee.org/) using Ensembl IDs.
Return format: JSON/CSV (command-line) or data frame (Python).
> If you are specifically interested in human gene expression data, consider using [gget opentargets](./opentargets.md) or [gget archs4](./archs4.md) instead.
@@ -21,19 +21,19 @@ NOTE: Some of the species in [Bgee](https://www.bgee.org/) are not in Ensembl or
`-t` `--type`
Type of data to fetch. Options: `orthologs` (default), `expression`.
-`-o` `--out`
+`-o` `--out`
Path to the JSON file the results will be saved in, e.g. path/to/directory/results.json. Default: Standard out.
-**Flags**
+**Flags**
`-csv` `--csv`
Command-line only. Returns the output in CSV format, instead of JSON format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
-
-
+
+
### Examples
**Get orthologs for a gene**
@@ -102,11 +102,11 @@ gget.bgee(["ENSBTAG00000047356", "ENSBTAG00000018317"], type="expression")
| BGEE:0000000 | anatomical entity and cellular component | 89.12 | high| expressed |
| ... | ... | ... | ... | ... |
-
+
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget bgee` in a publication, please cite the following articles:
+If you use `gget bgee` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/blast.md b/docs/src/en/blast.md
index 10c7ad2fa..4aee7a527 100644
--- a/docs/src/en/blast.md
+++ b/docs/src/en/blast.md
@@ -6,7 +6,7 @@ BLAST a nucleotide or amino acid sequence to any [BLAST](https://blast.ncbi.nlm.
Return format: JSON (command-line) or data frame/CSV (Python).
**Positional argument**
-`sequence`
+`sequence`
Nucleotide or amino acid sequence, or path to FASTA or .txt file.
**Optional arguments**
@@ -25,8 +25,8 @@ Limits number of hits to return. Default: 50.
`-e` `--expect`
Defines the [expect value](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ#expect) cutoff. Default: 10.0.
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
Python: `save=True` will save the output in the current working directory.
**Flags**
@@ -40,13 +40,13 @@ Turns off MegaBLAST algorithm. Default: MegaBLAST on (blastn only).
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
`wrap_text`
-Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False).
-
+Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False).
+
### Example
```bash
gget blast MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQLPSEKAIFLFVDKTVPQSR
@@ -60,7 +60,7 @@ gget.blast("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRI
| Description | Scientific Name | Common Name | Taxid | Max Score | Total Score | Query Cover | ... |
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|---|---|
| PREDICTED: gamma-aminobutyric acid receptor-as...| Colobus angolensis palliatus | NaN | 336983 | 180 | 180 | 100% | ... |
-| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |
+| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |
**BLAST from .fa or .txt file:**
@@ -71,12 +71,12 @@ gget blast fasta.fa
# Python
gget.blast("fasta.fa")
```
-→ Returns the BLAST results of the first sequence contained in the fasta.fa file.
+→ Returns the BLAST results of the first sequence contained in the fasta.fa file.
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget blast` in a publication, please cite the following articles:
+If you use `gget blast` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/blat.md b/docs/src/en/blat.md
index 169784c87..8f64580e3 100644
--- a/docs/src/en/blat.md
+++ b/docs/src/en/blat.md
@@ -2,34 +2,34 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget blat 🎯
-Find the genomic location of a nucleotide or amino acid sequence using [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat).
+Find the genomic location of a nucleotide or amino acid sequence using [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat).
Return format: JSON (command-line) or data frame/CSV (Python).
**Positional argument**
-`sequence`
+`sequence`
Nucleotide or amino acid sequence, or path to FASTA or .txt file.
**Optional arguments**
-`-st` `--seqtype`
-'DNA', 'protein', 'translated%20RNA', or 'translated%20DNA'.
+`-st` `--seqtype`
+'DNA', 'protein', 'translated%20RNA', or 'translated%20DNA'.
Default: 'DNA' for nucleotide sequences; 'protein' for amino acid sequences.
`-a` `--assembly`
-'human' (hg38) (default), 'mouse' (mm39), 'zebrafinch' (taeGut2),
+'human' (hg38) (default), 'mouse' (mm39), 'zebrafinch' (taeGut2),
or any of the species assemblies available [here](https://genome.ucsc.edu/cgi-bin/hgBlat) (use short assembly name).
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
Python: `save=True` will save the output in the current working directory.
-
+
**Flags**
`-csv` `--csv`
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
### Example
@@ -49,9 +49,8 @@ gget.blat("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQ
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget blat` in a publication, please cite the following articles:
+If you use `gget blat` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Kent WJ. BLAT--the BLAST-like alignment tool. Genome Res. 2002 Apr;12(4):656-64. doi: 10.1101/gr.229202. PMID: 11932250; PMCID: PMC187518.
-
diff --git a/docs/src/en/cbio.md b/docs/src/en/cbio.md
index d8b6c224d..da70c8e3f 100644
--- a/docs/src/en/cbio.md
+++ b/docs/src/en/cbio.md
@@ -4,18 +4,18 @@
# gget cbio 📖
Plot cancer genomics heatmaps using data from [cBioPortal](https://www.cbioportal.org/) using Ensembl IDs or gene names.
-This module was written by [Sam Wagenaar](https://github.com/techno-sam).
+This module was written by [Sam Wagenaar](https://github.com/techno-sam).
**Positional argument**
`subcommand`
Either `search` or `plot`
### `search` subcommand (Python: `gget.cbio_search`)
-Find cBioPortal study IDs by keyword.
-Return format: JSON (command-line) or string list (Python).
+Find cBioPortal study IDs by keyword.
+Return format: JSON (command-line) or string list (Python).
**Note: This does not return studies with mixed cancer types.**
-**Positional argument**
+**Positional argument**
`keywords`
Space-separated list of keywords to search for, e.g. breast lung.
Python: Pass keywords as a list of strings.
@@ -25,14 +25,14 @@ Plot cancer genomics heatmaps using data from cBioPortal.
Return format: PNG (command-line and Python)
**Required arguments**
-`-s` `--study_ids`
+`-s` `--study_ids`
Space-separated list of cBioPortal study IDs, e.g. msk_impact_2017 egc_msk_2023.
`-g` `--genes`
Space-separated list of gene names or Ensembl IDs, e.g. NOTCH3 ENSG00000108375.
**Optional arguments**
-`-st` `--stratification`
+`-st` `--stratification`
Column to stratify the data by. Default: `tissue`.
Options:
- tissue
@@ -41,8 +41,8 @@ Options:
- study_id
- sample
-`-vt` `--variation_type`
-Type of variation to plot. Default: `mutation_occurrences`.
+`-vt` `--variation_type`
+Type of variation to plot. Default: `mutation_occurrences`.
Options:
- mutation_occurrences
- cna_nonbinary (Note: `stratification` must be 'sample' for this option)
@@ -50,18 +50,18 @@ Options:
- cna_occurrences
- Consequence (Note: `stratification` must be 'sample' for this option)
-`-f` `--filter`
-Filter the data by a specific value in a specific column, e.g. `study_id:msk_impact_2017`
+`-f` `--filter`
+Filter the data by a specific value in a specific column, e.g. `study_id:msk_impact_2017`
Python: `filter=(column, value)`
`-dd` `--data_dir`
Directory to store data files. Default: `./gget_cbio_cache`.
-`-fd` `--figure_dir`
+`-fd` `--figure_dir`
Directory to output figures. Default: `./gget_cbio_figures`.
`-fn` `--filename`
-Filename for the output figure, relative to `figure_dir`. Default: auto-generated
+Filename for the output figure, relative to `figure_dir`. Default: auto-generated
Python: `figure_filename`
`-t` `--title`
@@ -71,23 +71,23 @@ Python: `figure_title`
`-dpi` `--dpi`
DPI of the output figure. Default: 100.
-**Flags**
+**Flags**
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
-`-nc` `--no_confirm`
+`-nc` `--no_confirm`
Command-line only. Skip download confirmation prompts.
Python: Use `confirm_download=True` to enable download confirmation prompts.
`-sh` `--show`
Show the plot in a window (automatic in Jupyter notebooks).
-
-
+
+
### Examples
-**Find all cBioPortal studies with cancer types matching specific keywords:**
+**Find all cBioPortal studies with cancer types matching specific keywords:**
```bash
gget cbio search esophag ovary ovarian
```
@@ -104,7 +104,7 @@ gget.cbio_search(['esophag', 'ovary', 'ovarian'])
-**Plot a heatmap of mutation occurrences for specific genes in a specific study:**
+**Plot a heatmap of mutation occurrences for specific genes in a specific study:**
```bash
gget cbio plot \
-s msk_impact_2017 \
@@ -131,7 +131,7 @@ gget.cbio_plot(
-**Plot a heatmap of mutation types for specific genes in a specific study:**
+**Plot a heatmap of mutation types for specific genes in a specific study:**
```bash
gget cbio plot \
-s msk_impact_2017 \
@@ -217,19 +217,18 @@ gget.cbio_plot(
→ Saves a heatmap of mutation types for the specified genes in the specified study, filtered by tissue, with the title "Intestinal Mutations" to `./gget_cbio_figures/intestinal_mutations.png`.

-
+
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget cbio` in a publication, please cite the following articles:
+If you use `gget cbio` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037.
-
+
- Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307.
-
+
- de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089.
-
-- Please also cite the source of the data if you are using a publicly available dataset.
+- Please also cite the source of the data if you are using a publicly available dataset.
diff --git a/docs/src/en/cellxgene.md b/docs/src/en/cellxgene.md
index 018175d34..44b137679 100644
--- a/docs/src/en/cellxgene.md
+++ b/docs/src/en/cellxgene.md
@@ -2,7 +2,7 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget cellxgene 🍱
-Query data from [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) using the [CZ CELLxGENE Discover Census](https://github.com/chanzuckerberg/cellxgene-census). [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) provides ready-to-use single-cell RNA sequencing count matrices for certain tissues/diseases/genes/etc.
+Query data from [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) using the [CZ CELLxGENE Discover Census](https://github.com/chanzuckerberg/cellxgene-census). [CZ CELLxGENE Discover](https://cellxgene.cziscience.com/) provides ready-to-use single-cell RNA sequencing count matrices for certain tissues/diseases/genes/etc.
Returns: An AnnData object containing the count matrix and metadata of single-cell RNA sequencing data from the defined tissues/genes/etc.
@@ -15,7 +15,7 @@ Non-human primates ('macaca_mulatta', 'callithrix_jacchus', 'pan_troglodytes') r
`-g` `--gene`
Str or list of gene name(s) or Ensembl ID(s). Default: None.
- NOTE: Use `-e / --ensembl` (Python: `ensembl=True`) when providing Ensembl ID(s) instead of gene name(s).
+ NOTE: Use `-e / --ensembl` (Python: `ensembl=True`) when providing Ensembl ID(s) instead of gene name(s).
NOTE: Gene symbols are case sensitive! Use canonical casing when passing gene symbols, e.g., 'PAX7' (human), 'Pax7' (mouse).
See https://cellxgene.cziscience.com/gene-expression for examples of available genes.
@@ -27,7 +27,7 @@ List of metadata columns to return (stored in AnnData.obs).
Default: ['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type']
For more options, see: https://api.cellxgene.cziscience.com/curation/ui/#/ -> Schemas -> dataset
-`-o` `--out`
+`-o` `--out`
Path to file to save generated AnnData .h5ad file (or .csv with `-mo / --meta_only`).
Required when using from command line!
@@ -38,7 +38,7 @@ Use when genes are provided as Ensembl IDs instead of gene names.
`-mo` `--meta_only`
Only returns metadata data frame (corresponds to AnnData.obs).
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
@@ -104,7 +104,7 @@ Str or list of sex ontology ID(s) as defined in the [CELLxGENE dataset schema](h
`--suspension_type`
Str or list of suspension type(s) as defined in the [CELLxGENE dataset schema](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Default: None.
-
+
### Examples
```bash
gget cellxgene --gene ACE2 ABCA1 SLC5A1 --tissue lung --cell_type 'mucus secreting cell' 'neuroendocrine cell' -o example_adata.h5ad
@@ -142,9 +142,8 @@ df
Also see: [https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html)
# References
-If you use `gget cellxgene` in a publication, please cite the following articles:
+If you use `gget cellxgene` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Chan Zuckerberg Initiative. (n.d.). CZ CELLxGENE Discover. Retrieved [insert date here], from [https://cellxgene.cziscience.com/](https://cellxgene.cziscience.com/)
-
diff --git a/docs/src/en/cite.md b/docs/src/en/cite.md
index b090d2413..694d6172c 100644
--- a/docs/src/en/cite.md
+++ b/docs/src/en/cite.md
@@ -4,7 +4,7 @@
# Citation
-If you use `gget` in a publication, please cite:
+If you use `gget` in a publication, please cite:
Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- If using `gget alphafold`, please also cite:
@@ -13,32 +13,32 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data
And, if applicable:
- Evans, R. et al. Protein complex prediction with AlphaFold-Multimer. bioRxiv 2021.10.04.463034; [https://doi.org/10.1101/2021.10.04.463034](https://doi.org/10.1101/2021.10.04.463034)
-- If using `gget archs4`, please also cite:
+- If using `gget archs4`, please also cite:
- Lachmann A, Torre D, Keenan AB, Jagodnik KM, Lee HJ, Wang L, Silverstein MC, Ma’ayan A. Massive mining of publicly available RNA-seq data from human and mouse. Nature Communications 9. Article number: 1366 (2018), doi:10.1038/s41467-018-03751-6
- Bray NL, Pimentel H, Melsted P and Pachter L, Near optimal probabilistic RNA-seq quantification, Nature Biotechnology 34, p 525--527 (2016). [https://doi.org/10.1038/nbt.3519](https://doi.org/10.1038/nbt.3519)
- If using `gget bgee`, please also cite:
- Frederic B Bastian, Julien Roux, Anne Niknejad, Aurélie Comte, Sara S Fonseca Costa, Tarcisio Mendes de Farias, Sébastien Moretti, Gilles Parmentier, Valentine Rech de Laval, Marta Rosikiewicz, Julien Wollbrett, Amina Echchiki, Angélique Escoriza, Walid H Gharib, Mar Gonzales-Porta, Yohan Jarosz, Balazs Laurenczy, Philippe Moret, Emilie Person, Patrick Roelli, Komal Sanjeev, Mathieu Seppey, Marc Robinson-Rechavi (2021). The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals. Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831–D847, [https://doi.org/10.1093/nar/gkaa793](https://doi.org/10.1093/nar/gkaa793)
-
+
- If using `gget blast`, please also cite:
- Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ. Basic local alignment search tool. J Mol Biol. 1990 Oct 5;215(3):403-10. doi: 10.1016/S0022-2836(05)80360-2. PMID: 2231712.
-- If using `gget blat`, please also cite:
+- If using `gget blat`, please also cite:
- Kent WJ. BLAT--the BLAST-like alignment tool. Genome Res. 2002 Apr;12(4):656-64. doi: 10.1101/gr.229202. PMID: 11932250; PMCID: PMC187518.
- If using `gget cbio`, please also cite:
- Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037.
-
+
- Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307.
-
+
- de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089.
-
+
- Please also cite the source of the data if you are using a publicly available dataset.
-
+
- If using `gget cellxgene`, please also cite:
- Chan Zuckerberg Initiative. (n.d.). CZ CELLxGENE Discover. Retrieved [insert date here], from [https://cellxgene.cziscience.com/](https://cellxgene.cziscience.com/)
-
+
- If using `gget cosmic`, please also cite:
- Tate JG, Bamford S, Jubb HC, Sondka Z, Beare DM, Bindal N, Boutselakis H, Cole CG, Creatore C, Dawson E, Fish P, Harsha B, Hathaway C, Jupe SC, Kok CY, Noble K, Ponting L, Ramshaw CC, Rye CE, Speedy HE, Stefancsik R, Thompson SL, Wang S, Ward S, Campbell PJ, Forbes SA. COSMIC: the Catalogue Of Somatic Mutations In Cancer. Nucleic Acids Res. 2019 Jan 8;47(D1):D941-D947. doi: [10.1093/nar/gky1015](https://doi.org/10.1093/nar/gky1015). PMID: 30371878; PMCID: PMC6323903.
@@ -47,43 +47,43 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data
- If using `gget elm`, please also cite:
- Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, Bioinformatics, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095)
-
+
- Manjeet Kumar, Sushama Michael, Jesús Alvarado-Valverde, Bálint Mészáros, Hugo Sámano‐Sánchez, András Zeke, Laszlo Dobson, Tamas Lazar, Mihkel Örd, Anurag Nagpal, Nazanin Farahi, Melanie Käser, Ramya Kraleti, Norman E Davey, Rita Pancsa, Lucía B Chemes, Toby J Gibson, The Eukaryotic Linear Motif resource: 2022 release, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D497–D508, [https://doi.org/10.1093/nar/gkab975](https://doi.org/10.1093/nar/gkab975)
-
-- If using `gget enrichr`, please also cite:
- - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128)
- - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
+- If using `gget enrichr`, please also cite:
+ - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128](https://doi.org/10.1186/1471-2105-14-128)
+
+ - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
- Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90).
-
+
If working with non-human/mouse datasets, please also cite:
- Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483.
- If using `gget info`, please also cite:
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890.
-
+
- The UniProt Consortium, UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
- If using `gget muscle`, please also cite:
- Edgar RC (2021), MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping, bioRxiv 2021.06.20.449169. [https://doi.org/10.1101/2021.06.20.449169](https://doi.org/10.1101/2021.06.20.449169)
-
+
- If using `gget opentargets`, please also cite:
- Ochoa D, Hercules A, Carmona M, Suveges D, Baker J, Malangone C, Lopez I, Miranda A, Cruz-Castillo C, Fumis L, Bernal-Llinares M, Tsukanov K, Cornu H, Tsirigos K, Razuvayevskaya O, Buniello A, Schwartzentruber J, Karim M, Ariano B, Martinez Osorio RE, Ferrer J, Ge X, Machlitt-Northen S, Gonzalez-Uriarte A, Saha S, Tirunagari S, Mehta C, Roldán-Romero JM, Horswell S, Young S, Ghoussaini M, Hulcoop DG, Dunham I, McDonagh EM. The next-generation Open Targets Platform: reimagined, redesigned, rebuilt. Nucleic Acids Res. 2023 Jan 6;51(D1):D1353-D1359. doi: [10.1093/nar/gkac1046](https://doi.org/10.1093/nar/gkac1046). PMID: 36399499; PMCID: PMC9825572.
-
+
- If using `gget pdb`, please also cite:
- Berman HM, Westbrook J, Feng Z, Gilliland G, Bhat TN, Weissig H, Shindyalov IN, Bourne PE. The Protein Data Bank. Nucleic Acids Res. 2000 Jan 1;28(1):235-42. doi: [10.1093/nar/28.1.235](https://doi.org/10.1093/nar/28.1.235). PMID: 10592235; PMCID: PMC102472.
- If using `gget ref` or `gget search`, please also cite:
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- If using `gget seq`, please also cite:
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- The UniProt Consortium, UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
-
+
___
# Disclaimer
`gget` is only as accurate as the databases/servers/APIs it queries from. The accuracy or reliability of the data is not guaranteed or warranted in any way and the providers disclaim liability of any kind whatsoever, including, without limitation, liability for quality, performance, merchantability and fitness for a particular purpose arising out of the use, or inability to use the data.
diff --git a/docs/src/en/contributing.md b/docs/src/en/contributing.md
index b22b375d0..9cdba4c60 100644
--- a/docs/src/en/contributing.md
+++ b/docs/src/en/contributing.md
@@ -1,73 +1 @@
-[ View page source on GitHub ](https://github.com/pachterlab/gget/blob/main/docs/src/en/contributing.md)
-
-# Welcome to gget's contributing guide
-
-Thank you for investing your time in contributing to our project! Any contribution you make will be reflected on the [gget repo](https://github.com/pachterlab/gget). ✨
-
-Read our [Code of Conduct](./code_of_conduct.md) to keep our community approachable and respectable.
-
-In this guide you will get an overview of the contribution workflow from opening an issue or creating a pull request (PR) to reviewing and merging a PR.
-
-## Issues
-
-### Create a new issue
-
-If you spot a problem with gget or you have an idea for a new feature, [check if an issue already exists](https://github.com/pachterlab/gget/issues). If a related issue doesn't exist, you can open a new issue using the relevant [issue form](https://github.com/pachterlab/gget/issues/new/choose).
-
-### Solve an issue
-
-Scan through our [existing issues](https://github.com/pachterlab/gget/issues) to find one that interests you. You can narrow down the search using `labels` as filters. If you find an issue to work on, you are welcome to open a PR with a fix.
-
-## Contribute through pull requests
-
-### Getting started
-
-1. Fork the repository.
-- Using GitHub Desktop:
- - [Getting started with GitHub Desktop](https://docs.github.com/en/desktop/installing-and-configuring-github-desktop/getting-started-with-github-desktop) will guide you through setting up Desktop.
- - Once Desktop is set up, you can use it to [fork the repo](https://docs.github.com/en/desktop/contributing-and-collaborating-using-github-desktop/cloning-and-forking-repositories-from-github-desktop)!
-
-- Using the command line:
- - [Fork the repo](https://docs.github.com/en/github/getting-started-with-github/fork-a-repo#fork-an-example-repository) so that you can make your changes without affecting the original project until you're ready to merge them.
-
-2. Create a working branch and start with your changes!
-
-### Commit your update
-
-Commit the changes once you are happy with them.
-
-### ‼️ Self-review the following before creating a Pull Request ‼️
-
-1. Review the content for technical accuracy.
-2. Copy-edit the changes/comments for grammar, spelling, and adherence to the general style of existing gget code.
-3. Format your code using [black](https://black.readthedocs.io/en/stable/getting_started.html).
-4. Make sure the unit tests pass:
- - Developer dependencies can be installed with `pip install -r dev-requirements.txt`
- - Run existing unit tests from the gget repository root with `coverage run -m pytest -ra -v tests && coverage report --omit=main.py,tests*`
-5. Add new unit tests if applicable:
- - Arguments and expected results are stored in json files in ./tests/fixtures/
- - Unit tests can be added to ./tests/test_*.py and will be automatically detected
-6. Make sure the edits are compatible with both the Python and the command line interface
- - The command line interface and arguments are defined in ./gget/main.py
-8. Add new modules/arguments to the documentation if applicable:
- - The manual for each module can be added/edited in `./docs/src/en/*.md` (the Spanish version of the docs in `./docs/src/es/*.md` is automatically generated/updated, and does not need to be edited manually)
-
-If you have any questions, feel free to start a [discussion](https://github.com/pachterlab/gget/discussions) or create an issue as described above.
-
-### Pull Request
-
-When you're finished with the changes, [create a pull request](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request), also known as a PR.
-
-‼️ Please make all PRs against the `dev` branch of the gget repository.
-
-- Don't forget to [link PR to issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/linking-a-pull-request-to-an-issue) if you are solving one.
-- Enable the checkbox to [allow maintainer edits](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/allowing-changes-to-a-pull-request-branch-created-from-a-fork) so the branch can be updated for a merge.
-- If you run into any merge issues, checkout this [git tutorial](https://github.com/skills/resolve-merge-conflicts) to help you resolve merge conflicts and other issues.
-
-Once you submit your PR, a gget team member will review your proposal. We may ask questions or request additional information.
-
-### Your PR is merged!
-
-Congratulations! 🎉 The gget team thanks you. ✨
-
-Once your PR is merged, your contributions will be publicly visible on the [gget repo](https://github.com/pachterlab/gget).
+{{#include ../../../CONTRIBUTING.md}}
diff --git a/docs/src/en/cosmic.md b/docs/src/en/cosmic.md
index 2b1829c54..9d2c34250 100644
--- a/docs/src/en/cosmic.md
+++ b/docs/src/en/cosmic.md
@@ -3,7 +3,7 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget cosmic 🪐
Search for genes, mutations, and other factors associated with cancer using the [COSMIC](https://cancer.sanger.ac.uk/cosmic) (Catalogue Of Somatic Mutations In Cancer) database.
-Return format: JSON (command-line) or data frame/CSV (Python) when `download_cosmic=False`. When `download_cosmic=True`, downloads the requested database into the specified folder.
+Return format: JSON (command-line) or data frame/CSV (Python) when `download_cosmic=False`. When `download_cosmic=True`, downloads the requested database into the specified folder.
This module was originally written in part by [@AubakirovArman](https://github.com/AubakirovArman) (information querying) and [@josephrich98](https://github.com/josephrich98) (database download).
@@ -12,13 +12,13 @@ NOTE: License fees apply for the commercial use of COSMIC. You can read more abo
NOTE: When using this module for the first time, first download a COSMIC database to obtain `cosmic_tsv_path` (see examples below).
**Positional argument (for querying information)**
-`searchterm`
-Search term, which can be a mutation, or gene name (or Ensembl ID), or sample, etc.
+`searchterm`
+Search term, which can be a mutation, or gene name (or Ensembl ID), or sample, etc.
Examples: 'EGFR', 'ENST00000275493', 'c.650A>T', 'p.Q217L', 'COSV51765119', 'BT2012100223LNCTB' (sample ID)
NOTE: (Python only) Set to `None` when downloading COSMIC databases with `download_cosmic=True`.
**Required argument (for querying information)**
-`-ctp` `--cosmic_tsv_path`
+`-ctp` `--cosmic_tsv_path`
Path to the COSMIC database tsv file, e.g. 'path/to/CancerMutationCensus_AllData_v101_GRCh37.tsv'.
This file is downloaded when downloading COSMIC databases using the arguments described below.
NOTE: This is a required argument when `download_cosmic=False`.
@@ -41,8 +41,8 @@ Creates a modified version of the COSMIC database for use with [`gget mutate`](m
**Optional arguments (for downloading COSMIC databases)**
`-cp` `--cosmic_project`
'cancer' (default), 'cancer_example', 'census', 'resistance', 'cell_line', 'genome_screen', or 'targeted_screen'
-Type of COSMIC database to download:
-
+Type of COSMIC database to download:
+
| cosmic_project | Description | Notes | Size |
|-----------------|-----------------------------------------------------------------------|------------------------------------------------------------------------------------|--------|
| cancer | Cancer Mutation Census (CMC) (most commonly used COSMIC mutation set) | Only available for GRCh37. Most feature-rich schema (takes the longest to search). | 2 GB |
@@ -82,18 +82,18 @@ Whether to remove duplicate rows from the modified database for use with `gget m
(str) Name of the mutation_id column in the csv file created by `gget_mutate`. Default: "mutation_id"
**Optional arguments (general)**
-`-o` `--out`
+`-o` `--out`
Path to the file (or folder when downloading databases with the `download_cosmic` flag) the results will be saved in, e.g. 'path/to/results.json'.
-Defaults:
+Defaults:
-> When `download_cosmic=False`: Results will be returned to standard out
-> When `download_cosmic=True`: Database will be downloaded into current working directory
**Flags (general)**
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
-
+
### Examples
#### Download the COSMIC "cancer" database and query information
```bash
@@ -143,7 +143,7 @@ gget.cosmic("EGFR", cosmic_tsv_path="Cosmic_MutantCensus_Tsv_v101_GRCh37/Cosmic_
# References
-If you use `gget cosmic` in a publication, please cite the following articles:
+If you use `gget cosmic` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/dependents.md b/docs/src/en/dependents.md
index b0d0e9296..9d496fec5 100644
--- a/docs/src/en/dependents.md
+++ b/docs/src/en/dependents.md
@@ -32,7 +32,7 @@ The following applications build on `gget`:
> "Tools are grouped into families such as literature [...], genomics (biopython, **gget**), and machine learning (rdkit, pymol)."
- [PerTurboAgent](https://www.biorxiv.org/content/10.1101/2025.05.25.656020v1)
A Self-Planning Agent for Boosting Sequential Perturb-seq Experiments.
- > "We [...] use packages **gget** and blitzgsea for data enrichment analysis"
+ > "We [...] use packages **gget** and blitzgsea for data enrichment analysis"
- [Scientific skills for Claude](https://github.com/K-Dense-AI/claude-scientific-skills) by K-Dense-AI
> " This repository contains 138 scientific skills organized across multiple domains. Each skill provides comprehensive documentation, code examples, and best practices for working with scientific libraries, databases, and tools.
> 🧬 Bioinformatics & Genomics
@@ -92,7 +92,7 @@ ____
- Shanmugampillai Jeyarajaguru Kabilan et al., [Molecular modelling approaches for the identification of potent Sodium-Glucose Cotransporter 2 inhibitors from Boerhavia diffusa for the potential treatment of chronic kidney disease.](https://doi.org/10.21203/rs.3.rs-4520611/v1) *Journal of Computer-Aided Molecular Design (under review)* (2024). DOI: 10.21203/rs.3.rs-4520611/v1
- Joseph M Rich et al., [The impact of package selection and versioning on single-cell RNA-seq analysis.](https://pmc.ncbi.nlm.nih.gov/articles/PMC11014608/#:~:text=10.1101/2024.04.04.588111) *bioRxiv* (2024). DOI: 10.1101/2024.04.04.588111
- Sanjay C. Nagi et al., [AnoPrimer: Primer Design in malaria vectors informed by range-wide genomic variation.](https://wellcomeopenresearch.org/articles/9-255/v1) *Wellcome Open Research* (2024).
-- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029
+- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029
- Nicola A. Kearns et al., [Generation and molecular characterization of human pluripotent stem cell-derived pharyngeal foregut endoderm.](https://doi.org/10.1016/j.devcel.2023.08.024) *Cell Reports* (2023). DOI: 10.1016/j.devcel.2023.08.024
- Jonathan Rosenski et al., [Predicting gene knockout effects from expression data.](https://link.springer.com/article/10.1186/s12920-023-01446-6) *BMC Medical Genomics* (2023). DOI: 10.1186/s12920-023-01446-6
- Peter Overby et al., [Pharmacological or genetic inhibition of Scn9a protects beta-cells while reducing insulin secretion in type 1 diabetes.](https://doi.org/10.1101/2023.06.11.544521) *bioRxiv* (2023). DOI: 10.1101/2023.06.11.544521
@@ -113,4 +113,3 @@ ___
# 🚂 [gget code repository](https://github.com/pachterlab/gget/) traffic

Updates automatically every week on Sunday at 23:55 (UTC).
-
diff --git a/docs/src/en/diamond.md b/docs/src/en/diamond.md
index 3675c1fdc..7077733c2 100644
--- a/docs/src/en/diamond.md
+++ b/docs/src/en/diamond.md
@@ -2,7 +2,7 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget diamond 💎
-Align multiple protein or translated DNA sequences using [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND is similar to BLAST, but this is a local computation).
+Align multiple protein or translated DNA sequences using [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND is similar to BLAST, but this is a local computation).
Return format: JSON (command-line) or data frame/CSV (Python).
**Positional argument**
@@ -20,7 +20,7 @@ Path to save DIAMOND database created from `reference` (str).
Default: None -> Temporary db file will be deleted after alignment or saved in `out` if `out` is provided.
`-s` `--sensitivity`
-Sensitivity of alignment (str). Default: "very-sensitive".
+Sensitivity of alignment (str). Default: "very-sensitive".
One of the following: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive.
`-t` `--threads`
@@ -29,20 +29,20 @@ Number of threads used (int). Default: 1.
`-db` `--diamond_binary`
Path to DIAMOND binary (str). Default: None -> Uses DIAMOND binary installed with `gget`.
-`-o` `--out`
-Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted.
+`-o` `--out`
+Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted.
**Flags**
`-x` `--translated`
Perform translated alignment of nucleotide sequences to amino acid reference sequences.
-
+
`-csv` `--csv`
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
### Example
```bash
@@ -53,7 +53,7 @@ gget diamond GGETISAWESQME ELVISISALIVE LQVEFRANKLIN PACHTERLABRQCKS -ref GGETIS
# Python
gget.diamond(["GGETISAWESQME", "ELVISISALIVE", "LQVEFRANKLIN", "PACHTERLABRQCKS"], reference=["GGETISAWESQMEELVISISALIVELQVEFRANKLIN", "PACHTERLABRQCKS"])
```
-→ Returns results in JSON (command-line) or data frame/CSV (Python) format:
+→ Returns results in JSON (command-line) or data frame/CSV (Python) format:
|query_accession|subject_accession|identity_percentage|query_seq_length|subject_seq_length|length|mismatches|gap_openings|query_start|query_end|subject_start|subject_end|e-value |bit_score|
|---------------|-----------------|-------------------|----------------|------------------|------|----------|------------|-----------|---------|-------------|-----------|--------|---------|
@@ -64,7 +64,7 @@ gget.diamond(["GGETISAWESQME", "ELVISISALIVE", "LQVEFRANKLIN", "PACHTERLABRQCKS"
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget diamond` in a publication, please cite the following articles:
+If you use `gget diamond` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/elm.md b/docs/src/en/elm.md
index 07e4ab423..915030cdf 100644
--- a/docs/src/en/elm.md
+++ b/docs/src/en/elm.md
@@ -2,12 +2,12 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget elm 🎭
-Locally predict Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using data from the [ELM database](http://elm.eu.org/).
+Locally predict Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using data from the [ELM database](http://elm.eu.org/).
Return format: JSON (command-line) or data frame/CSV (Python). This module returns two data frames (or JSON formatted files) (see examples).
-**ELM data can be downloaded & distributed for non-commercial use according to the [ELM Software License Agreement](http://elm.eu.org/media/Elm_academic_license.pdf).**
+**ELM data can be downloaded & distributed for non-commercial use according to the [ELM Software License Agreement](http://elm.eu.org/media/Elm_academic_license.pdf).**
-Before using `gget elm` for the first time, run `gget setup elm` (bash) / `gget.setup("elm")` (Python) once (also see [`gget setup`](setup.md)).
+Before using `gget elm` for the first time, run `gget setup elm` (bash) / `gget.setup("elm")` (Python) once (also see [`gget setup`](setup.md)).
**Positional argument**
`sequence`
@@ -16,7 +16,7 @@ When providing a Uniprot Acc, use flag `--uniprot` (Python: `uniprot=True`).
**Optional arguments**
`-s` `--sensitivity`
-Sensitivity of DIAMOND alignment (str). Default: "very-sensitive".
+Sensitivity of DIAMOND alignment (str). Default: "very-sensitive".
One of the following: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive.
`-t` `--threads`
@@ -25,23 +25,23 @@ Number of threads used in DIAMOND alignment (int). Default: 1.
`-bin` `--diamond_binary`
Path to DIAMOND binary (str). Default: None -> Uses DIAMOND binary installed with `gget`.
-`-o` `--out`
-Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted.
+`-o` `--out`
+Path to the folder to save results in (str), e.g. "path/to/directory". Default: Standard out; temporary files are deleted.
**Flags**
`-u` `--uniprot`
Set to True if `sequence` is a Uniprot Acc instead of an amino acid sequence.
-`-e` `--expand`
-Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was orignally validated on.
+`-e` `--expand`
+Expand the information returned in the regex data frame to include the protein names, organisms, and references that the motif was originally validated on.
`-csv` `--csv`
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
### Examples
Find ELMs in an amino acid sequence:
@@ -54,7 +54,7 @@ gget elm -o gget_elm_results LIAQSIGQASFV
gget.setup(“elm”) # Downloads/updates local ELM database
ortholog_df, regex_df = gget.elm("LIAQSIGQASFV")
```
-
+
Find ELMs giving a UniProt Acc as input:
```bash
gget setup elm # Downloads/updates local ELM database
@@ -68,14 +68,14 @@ ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True)
→ Returns two data frames (or JSON formatted dictionaries for command line) containing extensive information about linear motifs associated with orthologous proteins and motifs found in the input sequence directly based on their regex expressions:
ortholog_df:
-
+
|Ortholog_UniProt_Acc|ProteinName|class_accession|ELMIdentifier |FunctionalSiteName |Description |Organism |… |
|:-----------------:|:---------:|:-------------:|:-------------:|:-----------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------:|:----------:|:-:|
|Q02410 |APBA1_HUMAN|ELME000357 |LIG_CaMK_CASK_1|CASK CaMK domain binding ligand motif|Motif that mediates binding to the calmodulin-dependent protein kinase (CaMK) domain of the peripheral plasma membrane protein CASK/Lin2.|Homo sapiens|… |
|Q02410 |APBA1_HUMAN|ELME000091 |LIG_PDZ_Class_2|PDZ domain ligands |The C-terminal class 2 PDZ-binding motif is classically represented by a pattern such as |Homo sapiens|… |
regex_df:
-
+
|Instance_accession|ELMIdentifier |FunctionalSiteName |ELMType|Description |Instances (Matched Sequence)|Organism |… |
|:----------------:|:----------------:|:-----------------------------:|:-----:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------:|:----------------------------:|:-:|
|ELME000321 |CLV_C14_Caspase3-7|Caspase cleavage motif |CLV |Caspase-3 and Caspase-7 cleavage site. |ERSDG |Mus musculus |… |
@@ -87,13 +87,13 @@ regex_df:
# Tutorials
### [🔗 General `gget elm` demo](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_demo.ipynb)
-
+
### [🔗 A point mutation in BRCA2 is carcinogenic due to the loss of a protein interaction motif](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_BRCA2_example.ipynb)
-
+
### [🔗 Filter `gget elm` results based on disordered protein regions](https://github.com/pachterlab/gget_examples/blob/main/gget_elm_IUPred3_tutorial.ipynb)
# References
-If you use `gget elm` in a publication, please cite the following articles:
+If you use `gget elm` in a publication, please cite the following articles:
- Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, _Bioinformatics_, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095)
diff --git a/docs/src/en/enrichr.md b/docs/src/en/enrichr.md
index a529cb8cb..5df9e9496 100644
--- a/docs/src/en/enrichr.md
+++ b/docs/src/en/enrichr.md
@@ -4,7 +4,7 @@
# gget enrichr 💰
Perform an enrichment analysis on a list of genes using [Enrichr](https://maayanlab.cloud/Enrichr/) or [modEnrichr](https://maayanlab.cloud/modEnrichr/).
Return format: JSON (command-line) or data frame/CSV (Python).
-
+
**Positional argument**
`genes`
Short names (gene symbols) of genes to perform enrichment analysis on, e.g. PHF14 RBM3 MSL1 PHF21A.
@@ -17,12 +17,12 @@ Supports any database listed [here](https://maayanlab.cloud/Enrichr/#libraries)
'pathway' (KEGG_2021_Human)
'transcription' (ChEA_2016)
'ontology' (GO_Biological_Process_2021)
-'diseases_drugs' (GWAS_Catalog_2019)
+'diseases_drugs' (GWAS_Catalog_2019)
'celltypes' (PanglaoDB_Augmented_2021)
'kinase_interactions' (KEA_2015)
-
-NOTE: database shortcuts are not supported for species other than 'human' or 'mouse'. Click on the species databases listed below under `species` to view a list of databases available for each species.
-
+
+NOTE: database shortcuts are not supported for species other than 'human' or 'mouse'. Click on the species databases listed below under `species` to view a list of databases available for each species.
+
**Optional arguments**
`-s` `--species`
Species to use as reference for the enrichment analysis. (Default: human)
@@ -42,8 +42,8 @@ Short names (gene symbols) of background genes to perform enrichment analysis on
Alternatively: use flag `--ensembl_background` to input a list of Ensembl gene IDs.
See [this Tweetorial](https://x.com/ChiHoangCaltech/status/1689679611335155712?s=20) to learn why you should use a background gene list when performing an enrichment analysis.
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). (Default: Standard out.)
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). (Default: Standard out.)
Python: `save=True` will save the output in the current working directory.
`-ko` `--kegg_out`
@@ -58,9 +58,9 @@ Python only. (width, height) of plot in inches. (Default: (10,10))
`ax`
Python only. Pass a matplotlib axes object for plot customization. (Default: None)
-
+
**Flags**
-`-e` `--ensembl`
+`-e` `--ensembl`
Add this flag if `genes` are given as Ensembl gene IDs.
`-e_b` `--ensembl_bkg`
@@ -68,19 +68,19 @@ Add this flag if `background_list` are given as Ensembl gene IDs.
`-bkg` `--background`
If True, use set of > 20,000 default background genes listed [here](https://github.com/pachterlab/gget/blob/main/gget/constants/enrichr_bkg_genes.txt).
-
+
`-csv` `--csv`
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
-
+Python: Use `verbose=False` to prevent progress information from being displayed.
+
`plot`
Python only. `plot=True` provides a graphical overview of the first 15 results (default: False).
-
-
+
+
### Examples
```bash
gget enrichr -db ontology ACE2 AGT AGTR1
@@ -110,10 +110,10 @@ gget.enrichr(
genes = [
"PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", "P2RX7",
"LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", "ANAPC16", "TMCC1",
- "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2",
+ "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2",
"ZNF302", "CUX1", "MOB2", "CYTH2", "SEC22C", "EIF4E3", "ROBO2",
"ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", "ATP5F1"
- ],
+ ],
database = "ChEA_2022",
background_list = [
"NSUN3","POLRMT","NLRX1","SFXN5","ZC3H12C","SLC25A39","ARSG",
@@ -128,11 +128,11 @@ gget.enrichr(
"ZFP787","ZFP655","RABEPK","ZFP650","4732466D17RIK","EXOSC4",
"WDR42A","GPHN","2610528J11RIK","1110003E01RIK","MDH1","1200014M14RIK",
"AW209491","MUT","1700123L14RIK","2610036D13RIK",
- "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2",
- "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1",
- "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2",
- "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2",
- "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7",
+ "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2",
+ "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1",
+ "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2",
+ "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2",
+ "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7",
"ATP5F1""COX15","TMEM30A","NSMCE4A","TM2D2","RHBDD3","ATXN2","NFS1",
"3110001I20RIK","BC038156","C330002I19RIK","ZFYVE20","POLI","TOMM70A",
"LOC100047782","2410012H22RIK","RILP","A230062G08RIK",
@@ -226,15 +226,15 @@ df |>
[Using `gget enrichr` with background genes](https://github.com/pachterlab/gget_examples/blob/main/gget_enrichr_with_background_genes.ipynb)
# References
-If you use `gget enrichr` in a publication, please cite the following articles:
+If you use `gget enrichr` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
-- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128)
+- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128](https://doi.org/10.1186/1471-2105-14-128)
-- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
+- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
- Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90).
-
+
If working with non-human/mouse datasets, please also cite:
- Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483.
diff --git a/docs/src/en/gpt.md b/docs/src/en/gpt.md
index 9d02ae483..bc07fba2f 100644
--- a/docs/src/en/gpt.md
+++ b/docs/src/en/gpt.md
@@ -2,7 +2,7 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget gpt 💬
-Generates natural language text based on a given prompt using the [OpenAI](https://openai.com/) API's 'openai.ChatCompletion.create' endpoint.
+Generates natural language text based on a given prompt using the [OpenAI](https://openai.com/) API's 'openai.ChatCompletion.create' endpoint.
This module, including its source code, documentation and unit tests, were partly written by OpenAI's Chat-GTP3.
NOTE:
@@ -27,34 +27,34 @@ Your OpenAI API key (str) ([get your API key](https://platform.openai.com/accoun
The name of the GPT model to use for generating the text (str). Default is "gpt-3.5-turbo".
See https://platform.openai.com/docs/models/gpt-4 for more information on the available models.
-`-temp` `--temperature`
+`-temp` `--temperature`
Value between 0 and 2 that controls the level of randomness and creativity in the generated text (float).
Higher values result in more creative and varied text. Default is 1.
-`-tp` `--top_p`
+`-tp` `--top_p`
Controls the diversity of the generated text as an alternative to sampling with temperature (float).
Higher values result in more diverse and unexpected text. Default is 1.
Note: OpenAI recommends altering this or temperature but not both.
-`-s` `--stop`
+`-s` `--stop`
A sequence of tokens to mark the end of the generated text (str). Default is None.
-`-mt` `--max_tokens`
+`-mt` `--max_tokens`
Controls the maximum length of the generated text, in tokens (int). Default is 200.
-`-pp` `--presence_penalty`
+`-pp` `--presence_penalty`
Number between -2.0 and 2.0. Higher values result increase the model's likelihood to talk about new topics (float). Default is 0.
-`-fp` `--frequency_penalty`
+`-fp` `--frequency_penalty`
Number between -2.0 and 2.0. Higher values decrease the model's likelihood to repeat the same line verbatim (float). Default is 0.
-`-lb` `--logit_bias`
+`-lb` `--logit_bias`
A dictionary that specifies a bias towards certain tokens in the generated text (dict). Default is None.
-`-o` `--out`
+`-o` `--out`
If provided, saves the generated text to a file with the specified path (str). Default: Standard out.
-
-
+
+
### Example
```bash
gget gpt "How are you today GPT?" your_api_token
diff --git a/docs/src/en/info.md b/docs/src/en/info.md
index e1f4ff192..a76aebdb6 100644
--- a/docs/src/en/info.md
+++ b/docs/src/en/info.md
@@ -6,33 +6,33 @@ Fetch extensive gene and transcript metadata from [Ensembl](https://www.ensembl.
Return format: JSON (command-line) or data frame/CSV (Python).
**Positional argument**
-`ens_ids`
+`ens_ids`
One or more Ensembl IDs (WormBase and Flybase IDs are also supported).
-NOTE: Providing a list of more than 1,000 Ensembl IDs at once might result in a server error (to process more than 1,000 IDs, split the list of IDs into chunks of 1,000 IDs and run these separately).
+NOTE: Providing a list of more than 1,000 Ensembl IDs at once might result in a server error (to process more than 1,000 IDs, split the list of IDs into chunks of 1,000 IDs and run these separately).
**Optional arguments**
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
Python: `save=True` will save the output in the current working directory.
**Flags**
`-n` `--ncbi`
TURN OFF results from [NCBI](https://www.ncbi.nlm.nih.gov/).
-Python: `ncbi=False` prevents data retrieval from NCBI (default: True).
+Python: `ncbi=False` prevents data retrieval from NCBI (default: True).
`-u` `--uniprot`
TURN OFF results from [UniProt](https://www.uniprot.org/).
-Python: `uniprot=False` prevents data retrieval from UniProt (default: True).
+Python: `uniprot=False` prevents data retrieval from UniProt (default: True).
`-pdb` `--pdb`
INCLUDE [PDB](https://www.ebi.ac.uk/pdbe/) IDs in output (might increase runtime).
-Python: `pdb=True` includes PDB IDs in the results (default: False).
+Python: `pdb=True` includes PDB IDs in the results (default: False).
`-csv` `--csv`
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
@@ -54,17 +54,16 @@ gget.info(["ENSG00000034713", "ENSG00000104853", "ENSG00000170296"])
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|----|----|----|----|----|----|
| ENSG00000034713| P60520 | 11345 | GABARAPL2 | [ATG8, ATG8C, FLC3A, GABARAPL2, GATE-16, GATE16, GEF-2, GEF2] | Gamma-aminobutyric acid receptor-associated protein like 2 (GABA(A) receptor-associated protein-like 2)... | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | FUNCTION: Ubiquitin-like modifier involved in intra- Golgi traffic (By similarity). Modulates intra-Golgi transport through coupling between NSF activity and ... | Enables ubiquitin protein ligase binding activity. Involved in negative regulation of proteasomal protein catabolic process and protein... | protein_coding | ENST00000037243.7 |... |
| . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |
-
+
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget info` in a publication, please cite the following articles:
+If you use `gget info` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890.
-
-- The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
+- The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
diff --git a/docs/src/en/introduction.md b/docs/src/en/introduction.md
index 244a05930..4bd02ca88 100644
--- a/docs/src/en/introduction.md
+++ b/docs/src/en/introduction.md
@@ -8,10 +8,10 @@
# Welcome!
[
](https://raw.githubusercontent.com/pachterlab/gget/main/figures/gget_overview.png)
-
+
`gget` is a free, open-source command-line tool and Python package that enables efficient querying of genomic databases.
-`gget` consists of a collection of separate but interoperable modules, each designed to facilitate one type of database querying in a single line of code.
+`gget` consists of a collection of separate but interoperable modules, each designed to facilitate one type of database querying in a single line of code.
`gget` is part of the [scverse®](https://scverse.org) project and is fiscally sponsored by [NumFOCUS](https://numfocus.org/). If you like `gget` and want to support our mission, please consider making a tax-deductible [donation](https://opencollective.com/scverse/projects/scverse-gget/donate?interval=oneTime&amount=20&contributeAs=me).
@@ -63,7 +63,7 @@ These are the `gget` core modules. Click on any module to access detailed docume
-If you use `gget` in a publication, please [cite*](/gget/en/cite.md):
+If you use `gget` in a publication, please [cite*](/gget/en/cite.md):
```
Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836
```
@@ -73,7 +73,7 @@ Read the article here: [https://doi.org/10.1093/bioinformatics/btac836](https://
[](https://github.com/lauraluebbert/gget_downloads/tree/main)
-
+
@@ -98,4 +98,3 @@ Read the article here: [https://doi.org/10.1093/bioinformatics/btac836](https://
-
diff --git a/docs/src/en/muscle.md b/docs/src/en/muscle.md
index 6d8c18b3b..16a512540 100644
--- a/docs/src/en/muscle.md
+++ b/docs/src/en/muscle.md
@@ -6,12 +6,12 @@ Align multiple nucleotide or amino acid sequences to each other using [Muscle5](
Return format: ClustalW formatted standard out or aligned FASTA (.afa).
**Positional argument**
-`fasta`
+`fasta`
List of sequences or path to FASTA or .txt file containing the nucleotide or amino acid sequences to be aligned.
**Optional arguments**
-`-o` `--out`
-Path to the aligned FASTA file the results will be saved in, e.g. path/to/directory/results.afa. Default: Standard out.
+`-o` `--out`
+Path to the aligned FASTA file the results will be saved in, e.g. path/to/directory/results.afa. Default: Standard out.
Python: `save=True` will save the output in the current working directory.
**Flags**
@@ -19,11 +19,11 @@ Python: `save=True` will save the output in the current working directory.
Aligns input using the [Super5 algorithm](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) instead of the [Parallel Perturbed Probcons (PPP) algorithm](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) to decrease time and memory.
Use for large inputs (a few hundred sequences).
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
-
-
+Python: Use `verbose=False` to prevent progress information from being displayed.
+
+
### Example
```bash
gget muscle MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS MSSSSWLLLSLVEVTAAQSTIEQQAKTFLDKFHEAEDLFYQSLLAS
@@ -40,7 +40,7 @@ gget muscle fasta.fa
# Python
gget.muscle("fasta.fa")
```
-→ Returns an overview of the aligned sequences with ClustalW coloring. (To return an aligned FASTA (.afa) file, use `--out` argument (or `save=True` in Jupyter Lab/Google Colab).) In the above example, the 'fasta.fa' includes several sequences to be aligned (e.g. isoforms returned from `gget seq`).
+→ Returns an overview of the aligned sequences with ClustalW coloring. (To return an aligned FASTA (.afa) file, use `--out` argument (or `save=True` in Jupyter Lab/Google Colab).) In the above example, the 'fasta.fa' includes several sequences to be aligned (e.g. isoforms returned from `gget seq`).

@@ -60,9 +60,8 @@ alv.view(msa)
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget muscle` in a publication, please cite the following articles:
+If you use `gget muscle` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Edgar RC (2021), MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping, bioRxiv 2021.06.20.449169. [https://doi.org/10.1101/2021.06.20.449169](https://doi.org/10.1101/2021.06.20.449169)
-
diff --git a/docs/src/en/mutate.md b/docs/src/en/mutate.md
index 3bb595780..35ca12ded 100644
--- a/docs/src/en/mutate.md
+++ b/docs/src/en/mutate.md
@@ -10,7 +10,7 @@ This module was written by [Joseph Rich](https://github.com/josephrich98).
**Update: The more complex functionality of gget mutate has been ported to https://github.com/pachterlab/kvar. kvar expands on this functionality in the context of screening for variants/mutations in sequencing data. If this sounds interesting to you, please check it out!**
**Positional argument**
-`sequences`
+`sequences`
Path to the FASTA file containing the sequences to be mutated, e.g., 'path/to/seqs.fa'.
Sequence identifiers following the '>' character must correspond to the identifiers in the seq_ID column of `mutations`.
@@ -57,20 +57,20 @@ Name of the column containing the IDs of the sequences to be mutated in `mutatio
`-mic` `--mut_id_column`
Name of the column containing the IDs of each mutation in `mutations`. Default: Same as `mut_column`.
-
+
**Optional mutant sequence generation/filtering arguments**
`-k` `--k`
Length of sequences flanking the mutation. Default: 30.
If k > total length of the sequence, the entire sequence will be kept.
-
+
**Optional general arguments**
-`-o` `--out`
+`-o` `--out`
Path to output FASTA file containing the mutated sequences, e.g., 'path/to/output_fasta.fa'.
-Default: None -> returns a list of the mutated sequences to standard out.
-The identifiers (following the '>') of the mutated sequences in the output FASTA will be '>[seq_ID]_[mut_ID]'.
+Default: None -> returns a list of the mutated sequences to standard out.
+The identifiers (following the '>') of the mutated sequences in the output FASTA will be '>[seq_ID]_[mut_ID]'.
**Optional general flags**
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
@@ -94,7 +94,7 @@ gget mutate ATCGCTAAGCT TAGCTA -m 'c.4G>T' 'c.1_3inv' -o mut_fasta.fa
# Python
gget.mutate(["ATCGCTAAGCT", "TAGCTA"], ["c.4G>T", "c.1_3inv"], out="mut_fasta.fa")
```
-→ Saves 'mut_fasta.fa' file containing:
+→ Saves 'mut_fasta.fa' file containing:
```
>seq1_mut1
ATCTCTAAGCT
@@ -116,7 +116,6 @@ gget.mutate(["ATCGCTAAGCT", "TAGCTA"], "c.1_3inv", k=3)
# References
-If you use `gget mutate` in a publication, please cite the following articles:
+If you use `gget mutate` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
-
diff --git a/docs/src/en/opentargets.md b/docs/src/en/opentargets.md
index 44be2a9d6..b1fa72975 100644
--- a/docs/src/en/opentargets.md
+++ b/docs/src/en/opentargets.md
@@ -2,7 +2,7 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget opentargets 🎯
-Fetch associated diseases or drugs from [OpenTargets](https://platform.opentargets.org/) using Ensembl IDs.
+Fetch associated diseases or drugs from [OpenTargets](https://platform.opentargets.org/) using Ensembl IDs.
Return format: JSON/CSV (command-line) or data frame (Python).
This module was written by [Sam Wagenaar](https://github.com/techno-sam).
@@ -12,8 +12,8 @@ This module was written by [Sam Wagenaar](https://github.com/techno-sam).
Ensembl gene ID, e.g. ENSG00000169194.
**Optional arguments**
-`-r` `--resource`
-Defines the type of information to return in the output. Default: 'diseases'.
+`-r` `--resource`
+Defines the type of information to return in the output. Default: 'diseases'.
Possible resources are:
| Resource | Return Value | Valid Filters | Sources |
@@ -27,35 +27,35 @@ Possible resources are:
| `interactions` | Protein⇄protein interactions | `protein_a_id`
`protein_b_id`
`gene_b_id` | - [Open Targets](https://platform-docs.opentargets.org/target/molecular-interactions)
- [IntAct](https://platform-docs.opentargets.org/target/molecular-interactions#intact)
- [Signor](https://platform-docs.opentargets.org/target/molecular-interactions#signor)
- [Reactome](https://platform-docs.opentargets.org/target/molecular-interactions#reactome)
- [String](https://platform-docs.opentargets.org/target/molecular-interactions#string)
|
`-l` `--limit`
-Limit the number of results, e.g 10. Default: No limit.
+Limit the number of results, e.g 10. Default: No limit.
Note: Not compatible with the `tractability` and `depmap` resources.
-`-o` `--out`
+`-o` `--out`
Path to the JSON file the results will be saved in, e.g. path/to/directory/results.json. Default: Standard out.
Python: `save=True` will save the output in the current working directory.
`--filters`
Filter results by exact equality using returned OpenTargets column names. Pass multiple filters by repeating the flag, e.g. '--filter disease.id=EFO_0000274 --filter drug.id=CHEMBL1743081'. Nested fields use dot notation, matching the column names returned by the API.
-**Flags**
+**Flags**
`-csv` `--csv`
Command-line only. Returns the output in CSV format, instead of JSON format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
`-or` `--or`
Command-line only. Filters are combined with OR logic. Default: AND logic.
`wrap_text`
Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False).
-
-
+
+
### Examples
-**Get associated diseases for a specific gene:**
+**Get associated diseases for a specific gene:**
```bash
gget opentargets ENSG00000169194 -r diseases -l 1
```
@@ -72,7 +72,7 @@ gget.opentargets('ENSG00000169194', resource='diseases', limit=1)
-**Get associated drugs for a specific gene:**
+**Get associated drugs for a specific gene:**
```bash
gget opentargets ENSG00000169194 -r drugs -l 2
```
@@ -93,7 +93,7 @@ gget.opentargets('ENSG00000169194', resource='drugs', limit=2)
-**Get tractability data for a specific gene:**
+**Get tractability data for a specific gene:**
```bash
gget opentargets ENSG00000169194 -r tractability
```
@@ -237,13 +237,12 @@ gget.opentargets(
| 0.400 | 1 | intact | P35225 | ENSG00000169194 | IL13 | unspecified role | 9606 | Q86XT9 | ENSG00000149932 | TMEM219 | stimulator | 9606 |
-
+
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget opentargets` in a publication, please cite the following articles:
+If you use `gget opentargets` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Ochoa D, Hercules A, Carmona M, Suveges D, Baker J, Malangone C, Lopez I, Miranda A, Cruz-Castillo C, Fumis L, Bernal-Llinares M, Tsukanov K, Cornu H, Tsirigos K, Razuvayevskaya O, Buniello A, Schwartzentruber J, Karim M, Ariano B, Martinez Osorio RE, Ferrer J, Ge X, Machlitt-Northen S, Gonzalez-Uriarte A, Saha S, Tirunagari S, Mehta C, Roldán-Romero JM, Horswell S, Young S, Ghoussaini M, Hulcoop DG, Dunham I, McDonagh EM. The next-generation Open Targets Platform: reimagined, redesigned, rebuilt. Nucleic Acids Res. 2023 Jan 6;51(D1):D1353-D1359. doi: [10.1093/nar/gkac1046](https://doi.org/10.1093/nar/gkac1046). PMID: 36399499; PMCID: PMC9825572.
-
diff --git a/docs/src/en/pdb.md b/docs/src/en/pdb.md
index b81c77c68..4f1d749ad 100644
--- a/docs/src/en/pdb.md
+++ b/docs/src/en/pdb.md
@@ -12,7 +12,7 @@ PDB ID to be queried, e.g. '7S7U'.
**Optional arguments**
`-r` `--resource`
Defines type of information to be returned. One of the following:
- 'pdb': Returns the protein structure in PDB format (default).
+ 'pdb': Returns the protein structure in PDB format (default).
'entry': Information about PDB structures at the top level of PDB structure hierarchical data organization.
'pubmed': Get PubMed annotations (data integrated from PubMed) for a given entry's primary citation.
'assembly': Information about PDB structures at the quaternary structure level.
@@ -23,15 +23,15 @@ PDB ID to be queried, e.g. '7S7U'.
'branched_entity_instance': Get branched entity instance description (define chain ID as 'identifier').
'polymer_entity_instance': Get polymer entity instance (a.k.a chain) data (define chain ID as 'identifier').
'nonpolymer_entity_instance': Get non-polymer entity instance description (define chain ID as 'identifier').
-
+
`-i` `--identifier`
Can be used to define assembly, entity or chain ID (default: None). Assembly/entity IDs are numbers (e.g. 1), and chain IDs are letters (e.g. 'A').
-
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/7S7U.pdb or path/to/directory/7S7U_entry.json. Default: Standard out.
+
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/7S7U.pdb or path/to/directory/7S7U_entry.json. Default: Standard out.
Python: `save=True` will save the output in the current working directory.
-
-
+
+
### Examples
```bash
gget pdb 7S7U -o 7S7U.pdb
@@ -44,10 +44,10 @@ gget.pdb("7S7U", save=True)
**Find PDB crystal structures for a comparative analysis of protein structure:**
```bash
-# Find PDB IDs associated with an Ensembl ID
+# Find PDB IDs associated with an Ensembl ID
gget info ENSG00000130234
-# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs,
+# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs,
# you will likely find more PDB entries by BLASTing the sequence against the PDB.
# Get the amino acid sequence of a transcript from an Ensembl ID
@@ -61,10 +61,10 @@ gget pdb 7DQA -o 7DQA.pdb
gget pdb 7CT5 -o 7CT5.pdb
```
```python
-# Find PDB IDs associated with an Ensembl ID
+# Find PDB IDs associated with an Ensembl ID
gget.info("ENSG00000130234")
-# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs,
+# Alternatively: Since many entries in the PDB do not have linked Ensembl IDs,
# you will likely find more PDB entries by BLASTing the sequence against the PDB.
# Get the amino acid sequence of a transcript from an Ensembl ID
@@ -78,14 +78,12 @@ gget.pdb("7DQA", save=True)
gget.pdb("7CT5", save=True)
```
→ The use case above exemplifies how to find PDB files for comparative analysis of protein structure starting with Ensembl IDs or amino acid sequences. The fetched PDB files can also be compared to predicted structures generated by [`gget alphafold`](alphafold.md). PDB files can be viewed interactively in 3D [online](https://rcsb.org/3d-view), or using programs like [PyMOL](https://pymol.org/) or [Blender](https://www.blender.org/). To compare two PDB files, you can use [this website](https://rcsb.org/alignment).
-
+
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget pdb` in a publication, please cite the following articles:
+If you use `gget pdb` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Berman HM, Westbrook J, Feng Z, Gilliland G, Bhat TN, Weissig H, Shindyalov IN, Bourne PE. The Protein Data Bank. Nucleic Acids Res. 2000 Jan 1;28(1):235-42. doi: [10.1093/nar/28.1.235](https://doi.org/10.1093/nar/28.1.235). PMID: 10592235; PMCID: PMC102472.
-
-
diff --git a/docs/src/en/quick_start_guide.md b/docs/src/en/quick_start_guide.md
index 7a533de79..84ad90928 100644
--- a/docs/src/en/quick_start_guide.md
+++ b/docs/src/en/quick_start_guide.md
@@ -97,4 +97,3 @@ gget$pdb("1R42", save=TRUE)
gget$virus("Zika virus", host="Homo sapiens", nuc_completeness="complete")
```
#### [More examples](https://github.com/pachterlab/gget_examples)
-
diff --git a/docs/src/en/ref.md b/docs/src/en/ref.md
index 0e6cb63a3..42f0e6b15 100644
--- a/docs/src/en/ref.md
+++ b/docs/src/en/ref.md
@@ -9,7 +9,7 @@ Return format: dictionary/JSON.
`species`
Species for which the FTPs will be fetched in the format genus_species, e.g. homo_sapiens.
Supports all available vertebrate and invertebrate (plants, fungi, protists, and invertebrate metazoa) genomes from Ensembl, except bacteria.
-Note: Not required when using flags `--list_species` or `--list_iv_species`.
+Note: Not required when using flags `--list_species` or `--list_iv_species`.
Supported shortcuts: 'human', 'mouse', 'human_grch37' (accesses the GRCh37 genome assembly)
**Optional arguments**
@@ -26,34 +26,34 @@ Possible entries are one or a combination (as comma-separated list) of the follo
`-r` `--release`
Defines the Ensembl release number from which the files are fetched, e.g. 104. Default: latest Ensembl release.
-`-od` `--out_dir`
+`-od` `--out_dir`
Path to the directory where the FTPs will be saved, e.g. path/to/directory/. Default: Current working directory.
-`-o` `--out`
+`-o` `--out`
Path to the JSON file the results will be saved in, e.g. path/to/directory/results.json. Default: Standard out.
Python: `save=True` will save the output in the current working directory.
**Flags**
-`-l` `--list_species`
+`-l` `--list_species`
Lists all available vertebrate species. (Python: combine with `species=None`.)
-`-liv` `--list_iv_species`
+`-liv` `--list_iv_species`
Lists all available invertebrate species. (Python: combine with `species=None`.)
-`-ftp` `--ftp`
+`-ftp` `--ftp`
Returns only the requested FTP links.
-`-d` `--download`
+`-d` `--download`
Command-line only. Downloads the requested FTPs to the directory specified by `out_dir` (requires [curl](https://curl.se/docs/) to be installed).
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
-
-
+Python: Use `verbose=False` to prevent progress information from being displayed.
+
+
### Examples
-**Get the genome reference for a specific species:**
+**Get the genome reference for a specific species:**
```bash
gget ref -w gtf,dna homo_sapiens
```
@@ -93,7 +93,7 @@ gget ref --list_species -r 103
# Python
gget.ref(species=None, list_species=True, release=103)
```
-→ Returns a list with all available genomes (checks if GTF and FASTAs are available) from Ensembl release 103.
+→ Returns a list with all available genomes (checks if GTF and FASTAs are available) from Ensembl release 103.
(If no release is specified, `gget ref` will always return information from the latest Ensembl release.)
@@ -111,7 +111,7 @@ kb ref \
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget ref` in a publication, please cite the following articles:
+If you use `gget ref` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/search.md b/docs/src/en/search.md
index 9065530db..712a2dabc 100644
--- a/docs/src/en/search.md
+++ b/docs/src/en/search.md
@@ -2,33 +2,33 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget search 🔎
-Fetch genes and transcripts from [Ensembl](https://www.ensembl.org/) using free-form search terms.
+Fetch genes and transcripts from [Ensembl](https://www.ensembl.org/) using free-form search terms.
Results are matched based on the "gene name" and "description" sections in the Ensembl database. `gget` version >= 0.27.9 also includes results that match the Ensembl "synonym" section.
Return format: JSON (command-line) or data frame/CSV (Python).
**Positional argument**
-`searchwords`
+`searchwords`
One or more free form search words, e.g. gaba nmda. (Note: Search is not case-sensitive.)
-**Other required arguments**
+**Other required arguments**
`-s` `--species`
Species or database to be searched.
A species can be passed in the format 'genus_species', e.g. 'homo_sapiens' or 'arabidopsis_thaliana'.
To pass a specific database, pass the name of the CORE database, e.g. 'mus_musculus_dba2j_core_105_1'.
-
+
All available core databases can be found here:
Vertebrates: [http://ftp.ensembl.org/pub/current/mysql/](http://ftp.ensembl.org/pub/current/mysql/)
Invertebrates: [http://ftp.ensemblgenomes.org/pub/current/](http://ftp.ensemblgenomes.org/pub/current/) + select kingdom + go to mysql/
-
+
Supported shortcuts: 'human', 'mouse'
**Optional arguments**
-`-r` `--release`
+`-r` `--release`
Defines the Ensembl release number from which the files are fetched, e.g. 104. Default: None -> latest Ensembl release is used.
-
-Note: *The release argument does not apply to invertebrate species* (you can pass a specific core database (which includes a release number) to the `species` argument instead). For invertebrate species, Ensembl only stores databases from 10 releases prior to the current release.
-
-This argument is overwritten if a specific database (which includes a release number) is passed to the species argument.
+
+Note: *The release argument does not apply to invertebrate species* (you can pass a specific core database (which includes a release number) to the `species` argument instead). For invertebrate species, Ensembl only stores databases from 10 releases prior to the current release.
+
+This argument is overwritten if a specific database (which includes a release number) is passed to the species argument.
`-t` `--id_type`
'gene' (default) or 'transcript'
@@ -39,11 +39,11 @@ Returns genes or transcripts, respectively.
'or': Returns all genes that INCLUDE AT LEAST ONE of the searchwords in their name/description.
'and': Returns only genes that INCLUDE ALL of the searchwords in their name/description.
-`-l` `--limit`
+`-l` `--limit`
Limits the number of search results, e.g. 10. Default: None.
`-o` `--out`
-Path to the csv the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
+Path to the csv the results will be saved in, e.g. path/to/directory/results.csv (or .json). Default: Standard out.
Python: `save=True` will save the output in the current working directory.
**Flags**
@@ -51,15 +51,15 @@ Python: `save=True` will save the output in the current working directory.
Command-line only. Returns results in CSV format.
Python: Use `json=True` to return output in JSON format.
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
`wrap_text`
Python only. `wrap_text=True` displays data frame with wrapped text for easy reading (default: False).
-
-
-
+
+
+
### Example
```bash
gget search -s human gaba gamma-aminobutyric
@@ -74,13 +74,12 @@ gget.search(["gaba", "gamma-aminobutyric"], "homo_sapiens")
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|
| ENSG00000034713| GABARAPL2 | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | GABA type A receptor associated protein like 2 | protein_coding | https://uswest.ensembl.org/homo_sapiens/Gene/Summary?g=ENSG00000034713 |
| . . . | . . . | . . . | . . . | . . . | . . . |
-
+
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget search` in a publication, please cite the following articles:
+If you use `gget search` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
diff --git a/docs/src/en/seq.md b/docs/src/en/seq.md
index 1e7b3b897..bfe356c44 100644
--- a/docs/src/en/seq.md
+++ b/docs/src/en/seq.md
@@ -2,16 +2,16 @@
> Python arguments are equivalent to long-option arguments (`--arg`), unless otherwise specified. Flags are True/False arguments in Python. The manual for any gget tool can be called from the command-line using the `-h` `--help` flag.
# gget seq 🧬
-Fetch nucleotide or amino acid sequence(s) of a gene (and all its isoforms) or a transcript by Ensembl ID.
+Fetch nucleotide or amino acid sequence(s) of a gene (and all its isoforms) or a transcript by Ensembl ID.
Return format: FASTA.
**Positional argument**
-`ens_ids`
+`ens_ids`
One or more Ensembl IDs.
**Optional arguments**
-`-o` `--out`
-Path to the file the results will be saved in, e.g. path/to/directory/results.fa. Default: Standard out.
+`-o` `--out`
+Path to the file the results will be saved in, e.g. path/to/directory/results.fa. Default: Standard out.
Python: `save=True` will save the output in the current working directory.
**Flags**
@@ -20,11 +20,11 @@ Returns amino acid (instead of nucleotide) sequences.
Nucleotide sequences are fetched from [Ensembl](https://www.ensembl.org/).
Amino acid sequences are fetched from [UniProt](https://www.uniprot.org/).
-`-iso` `--isoforms`
+`-iso` `--isoforms`
Returns the sequences of all known transcripts.
(Only for gene IDs.)
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
Python: Use `verbose=False` to prevent progress information from being displayed.
@@ -52,7 +52,7 @@ gget.seq("ENSG00000034713", translate=True, isoforms=True)
#### [More examples](https://github.com/pachterlab/gget_examples)
# References
-If you use `gget seq` in a publication, please cite the following articles:
+If you use `gget seq` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/en/setup.md b/docs/src/en/setup.md
index 80752db11..e3ad9e1c3 100644
--- a/docs/src/en/setup.md
+++ b/docs/src/en/setup.md
@@ -15,12 +15,12 @@ gget module for which dependencies should be installed.
`-o` `--out`
Path to the folder downloaded files will be saved in (currently only applies to module = 'elm').
NOTE: Do NOT use this argument when downloading the files for use with `gget.elm`.
-Default: None (downloaded files are saved inside the `gget` package installation folder).
+Default: None (downloaded files are saved inside the `gget` package installation folder).
**Flags**
-`-q` `--quiet`
+`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
### Example
diff --git a/docs/src/en/updates.md b/docs/src/en/updates.md
index 14419bd29..0ff444f9d 100644
--- a/docs/src/en/updates.md
+++ b/docs/src/en/updates.md
@@ -6,7 +6,7 @@
- The `species` argument (both Python and command line) now accepts all five supported organisms; the CLI `choices`, help text, and docstrings list them.
- Added early validation of the `species` argument that raises a clear `ValueError` listing the supported species, instead of failing later inside the Census API call.
- Note: the new primate species require `census_version="2025-11-08"` (LTS) or newer.
-
+
**Version ≥ 0.30.6** (Jun 10, 2026):
- [`gget blat`](blat.md): Improved resilience against UCSC BLAT endpoint failures (fixes intermittently failing tests).
- Added retry-with-exponential-backoff for transient failures (HTTP 429/5xx, network errors, and non-JSON 200 responses caused by UCSC rate-limiting or HTML error pages). Up to 4 attempts with 1.5s → 3s → 6s backoff.
@@ -26,6 +26,11 @@
- `utils.get_uniprot_seqs`: Collect per-ID DataFrames in a list and `pd.concat(..., ignore_index=True)` once at the end, avoiding the O(n²) cost of growing a DataFrame inside the request loop.
- Cached `utils.find_latest_ens_rel`, `utils.search_species_options`, `utils.ref_species_options`, and `utils.find_nv_kingdom` with `functools.lru_cache`. These hit Ensembl FTP listings that are stable for a release; repeated calls within one Python process are now free.
- Added `utils.parallel_map`, a thin `ThreadPoolExecutor` wrapper for I/O-bound work. Used to fan out `utils.get_uniprot_seqs` across the input ID list — looking up N IDs is now bounded by ~`N / pool_size` UniProt round-trips instead of `N`. Pool size defaults to 8 and can be overridden via the `GGET_MAX_WORKERS` environment variable.
+- Developer tooling / packaging:
+ - Migrated packaging to a single `pyproject.toml` (the [hatchling](https://hatch.pypa.io/) build backend); removed `setup.py`, `setup.cfg`, `requirements.txt`, `dev-requirements.txt`, and `MANIFEST.in`. Runtime dependencies and the `test` dependency group are now declared in `pyproject.toml`.
+ - The minimum supported Python version is now **3.12**.
+ - Added a [pre-commit](https://pre-commit.com/) configuration (lint + format via [ruff](https://docs.astral.sh/ruff/), plus standard hygiene hooks). Run `prek run --all-files` (or `pre-commit run --all-files`) before opening a PR.
+ - Modernized the test CI to use [uv](https://docs.astral.sh/uv/) and run on pull requests, and added package-build-check and PyPI trusted-publishing workflows.
**Version ≥ 0.30.5** (May 23, 2026):
- [`gget opentargets`](opentargets.md): Rewrote this module to reflect the new Open Targets API structure
@@ -69,7 +74,7 @@
- [`gget pdb`](pdb.md): Added wwpdb mirror; falls back to rcsb if wwpdb fails.
- [`gget cellxgene`](cellxgene.md): Improved argument handling; frontend unchanged. Fixes [issue 181](https://github.com/pachterlab/gget/issues/181).
- [`gget setup`](setup.md)/[`gget alphafold`](alphafold.md): Fixed pip_cmd bug in gget.setup("alphafold")
-
+
**Version ≥ 0.29.2** (Jul 03, 2025):
- gget can now be installed using `uv pip install gget`
- All package metadata (version, author, description, etc.) is now managed in setup.cfg for full compatibility with modern tools like uv, pip, and PyPI
@@ -94,7 +99,7 @@
- Allow querying multiple genes at once.
- [`gget diamond`](diamond.md):
- Now supports translated alignment of nucleotide sequences to amino acid reference sequences using the `--translated` flag.
-- [`gget elm`](elm.md):
+- [`gget elm`](elm.md):
- Improved server error handling.
**Version ≥ 0.29.0** (Sep 25, 2024):
@@ -122,12 +127,12 @@
- [`gget ref`](./ref.md): Can now fetch the GRCh37 genome assembly using `species='human_grch37'`
- [`gget search`](./search.md): Adjust access of human data to the structure of Ensembl release 112 (fixes [issue 129](https://github.com/pachterlab/gget/issues/129))
-~~**Version ≥ 0.28.5** (May 29, 2024):~~
+~~**Version ≥ 0.28.5** (May 29, 2024):~~
- Yanked due to logging bug in `gget.setup("alphafold")` + inversion mutations in `gget mutate` only reverse the string instead of also computing the complementary strand
-
+
**Version ≥ 0.28.4** (January 31, 2024):
- [`gget setup`](./setup.md): Fix bug with filepath when running `gget.setup("elm")` on Windows OS.
-
+
**Version ≥ 0.28.3** (January 22, 2024):
- **[`gget search`](./search.md) and [`gget ref`](./ref.md) now also support fungi 🍄, protists 🌝, and invertebrate metazoa 🐝 🐜 🐌 🐙 (in addition to vertebrates and plants)**
- **New module: [`gget cosmic`](./cosmic.md)**
@@ -140,7 +145,7 @@
- [`gget setup`](./setup.md): Use the `out` argument to specify a directory the ELM database will be downloaded into. Completes [this feature request](https://github.com/pachterlab/gget/issues/119).
- [`gget diamond`](./diamond.md): The DIAMOND command is now run with `--ignore-warnings` flag, allowing niche sequences such as amino acid sequences that only contain nucleotide characters and repeated sequences. This is also true for DIAMOND alignments performed within [`gget elm`](./elm.md).
- **[`gget ref`](./ref.md) and [`gget search`](./search.md) back-end change: the current Ensembl release is fetched from the new [release file](https://ftp.ensembl.org/pub/VERSION) on the Ensembl FTP site to avoid errors during uploads of new releases.**
-- [`gget search`](./search.md):
+- [`gget search`](./search.md):
- FTP link results (`--ftp`) are saved in txt file format instead of json.
- Fix URL links to Ensembl gene summary for species with a subspecies name and invertebrates.
- [`gget ref`](./ref.md):
@@ -152,7 +157,7 @@
- Replace deprecated 'text' argument to find()-type methods whenever used with dependency `BeautifulSoup`
- [`gget elm`](elm.md): Remove false positive and true negative instances from returned results
- [`gget elm`](elm.md): Add `expand` argument
-
+
**Version ≥ 0.28.0** (November 5, 2023):
- Updated documentation of [`gget muscle`](./muscle.md) to add a tutorial on how to visualize sequences with varying sequence name lengths + slight change to returned visualization so it's a bit more robust to varying sequence names
- [`gget muscle`](./muscle.md) now also allows a list of sequences as input (as an alternative to providing the path to a FASTA file)
@@ -160,7 +165,7 @@
- [`gget seq`](./seq.md): Allow missing gene names (fixes [https://github.com/pachterlab/gget/issues/107](https://github.com/pachterlab/gget/issues/107))
- **[`gget enrichr`](enrichr.md): Use new arguments `kegg_out` and `kegg_rank` to create an image of the KEGG pathway with the genes from the enrichment analysis highlighted (thanks to [this PR](https://github.com/pachterlab/gget/pull/106) by [Noriaki Sato](https://github.com/noriakis))**
- **New modules: [`gget elm`](elm.md) and [`gget diamond`](diamond.md)**
-
+
**Version ≥ 0.27.9** (August 7, 2023):
- **[`gget enrichr`](enrichr.md): Use new argument `background_list` to provide a list of background genes**
- [`gget search`](search.md) now also searches [Ensembl](https://ensembl.org/) synonyms (in addition to gene descriptions and names) to return more comprehensive search results (thanks to [Samuel Klein](https://github.com/KleinSamuel) for the [suggestion](https://github.com/pachterlab/gget/issues/90))
@@ -185,11 +190,11 @@
**Version ≥ 0.27.4** (March 19, 2023):
- **New module: [`gget gpt`](gpt.md)**
-
+
**Version ≥ 0.27.3** (March 11, 2023):
- [`gget info`](info.md) excludes PDB IDs by default to increase speed (PDB results can be included using flag `--pdb` / `pdb=True`).
-**Version ≥ 0.27.2** (January 1, 2023):
+**Version ≥ 0.27.2** (January 1, 2023):
- Updated [`gget alphafold`](alphafold.md) to [DeepMind's AlphaFold v2.3.0](https://github.com/deepmind/alphafold/releases/tag/v2.3.0) (including new arguments `multimer_for_monomer` and `multimer_recycles`)
**Version ≥ 0.27.0** (December 10, 2022):
diff --git a/docs/src/en/virus.md b/docs/src/en/virus.md
index be2cc0900..c60f67087 100644
--- a/docs/src/en/virus.md
+++ b/docs/src/en/virus.md
@@ -21,12 +21,12 @@ Add `--is_accession` when passing an NCBI accession number. Add `--is_sars_cov2`
For SARS-CoV-2 and Alphainfluenza cached downloads, supports:
- Single accession: `NC_045512.2`
- - Space-separated list: `NC_045512.2 MN908947.3 MT020781.1`
+ - Space-separated list: `NC_045512.2 MN908947.3 MT020781.1`
- Text file path: `accessions.txt` (one accession per line)
Use flag `--download_all_accessions` to apply filters without searching for a specific virus.
-**Optional arguments**
+**Optional arguments**
_Host filters_
@@ -87,7 +87,7 @@ Command line: `--annotated true` to fetch only that have been annotated with gen
Python: `annotated=True` or `annotated=False` (`annotated=None` for no filter).
`--lab_passaged`
-'true' or 'false'. Filter for or against lab-passaged samples.
+'true' or 'false'. Filter for or against lab-passaged samples.
Command line: `--lab_passaged true` to fetch only lab-passaged samples, or `--lab_passaged false` to exclude them.
Python: `lab_passaged=True` or `lab_passaged=False` (`lab_passaged=None` for no filter).
@@ -205,8 +205,8 @@ Python: `merge_results=False`
`-a` `--is_accession`
Flag to indicate that the `virus` positional argument is an accession number, a space-separated list of accessions, or a path to a text file containing accession numbers (one per line).
-`--download_all_accessions`
-Use this flag when applying filters without searching for a specific virus (leave `virus` argument empty).
+`--download_all_accessions`
+Use this flag when applying filters without searching for a specific virus (leave `virus` argument empty).
⚠️ **WARNING**: If you do not specify additional filters, this flag downloads ALL available viral sequences from NCBI (entire Viruses taxonomy, taxon ID 10239). This is an extremely large dataset that can take many hours to download and require significant disk space. Use with caution and ensure you have adequate storage and bandwidth. When this flag is set, the `virus` argument is ignored.
`--is_sars_cov2`
@@ -227,7 +227,7 @@ Flag to keep all intermediate/temporary files generated during processing. By de
`-q` `--quiet`
Command-line only. Prevents progress information from being displayed.
-Python: Use `verbose=False` to prevent progress information from being displayed.
+Python: Use `verbose=False` to prevent progress information from being displayed.
### Example
@@ -278,8 +278,8 @@ gget virus "SARS-CoV-2" --host human --nuc_completeness complete --min_seq_lengt
import gget
gget.virus(
- "SARS-CoV-2",
- host="human",
+ "SARS-CoV-2",
+ host="human",
nuc_completeness="complete",
min_seq_length=29000,
genbank_metadata=True,
@@ -302,8 +302,8 @@ gget virus "Influenza A virus" --host human --nuc_completeness complete --max_se
import gget
gget.virus(
- "Influenza A virus",
- host="human",
+ "Influenza A virus",
+ host="human",
nuc_completeness="complete",
max_seq_length=15000,
genbank_metadata=True,
@@ -660,5 +660,3 @@ If you use `gget virus` in a publication, please cite the following articles:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- O’Leary, N.A., Cox, E., Holmes, J.B. et al (2024). Exploring and retrieving sequence and metadata for species across the tree of life with NCBI Datasets. Sci Data 11, 732. [https://doi.org/10.1038/s41597-024-03571-y](https://doi.org/10.1038/s41597-024-03571-y)
-
-
diff --git a/docs/src/es/alphafold.md b/docs/src/es/alphafold.md
index 7c1a67c9a..a93445385 100644
--- a/docs/src/es/alphafold.md
+++ b/docs/src/es/alphafold.md
@@ -13,9 +13,9 @@ Antes de usar `gget alphafold` por primera vez:
`conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0`
Para Python versión 3.11:
`conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0`
-
+
Recomendación: siga con `conda update -qy conda` para actualizar _conda_ a la última versión.
-3. Corre `gget setup alphafold` / `gget.setup("alphafold")` (ver también [`gget setup`](setup.md)). Al ejecutar `gget setup alphafold` / `gget.setup("alphafold")` se descargará e instalará la última versión de AlphaFold2 alojada en el [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). Puede volver a ejecutar este comando en cualquier momento para actualizar el software cuando hay una nueva versión de AlphaFold.
+3. Corre `gget setup alphafold` / `gget.setup("alphafold")` (ver también [`gget setup`](setup.md)). Al ejecutar `gget setup alphafold` / `gget.setup("alphafold")` se descargará e instalará la última versión de AlphaFold2 alojada en el [AlphaFold GitHub Repo](https://github.com/deepmind/alphafold). Puede volver a ejecutar este comando en cualquier momento para actualizar el software cuando hay una nueva versión de AlphaFold.
**Parámetro posicional**
`sequence`
@@ -26,17 +26,17 @@ Secuencia de aminoácidos (str), o una lista de secuencias (*gget alphafold auto
El algoritmo de multímero se reciclará hasta que las predicciones dejen de cambiar, el límite de ciclos está indicado aquí. Por defecto: 3
Para obtener más exactitud, ajusta este límite a 20 (al costo de ejecuciones más tardadas).
-`-o` `--out`
+`-o` `--out`
Ruta a la carpeta para guardar los resultados de la predicción (str). Por defecto: "./[fecha_tiempo]_gget_alphafold_prediction".
-
-**Banderas**
+
+**Banderas**
`-mfm` `--multimer_for_monomer`
Usa el algoritmo de multímero para un monómero.
-`-r` `--relax`
+`-r` `--relax`
Relaja el mejor modelo con el algoritmo AMBER.
-`-q` `--quiet`
+`-q` `--quiet`
Uso limitado para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False`.
@@ -45,8 +45,8 @@ Solo para Python. `plot=True` provée una visualización interactiva de la predi
`show_sidechains`
Solo para Python. `show_sidechains=True` incluye las cadenas laterales de proteínas en el esquema (por defecto: True).
-
-
+
+
### Ejemplo
```bash
# Predice la estructura de una proteína derivada de su secuencia de aminoácidos
@@ -82,12 +82,12 @@ gget.pdb("2K42", save=True)
### [🔗 gget alphafold - preguntas más frecuentes](https://github.com/pachterlab/gget/discussions/39)
-# Citar
+# Citar
Si utiliza `gget alphafold` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Jumper, J., Evans, R., Pritzel, A. et al. Highly accurate protein structure prediction with AlphaFold. Nature 596, 583–589 (2021). [https://doi.org/10.1038/s41586-021-03819-2](https://doi.org/10.1038/s41586-021-03819-2)
-Y, si corresponde:
+Y, si corresponde:
- Evans, R. et al. Protein complex prediction with AlphaFold-Multimer. bioRxiv 2021.10.04.463034; [https://doi.org/10.1101/2021.10.04.463034](https://doi.org/10.1101/2021.10.04.463034)
diff --git a/docs/src/es/archs4.md b/docs/src/es/archs4.md
index 80dabd91a..27e0eb66a 100644
--- a/docs/src/es/archs4.md
+++ b/docs/src/es/archs4.md
@@ -17,27 +17,27 @@ Alternativamente: usa la bandera `--ensembl` para ingresar un ID tipo Ensembl, p
'tissue' produce un atlas de expresión tisular calculado de todas las muestras humanas o de ratón (según lo definido usando el parámetro `--species` (especies)) en [ARCHS4](https://maayanlab.cloud/archs4/).
`-s` `--species`
-'human' (humano; se usa por defecto) o 'mouse' (ratón).
+'human' (humano; se usa por defecto) o 'mouse' (ratón).
Define si se usan muestras humanas o de ratón de [ARCHS4](https://maayanlab.cloud/archs4/).
(Solo aplica para el atlas de expresión tisular.)
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT).
Para Python, use `save=True` para guardar los resultados en el directorio de trabajo actual.
-
-**Banderas**
+
+**Banderas**
`-e` `--ensembl`
-Usa esta bandera si `gene` se ingresa como ID tipo Ensembl.
+Usa esta bandera si `gene` se ingresa como ID tipo Ensembl.
`-csv` `--csv`
-Solo para Terminal. Produce los resultados en formato CSV.
-Para Python, usa `json=True` para obtener los resultados en formato JSON.
+Solo para Terminal. Produce los resultados en formato CSV.
+Para Python, usa `json=True` para obtener los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
-
-
+
+
### Ejemplo
```bash
gget archs4 ACE2
@@ -49,10 +49,10 @@ gget.archs4("ACE2")
→ Produce los 100 genes más correlacionados con el gen ACE2:
| gene_symbol | pearson_correlation |
-| -------------- |-------------------------|
-| SLC5A1 | 0.579634 |
-| CYP2C18 | 0.576577 |
-| . . . | . . . |
+| -------------- |-------------------------|
+| SLC5A1 | 0.579634 |
+| CYP2C18 | 0.576577 |
+| . . . | . . . |
@@ -66,9 +66,9 @@ gget.archs4("ACE2", which="tissue")
→ Produce la expresión tisular de ACE2 (por defecto, se utilizan datos humanos):
| id | min | q1 | median | q3 | max |
-| ------ |--------| ------ |--------| ------ |--------|
+| ------ |--------| ------ |--------| ------ |--------|
| System.Urogenital/Reproductive System.Kidney.RENAL CORTEX | 0.113644 | 8.274060 | 9.695840 | 10.51670 | 11.21970 |
-| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 |
+| System.Digestive System.Intestine.INTESTINAL EPITHELIAL CELL | 0.113644 | 5.905560 | 9.570450 | 13.26470 | 13.83590 |
| . . . | . . . | . . . | . . . | . . . | . . . |
@@ -79,7 +79,7 @@ Consulte [este tutorial](https://davetang.org/muse/2023/05/16/check-where-a-gene
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget archs4` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/bgee.md b/docs/src/es/bgee.md
index aa13a4745..360458766 100644
--- a/docs/src/es/bgee.md
+++ b/docs/src/es/bgee.md
@@ -34,7 +34,7 @@ Python: Usa `json=True` para devolver la salida en formato JSON.
`-q` `--quiet`
Solo en línea de comandos. Evita que se muestre la información de progreso.
Python: Usa `verbose=False` para evitar que se muestre la información de progreso.
-
+
### Ejemplos
**Obtener ortólogos para un gen**
@@ -93,7 +93,7 @@ import gget
gget.bgee(["ENSBTAG00000047356", "ENSBTAG00000018317"], type="expression")
```
-→ Devuelve datos de expresión génica para los genes ENSBTAG00000047356 y ENSBTAG00000018317:
+→ Devuelve datos de expresión génica para los genes ENSBTAG00000047356 y ENSBTAG00000018317:
| anat_entity_id | anat_entity_name | score | score_confidence | expression_state |
|----------------|-----------------------------|-------|------------------|------------------|
@@ -102,10 +102,10 @@ gget.bgee(["ENSBTAG00000047356", "ENSBTAG00000018317"], type="expression")
| BGEE:0000000 | anatomical entity and cellular component | 89.12 | high | expressed |
| ... | ... | ... | ... | ... |
-
+
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget bgee` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/blast.md b/docs/src/es/blast.md
index 694a33645..722afca40 100644
--- a/docs/src/es/blast.md
+++ b/docs/src/es/blast.md
@@ -6,7 +6,7 @@ BLAST una secuencia de nucleótidos o aminoácidos a cualquier base de datos [BL
Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
**Parámetro posicional**
-`sequence`
+`sequence`
Secuencia de nucleótidos o aminoácidos, o una ruta a un archivo tipo FASTA o .txt.
**Parámetros opcionales**
@@ -25,7 +25,7 @@ Limita el número de resultados producidos. Por defecto: 50.
`-e` `--expect`
Define el umbral de ['expect value'](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD=Web&PAGE_TYPE=BlastDocs&DOC_TYPE=FAQ#expect). Por defecto: 10.0.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
@@ -37,16 +37,16 @@ Activa el ['low complexity filter'](https://blast.ncbi.nlm.nih.gov/Blast.cgi?CMD
Desactiva el algoritmo MegaBLAST. Por defecto: MegaBLAST está activado (solo aplicable para blastn).
`-csv` `--csv`
-Solo para Terminal. Produce los resultados en formato CSV.
+Solo para Terminal. Produce los resultados en formato CSV.
Para Python, usa `json=True` para producir los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
`wrap_text`
-Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False).
-
+Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False).
+
### Por ejemplo
```bash
gget blast MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQLPSEKAIFLFVDKTVPQSR
@@ -60,7 +60,7 @@ gget.blast("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRI
| Description | Scientific Name | Common Name | Taxid | Max Score | Total Score | Query Cover | ... |
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|---|---|
| PREDICTED: gamma-aminobutyric acid receptor-as...| Colobus angolensis palliatus | NaN | 336983 | 180 | 180 | 100% | ... |
-| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |
+| . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |
**BLAST desde un archivo .fa o .txt:**
@@ -75,7 +75,7 @@ gget.blast("fasta.fa")
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget blast` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/blat.md b/docs/src/es/blat.md
index 21109750c..44e154f35 100644
--- a/docs/src/es/blat.md
+++ b/docs/src/es/blat.md
@@ -2,32 +2,32 @@
> Parámetros de Python son iguales a los parámetros largos (`--parámetro`) de Terminal, si no se especifica de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manual para cualquier módulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget blat 🎯
-Encuentra la ubicación genómica de una secuencia de nucleótidos o aminoácidos usando [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat).
+Encuentra la ubicación genómica de una secuencia de nucleótidos o aminoácidos usando [BLAT](https://genome.ucsc.edu/cgi-bin/hgBlat).
Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
**Parámetro posicional**
-`sequence`
+`sequence`
Secuencia de nucleótidos o aminoácidos, o una ruta a un archivo tipo FASTA o .txt.
**Parámetros opcionales**
-`-st` `--seqtype`
-'DNA', 'protein', 'translated%20RNA', o 'translated%20DNA'.
+`-st` `--seqtype`
+'DNA', 'protein', 'translated%20RNA', o 'translated%20DNA'.
Por defecto: 'DNA' para secuencias de nucleótidos; 'protein' para secuencias de aminoácidos.
-`-a` `--assembly`
-Ensamblaje del genoma. 'human' (hg38) (se usa por defecto), 'mouse' (mm39) (ratón), 'zebrafish' (taeGut2) (pinzón cebra),
+`-a` `--assembly`
+Ensamblaje del genoma. 'human' (hg38) (se usa por defecto), 'mouse' (mm39) (ratón), 'zebrafish' (taeGut2) (pinzón cebra),
o cualquiera de los ensamblajes de especies disponibles [aquí](https://genome.ucsc.edu/cgi-bin/hgBlat) (use el nombre corto del ensamblado, p. ej. 'hg38').
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
-
+
**Banderas**
`-csv` `--csv`
-Solo para Terminal. Produce los resultados en formato CSV.
+Solo para Terminal. Produce los resultados en formato CSV.
Para Python, usa `json=True` para producir los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
@@ -40,7 +40,7 @@ gget blat -a taeGut2 MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQ
# Python
gget.blat("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQLPSEKAIFLFVDKTVPQSR", assembly="taeGut2")
```
-→ Produce los resultados de BLAT para el ensamblaje taeGut2 (pinzón cebra). En este ejemplo, `gget blat` automáticamente detecta esta secuencia como una secuencia de aminoácidos y, por lo tanto, establece el tipo de secuencia (`--seqtype`) como *proteína*.
+→ Produce los resultados de BLAT para el ensamblaje taeGut2 (pinzón cebra). En este ejemplo, `gget blat` automáticamente detecta esta secuencia como una secuencia de aminoácidos y, por lo tanto, establece el tipo de secuencia (`--seqtype`) como *proteína*.
| genome | query_size | aligned_start | aligned_end | matches | mismatches | %_aligned | ... |
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|---|---|
@@ -48,7 +48,7 @@ gget.blat("MKWMFKEDHSLEHRCVESAKIRAKYPDRVPVIVEKVSGSQIVDIDKRKYLVPSDITVAQFMWIIRKRIQ
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget blat` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/cbio.md b/docs/src/es/cbio.md
index dc22820ed..eb3aea1d4 100644
--- a/docs/src/es/cbio.md
+++ b/docs/src/es/cbio.md
@@ -104,7 +104,7 @@ gget.cbio_search(['esophag', 'ovary', 'ovarian'])
-**Graficar un mapa de calor de ocurrencias de mutaciones para genes específicos en un estudio específico:**
+**Graficar un mapa de calor de ocurrencias de mutaciones para genes específicos en un estudio específico:**
```bash
gget cbio plot \
-s msk_impact_2017 \
@@ -131,7 +131,7 @@ gget.cbio_plot(
-**Graficar un mapa de calor de tipos de mutaciones para genes específicos en un estudio específico:**
+**Graficar un mapa de calor de tipos de mutaciones para genes específicos en un estudio específico:**
```bash
gget cbio plot \
-s msk_impact_2017 \
@@ -217,19 +217,18 @@ gget.cbio_plot(
→ Guarda un mapa de calor de los tipos de mutaciones para los genes especificados en el estudio especificado, filtrado por tejido, con el título "Mutaciones intestinales" en ./gget_cbio_figures/intestinal_mutations.png.

-
+
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget cbio` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037.
-
+
- Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307.
-
+
- de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089.
-
-- Please also cite the source of the data if you are using a publicly available dataset.
+- Please also cite the source of the data if you are using a publicly available dataset.
diff --git a/docs/src/es/cellxgene.md b/docs/src/es/cellxgene.md
index a66ae41a0..35d99c837 100644
--- a/docs/src/es/cellxgene.md
+++ b/docs/src/es/cellxgene.md
@@ -13,7 +13,7 @@ Antes de usar `gget cellxgene` por primera vez, corre `gget setup cellxgene` / `
`-g` `--gene`
Str o lista de genes de interés o ID(s) tipo Ensembl. Por defecto: None (ninguno).
-Atención: Utilice la bandera `-e / --ensembl` (Python: `ensembl=True`) cuando ingrese ID(s) tipo Ensembl.
+Atención: Utilice la bandera `-e / --ensembl` (Python: `ensembl=True`) cuando ingrese ID(s) tipo Ensembl.
Atención: ¡Los símbolos de genes distinguen mayúsculas y minúsculas! Usa la capitalización canónica al pasar símbolos de genes; p. ej., ‘PAX7’ (humano), ‘Pax7’ (ratón).
Ver https://cellxgene.cziscience.com/gene-expression para ejemplos de genes.
@@ -22,21 +22,21 @@ Versión del CZ CELLxGENE Discover Census (str), p. ej. "2023-05-15", o "latest"
`-cn` `--column_names`
Lista de columnas de metadatos a obtener (almacenadas en AnnData.obs).
-Por defecto: ['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type']
+Por defecto: ['dataset_id', 'assay', 'suspension_type', 'sex', 'tissue_general', 'tissue', 'cell_type']
Para más opciones, ver: https://api.cellxgene.cziscience.com/curation/ui/#/ -> 'Schemas' -> 'dataset'
-`-o` `--out`
+`-o` `--out`
Ruta al archivo para guardar el objeto AnnData formato .h5ad (o .csv con bandera `-mo / --meta_only`).
¡Requerido cuando se usa desde Terminal!
**Banderas**
`-e` `--ensembl`
-Usa esta bandera si `gene` se ingresa como ID tipo Ensembl.
+Usa esta bandera si `gene` se ingresa como ID tipo Ensembl.
`-mo` `--meta_only`
Solo produce la tabla (Dataframe) con metadatos (corresponde a AnnData.obs).
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
@@ -70,7 +70,7 @@ Str o lista de tejido(s) del tipo high-level. Por defecto: None.
Tejidos y sus IDs de UBERON se enumeran [aquí](https://github.com/chanzuckerberg/single-cell-data-portal/blob/9b94ccb0a2e0a8f6182b213aa4852c491f6f6aff/backend/wmg/data/tissue_mapper.py).
`--tissue_ontology_term_id`
-Str o lista de ID(s) de 'tissue ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None.
+Str o lista de ID(s) de 'tissue ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None.
`--assay_ontology_term_id`
Str o lista de ID(s) de 'assay ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None.
@@ -81,7 +81,7 @@ Str o lista de 'assays' (métodos) como están definidos en el [esquema de datos
`--cell_type_ontology_term_id`
Str o lista de ID(s) de 'celltype ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None.
-`--development_stage_ontology_term_id`
+`--development_stage_ontology_term_id`
Str o lista de ID(s) de 'development stage ontology term' como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None.
`--disease_ontology_term_id`
@@ -102,7 +102,7 @@ Str o lista de ID(s) de 'sex ontology' como están definidos en el [esquema de d
`--suspension_type`
Str o lista de tipo(s) de suspensión como están definidos en el [esquema de datos del CELLxGENE](https://github.com/chanzuckerberg/single-cell-curation/tree/main/schema). Por defecto: None.
-
+
### Ejemplo
```bash
gget cellxgene --gene ACE2 ABCA1 SLC5A1 --tissue lung --cell_type 'mucus secreting cell' 'neuroendocrine cell' -o example_adata.h5ad
@@ -139,7 +139,7 @@ df
Ver también: [https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html](https://chanzuckerberg.github.io/cellxgene-census/notebooks/api_demo/census_gget_demo.html)
-# Citar
+# Citar
Si utiliza `gget cellxgene` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/cite.md b/docs/src/es/cite.md
index 74d4449c7..62a15d272 100644
--- a/docs/src/es/cite.md
+++ b/docs/src/es/cite.md
@@ -4,7 +4,7 @@
# Citar
-Si utiliza `gget` en una publicación, favor de citar:
+Si utiliza `gget` en una publicación, favor de citar:
Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836
- Si utiliza `gget alphafold`, favor de citar también:
@@ -20,7 +20,7 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data
- Si utiliza `gget bgee`, favor de citar también:
- Frederic B Bastian, Julien Roux, Anne Niknejad, Aurélie Comte, Sara S Fonseca Costa, Tarcisio Mendes de Farias, Sébastien Moretti, Gilles Parmentier, Valentine Rech de Laval, Marta Rosikiewicz, Julien Wollbrett, Amina Echchiki, Angélique Escoriza, Walid H Gharib, Mar Gonzales-Porta, Yohan Jarosz, Balazs Laurenczy, Philippe Moret, Emilie Person, Patrick Roelli, Komal Sanjeev, Mathieu Seppey, Marc Robinson-Rechavi (2021). The Bgee suite: integrated curated expression atlas and comparative transcriptomics in animals. Nucleic Acids Research, Volume 49, Issue D1, 8 January 2021, Pages D831–D847, [https://doi.org/10.1093/nar/gkaa793](https://doi.org/10.1093/nar/gkaa793)
-
+
- Si utiliza `gget blast`, favor de citar también:
- Altschul SF, Gish W, Miller W, Myers EW, Lipman DJ. Basic local alignment search tool. J Mol Biol. 1990 Oct 5;215(3):403-10. doi: 10.1016/S0022-2836(05)80360-2. PMID: 2231712.
@@ -29,16 +29,16 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data
- Si utiliza `gget cbio`, favor de citar también:
- Cerami E, Gao J, Dogrusoz U, Gross BE, Sumer SO, Aksoy BA, Jacobsen A, Byrne CJ, Heuer ML, Larsson E, Antipin Y, Reva B, Goldberg AP, Sander C, Schultz N. The cBio cancer genomics portal: an open platform for exploring multidimensional cancer genomics data. Cancer Discov. 2012 May;2(5):401-4. doi: [10.1158/2159-8290.CD-12-0095](https://doi.org/10.1158/2159-8290.cd-12-0095). Erratum in: Cancer Discov. 2012 Oct;2(10):960. PMID: 22588877; PMCID: PMC3956037.
-
+
- Gao J, Aksoy BA, Dogrusoz U, Dresdner G, Gross B, Sumer SO, Sun Y, Jacobsen A, Sinha R, Larsson E, Cerami E, Sander C, Schultz N. Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal. 2013 Apr 2;6(269):pl1. doi: [10.1126/scisignal.2004088](https://doi.org/10.1126/scisignal.2004088). PMID: 23550210; PMCID: PMC4160307.
-
+
- de Bruijn I, Kundra R, Mastrogiacomo B, Tran TN, Sikina L, Mazor T, Li X, Ochoa A, Zhao G, Lai B, Abeshouse A, Baiceanu D, Ciftci E, Dogrusoz U, Dufilie A, Erkoc Z, Garcia Lara E, Fu Z, Gross B, Haynes C, Heath A, Higgins D, Jagannathan P, Kalletla K, Kumari P, Lindsay J, Lisman A, Leenknegt B, Lukasse P, Madela D, Madupuri R, van Nierop P, Plantalech O, Quach J, Resnick AC, Rodenburg SYA, Satravada BA, Schaeffer F, Sheridan R, Singh J, Sirohi R, Sumer SO, van Hagen S, Wang A, Wilson M, Zhang H, Zhu K, Rusk N, Brown S, Lavery JA, Panageas KS, Rudolph JE, LeNoue-Newton ML, Warner JL, Guo X, Hunter-Zinck H, Yu TV, Pilai S, Nichols C, Gardos SM, Philip J; AACR Project GENIE BPC Core Team, AACR Project GENIE Consortium; Kehl KL, Riely GJ, Schrag D, Lee J, Fiandalo MV, Sweeney SM, Pugh TJ, Sander C, Cerami E, Gao J, Schultz N. Analysis and Visualization of Longitudinal Genomic and Clinical Data from the AACR Project GENIE Biopharma Collaborative in cBioPortal. Cancer Res. 2023 Dec 1;83(23):3861-3867. doi: [10.1158/0008-5472.CAN-23-0816](https://doi.org/10.1158/0008-5472.CAN-23-0816). PMID: 37668528; PMCID: PMC10690089.
-
+
- Please also cite the source of the data if you are using a publicly available dataset.
-
+
- Si utiliza `gget cellxgene`, favor de citar también:
- Chanzuckerberg Initiative. (n.d.). CZ CELLxGENE Discover. Retrieved [insert date here], from [https://cellxgene.cziscience.com/](https://cellxgene.cziscience.com/)
-
+
- Si utiliza `gget cosmic`, favor de citar también:
- Tate JG, Bamford S, Jubb HC, Sondka Z, Beare DM, Bindal N, Boutselakis H, Cole CG, Creatore C, Dawson E, Fish P, Harsha B, Hathaway C, Jupe SC, Kok CY, Noble K, Ponting L, Ramshaw CC, Rye CE, Speedy HE, Stefancsik R, Thompson SL, Wang S, Ward S, Campbell PJ, Forbes SA. COSMIC: the Catalogue Of Somatic Mutations In Cancer. Nucleic Acids Res. 2019 Jan 8;47(D1):D941-D947. doi: [10.1093/nar/gky1015](https://doi.org/10.1093/nar/gky1015). PMID: 30371878; PMCID: PMC6323903.
@@ -47,41 +47,41 @@ Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference data
- Si utiliza `gget elm`, favor de citar también:
- Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, Bioinformatics, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095)
-
+
- Manjeet Kumar, Sushama Michael, Jesús Alvarado-Valverde, Bálint Mészáros, Hugo Sámano‐Sánchez, András Zeke, Laszlo Dobson, Tamas Lazar, Mihkel Örd, Anurag Nagpal, Nazanin Farahi, Melanie Käser, Ramya Kraleti, Norman E Davey, Rita Pancsa, Lucía B Chemes, Toby J Gibson, The Eukaryotic Linear Motif resource: 2022 release, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D497–D508, [https://doi.org/10.1093/nar/gkab975](https://doi.org/10.1093/nar/gkab975)
-
-- Si utiliza `gget enrichr`, favor de citar también:
- - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128)
- - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
+- Si utiliza `gget enrichr`, favor de citar también:
+ - Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128)
+
+ - Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
- Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90).
-
+
Si trabaja con conjuntos de datos no humanos/ratón, cite también:
- Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483.
- Si utiliza `gget info`, favor de citar también:
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890.
-
+
- The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
- Si utiliza `gget muscle`, favor de citar también:
- Edgar RC (2021), MUSCLE v5 enables improved estimates of phylogenetic tree confidence by ensemble bootstrapping, bioRxiv 2021.06.20.449169. [https://doi.org/10.1101/2021.06.20.449169](https://doi.org/10.1101/2021.06.20.449169)
-
+
- Si utiliza `gget opentargets`, favor de citar también:
- Ochoa D, Hercules A, Carmona M, Suveges D, Baker J, Malangone C, Lopez I, Miranda A, Cruz-Castillo C, Fumis L, Bernal-Llinares M, Tsukanov K, Cornu H, Tsirigos K, Razuvayevskaya O, Buniello A, Schwartzentruber J, Karim M, Ariano B, Martinez Osorio RE, Ferrer J, Ge X, Machlitt-Northen S, Gonzalez-Uriarte A, Saha S, Tirunagari S, Mehta C, Roldán-Romero JM, Horswell S, Young S, Ghoussaini M, Hulcoop DG, Dunham I, McDonagh EM. The next-generation Open Targets Platform: reimagined, redesigned, rebuilt. Nucleic Acids Res. 2023 Jan 6;51(D1):D1353-D1359. doi: [10.1093/nar/gkac1046](https://doi.org/10.1093/nar/gkac1046). PMID: 36399499; PMCID: PMC9825572.
-
+
- Si utiliza `gget pdb`, favor de citar también:
- Berman HM, Westbrook J, Feng Z, Gilliland G, Bhat TN, Weissig H, Shindyalov IN, Bourne PE. The Protein Data Bank. Nucleic Acids Res. 2000 Jan 1;28(1):235-42. doi: [10.1093/nar/28.1.235](https://doi.org/10.1093/nar/28.1.235). PMID: 10592235; PMCID: PMC102472.
- Si utiliza `gget ref` o `gget search`, favor de citar también:
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- Si utiliza `gget seq`, favor de citar también:
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- The UniProt Consortium , UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
___
diff --git a/docs/src/es/contributing.md b/docs/src/es/contributing.md
index 81a677182..be40cbc17 100644
--- a/docs/src/es/contributing.md
+++ b/docs/src/es/contributing.md
@@ -51,7 +51,7 @@ Confirme sus cambios una vez que esté satisfecho con ellos.
- Los parámetros para la Terminal se definen en ./gget/main.py
8. Agregue módulos/argumentos nuevos a la documentación, si corresponde:
- El manual de cada módulo se puede agregar/editar en `./docs/src/en/*.md` (la versión en español de la documentación en `./docs/src/es/*.md` se genera/actualiza automáticamente, y no necesita ser editada manualmente)
-
+
Si tiene alguna pregunta, no dude en iniciar una [discusión](https://github.com/pachterlab/gget/discussions) o crear un Issue como se describe anteriormente.
### Crear un Pull Request (PR)
diff --git a/docs/src/es/cosmic.md b/docs/src/es/cosmic.md
index 41e631a0c..e779cb1a8 100644
--- a/docs/src/es/cosmic.md
+++ b/docs/src/es/cosmic.md
@@ -1,9 +1,9 @@
[ Ver el codigo fuente de la pagina en GitHub ](https://github.com/pachterlab/gget/blob/main/docs/src/es/cosmic.md)
-> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Las banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
+> Parámetros de Python son iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Las banderas son parámetros de verdadero o falso (True/False) en Python. El manual para cualquier módulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget cosmic 🪐
Busca genes, mutaciones y otros factores asociados con el cáncer utilizando la base de datos [COSMIC](https://cancer.sanger.ac.uk/cosmic) (Catalogue Of Somatic Mutations In Cancer).
-Formato de retorno: JSON (línea de comandos) o data frame/CSV (Python) cuando `download_cosmic=False`. Cuando `download_cosmic=True`, se descarga la base de datos solicitada en la carpeta especificada.
+Formato de retorno: JSON (línea de comandos) o data frame/CSV (Python) cuando `download_cosmic=False`. Cuando `download_cosmic=True`, se descarga la base de datos solicitada en la carpeta especificada.
Este módulo fue escrito originalmente en parte por [@AubakirovArman](https://github.com/AubakirovArman) (consultas de información) y [@josephrich98](https://github.com/josephrich98) (descarga de bases de datos).
@@ -143,13 +143,9 @@ gget.cosmic("EGFR", cosmic_tsv_path="Cosmic_MutantCensus_Tsv_v101_GRCh37/Cosmic_
| ... | ... | ... | ... | ... | ... | ... |
-# Citar
+# Citar
Si utiliza `gget cosmic` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Tate JG, Bamford S, Jubb HC, Sondka Z, Beare DM, Bindal N, Boutselakis H, Cole CG, Creatore C, Dawson E, Fish P, Harsha B, Hathaway C, Jupe SC, Kok CY, Noble K, Ponting L, Ramshaw CC, Rye CE, Speedy HE, Stefancsik R, Thompson SL, Wang S, Ward S, Campbell PJ, Forbes SA. COSMIC: the Catalogue Of Somatic Mutations In Cancer. Nucleic Acids Res. 2019 Jan 8;47(D1):D941-D947. doi: [10.1093/nar/gky1015](https://doi.org/10.1093/nar/gky1015). PMID: 30371878; PMCID: PMC6323903.
-
-
-
-
diff --git a/docs/src/es/dependents.md b/docs/src/es/dependents.md
index 4013cc770..1b03d7f78 100644
--- a/docs/src/es/dependents.md
+++ b/docs/src/es/dependents.md
@@ -22,8 +22,8 @@ Las siguientes aplicaciones usan *gget*:
- [https://mcpservers.org/servers/longevity-genie/holy-bio-mcp](https://mcpservers.org/servers/longevity-genie/holy-bio-mcp)
- [https://biocontext.ai](https://biocontext.ai/registry/longevity-genie/gget-mcp)
- [https://mcpmarket.com/zh/tools/skills/gget-bioinformatics-tool](https://mcpmarket.com/zh/tools/skills/gget-bioinformatics-tool)
-- [PantheonOS](https://pantheonos.stanford.edu/)
- Un sistema evolutivo de agentes biológicos multiagente diseñado para conciliar la generalidad con la especificidad de dominio, desarrollado en Stanford.
+- [PantheonOS](https://pantheonos.stanford.edu/)
+ Un sistema evolutivo de agentes biológicos multiagente diseñado para conciliar la generalidad con la especificidad de dominio, desarrollado en Stanford.
> "Acceso a bases de datos: utilizando las habilidades de **gget**, iSeq y cellxgene para acceder a una variedad de bases de datos, incluyendo SRA, GEO, Ensembl, UniProt, UCSC, Enrichr y CZI cellxgene."
- [Biomni](https://biomni.stanford.edu/environment)
Un agente de inteligencia artificial biomédica de propósito general que se está desarrollando en Stanford y Genentech.
@@ -32,7 +32,7 @@ Las siguientes aplicaciones usan *gget*:
> "Las herramientas se agrupan en familias como literatura [...], genómica (biopython, **gget**) y aprendizaje automático (rdkit, pymol)."
- [PerTurboAgent](https://www.biorxiv.org/content/10.1101/2025.05.25.656020v1)
Un agente de auto-planificación para potenciar experimentos secuenciales de Perturb-seq.
- > "Nosotros [...] usamos los paquetes **gget** y blitzgsea para análisis de enriquecimiento de datos"
+ > "Nosotros [...] usamos los paquetes **gget** y blitzgsea para análisis de enriquecimiento de datos"
- [Habilidades científicas para Claude](https://github.com/K-Dense-AI/claude-scientific-skills), desarrolladas por K-Dense-AI
> " Este repositorio contiene 138 habilidades científicas organizadas en múltiples dominios. Cada habilidad proporciona documentación completa, ejemplos de código y mejores prácticas para trabajar con librerías científicas, bases de datos y herramientas.
> 🧬 Bioinformática y Genómica
@@ -92,7 +92,7 @@ ____
- Shanmugampillai Jeyarajaguru Kabilan et al., [Molecular modelling approaches for the identification of potent Sodium-Glucose Cotransporter 2 inhibitors from Boerhavia diffusa for the potential treatment of chronic kidney disease.](https://doi.org/10.21203/rs.3.rs-4520611/v1) *Journal of Computer-Aided Molecular Design (en revisión)* (2024). DOI: 10.21203/rs.3.rs-4520611/v1
- Joseph M Rich et al., [The impact of package selection and versioning on single-cell RNA-seq analysis.](https://pmc.ncbi.nlm.nih.gov/articles/PMC11014608/#:~:text=10.1101/2024.04.04.588111) *bioRxiv* (2024). DOI: 10.1101/2024.04.04.588111
- Sanjay C. Nagi et al., [AnoPrimer: Primer Design in malaria vectors informed by range-wide genomic variation.](https://wellcomeopenresearch.org/articles/9-255/v1) *Wellcome Open Research* (2024).
-- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029
+- Yasmin Makki Mohialden et al., [A survey of the most recent Python packages for use in biology.](http://dx.doi.org/10.48047/NQ.2023.21.2.NQ23029) *NeuroQuantology* (2023). DOI: 10.48047/NQ.2023.21.2.NQ23029
- Nicola A. Kearns et al., [Generation and molecular characterization of human pluripotent stem cell-derived pharyngeal foregut endoderm.](https://doi.org/10.1016/j.devcel.2023.08.024) *Cell Reports* (2023). DOI: 10.1016/j.devcel.2023.08.024
- Jonathan Rosenski et al., [Predicting gene knockout effects from expression data.](https://link.springer.com/article/10.1186/s12920-023-01446-6) *BMC Medical Genomics* (2023). DOI: 10.1186/s12920-023-01446-6
- Peter Overby et al., [Pharmacological or genetic inhibition of Scn9a protects beta-cells while reducing insulin secretion in type 1 diabetes.](https://doi.org/10.1101/2023.06.11.544521) *bioRxiv* (2023). DOI: 10.1101/2023.06.11.544521
diff --git a/docs/src/es/diamond.md b/docs/src/es/diamond.md
index ec8d4a802..6a6e85aaf 100644
--- a/docs/src/es/diamond.md
+++ b/docs/src/es/diamond.md
@@ -2,12 +2,12 @@
> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Las banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget diamond 💎
-Alinee múltiples proteínas o secuencias de ADN traducidas usando [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND es similar a BLAST, pero este es un cálculo local).
+Alinee múltiples proteínas o secuencias de ADN traducidas usando [DIAMOND](https://www.nature.com/articles/nmeth.3176) (DIAMOND es similar a BLAST, pero este es un cálculo local).
Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
**Parámetro posicional**
`query`
-Secuencia(s) (str o lista) de aminoácidos, o una ruta a un archivo tipo FASTA.
+Secuencia(s) (str o lista) de aminoácidos, o una ruta a un archivo tipo FASTA.
**Parámetro requerido**
`-ref` `--reference`
@@ -20,7 +20,7 @@ Por defecto: None -> El archivo de base de datos DIAMOND temporal se eliminará
`-s` `--sensitivity`
Sensibilidad de la alineación (str). Por defecto: "very-sensitive" (muy sensible).
-Uno de los siguientes: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, or ultra-sensitive.
+Uno de los siguientes: fast, mid-sensitive, sensitive, more-sensitive, very-sensitive, o ultra-sensitive.
`-t` `--threads`
Número de hilos de procesamiento utilizados (int). Por defecto: 1.
@@ -28,18 +28,18 @@ Número de hilos de procesamiento utilizados (int). Por defecto: 1.
`-db` `--diamond_binary`
Ruta al binario DIAMOND (str). Por defecto: None -> Utiliza el binario DIAMOND instalado automáticamente con `gget`.
-`-o` `--out`
-Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/directorio". Por defecto: salida estándar (STDOUT); los archivos temporales se eliminan.
+`-o` `--out`
+Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/directorio". Por defecto: salida estándar (STDOUT); los archivos temporales se eliminan.
**Banderas**
`-u` `--uniprot`
-Use esta bandera cuando `sequence` es un ID de Uniprot en lugar de una secuencia de aminoácidos.
+Use esta bandera cuando `sequence` es un ID de Uniprot en lugar de una secuencia de aminoácidos.
`-csv` `--csv`
-Solo para Terminal. Produce los resultados en formato CSV.
+Solo para Terminal. Produce los resultados en formato CSV.
Para Python, usa `json=True` para producir los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
@@ -63,7 +63,7 @@ gget.diamond(["GGETISAWESQME", "ELVISISALIVE", "LQVEFRANKLIN", "PACHTERLABRQCKS"
#### [Màs ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget diamond` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/elm.md b/docs/src/es/elm.md
index dc32f739f..b5ce56ad8 100644
--- a/docs/src/es/elm.md
+++ b/docs/src/es/elm.md
@@ -2,12 +2,12 @@
> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget elm 🎭
-Prediga localmente motivos lineales eucarióticos (ELMs) a partir de una secuencia de aminoácidos o UniProt Acc utilizando datos de la [base de datos ELM](http://elm.eu.org/).
-Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). Este módulo devuelve dos tipos de resultados (ver ejemplos).
+Prediga localmente motivos lineales eucarióticos (ELMs) a partir de una secuencia de aminoácidos o UniProt Acc utilizando datos de la [base de datos ELM](http://elm.eu.org/).
+Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python). Este módulo devuelve dos tipos de resultados (ver ejemplos).
**Los datos de ELM se pueden descargar y distribuir para uso no comercial de acuerdo con el [acuerdo de licencia de software de ELM](http://elm.eu.org/media/Elm_academic_license.pdf).**
-Antes de usar `gget elm` por primera vez, ejecute `gget setup elm` / `gget.setup("elm")` una vez (consulte también [`gget setup`](setup.md)).
+Antes de usar `gget elm` por primera vez, ejecute `gget setup elm` / `gget.setup("elm")` una vez (consulte también [`gget setup`](setup.md)).
**Parámetro posicional**
`sequence`
@@ -25,21 +25,21 @@ Número de hilos de procesamiento utilizados en la alineación de secuencias con
`-bin` `diamond_binary`
Ruta al binario DIAMOND (str). Por defecto: None -> Utiliza el binario DIAMOND instalado automáticamente con `gget`.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados (str), p. ej. "ruta/al/directorio". Por defecto: salida estándar (STDOUT); los archivos temporales se eliminan.
**Banderas**
`-u` `--uniprot`
-Use esta bandera cuando `sequence` es una Uniprot Acc en lugar de una secuencia de aminoácidos.
+Use esta bandera cuando `sequence` es una Uniprot Acc en lugar de una secuencia de aminoácidos.
`-e` `--expand`
Amplíe la información devuelta en el marco de datos de expresiones regulares para incluir los nombres de proteínas, los organismos y las referencias en las que se validó originalmente el motivo.
`-csv` `--csv`
-Solo para Terminal. Produce los resultados en formato CSV.
+Solo para Terminal. Produce los resultados en formato CSV.
Para Python, usa `json=True` para producir los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
@@ -54,8 +54,8 @@ gget elm -o gget_elm_results LIAQSIGQASFV
gget.setup("elm") # Descarga/actualiza la base de datos ELM local
ortholog_df, regex_df = gget.elm("LIAQSIGQASFV")
```
-
-Encuentre ELM que proporcionen a una UniProt Acc:
+
+Encuentre ELMs a partir de una UniProt Acc:
```bash
gget setup elm # Descarga/actualiza la base de datos ELM local
gget elm -o gget_elm_results --uniprot Q02410 -e
@@ -68,14 +68,14 @@ ortholog_df, regex_df = gget.elm("Q02410", uniprot=True, expand=True)
→ Produce dos resultados con información extensa sobre ELMs asociados con proteínas ortólogas y motivos encontrados en la secuencia de entrada directamente en función de sus expresiones regex:
ortholog_df:
-
+
|Ortholog_UniProt_Acc|ProteinName|class_accession|ELMIdentifier |FunctionalSiteName |Description |Organism |… |
|:-----------------:|:---------:|:-------------:|:-------------:|:-----------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------:|:----------:|:-:|
|Q02410 |APBA1_HUMAN|ELME000357 |LIG_CaMK_CASK_1|CASK CaMK domain binding ligand motif|Motif that mediates binding to the calmodulin-dependent protein kinase (CaMK) domain of the peripheral plasma membrane protein CASK/Lin2.|Homo sapiens|… |
|Q02410 |APBA1_HUMAN|ELME000091 |LIG_PDZ_Class_2|PDZ domain ligands |The C-terminal class 2 PDZ-binding motif is classically represented by a pattern such as |Homo sapiens|… |
regex_df:
-
+
|Instance_accession|ELMIdentifier |FunctionalSiteName |ELMType|Description |Instances (Matched Sequence)|Organism |… |
|:----------------:|:----------------:|:-----------------------------:|:-----:|:-----------------------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------:|:----------------------------:|:-:|
|ELME000321 |CLV_C14_Caspase3-7|Caspase cleavage motif |CLV |Caspase-3 and Caspase-7 cleavage site. |ERSDG |Mus musculus |… |
@@ -89,9 +89,8 @@ regex_df:
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget elm` en una publicación, favor de citar los siguientes artículos:
- Laura Luebbert, Chi Hoang, Manjeet Kumar, Lior Pachter, Fast and scalable querying of eukaryotic linear motifs with gget elm, _Bioinformatics_, 2024, btae095, [https://doi.org/10.1093/bioinformatics/btae095](https://doi.org/10.1093/bioinformatics/btae095)
- Manjeet Kumar, Sushama Michael, Jesús Alvarado-Valverde, Bálint Mészáros, Hugo Sámano‐Sánchez, András Zeke, Laszlo Dobson, Tamas Lazar, Mihkel Örd, Anurag Nagpal, Nazanin Farahi, Melanie Käser, Ramya Kraleti, Norman E Davey, Rita Pancsa, Lucía B Chemes, Toby J Gibson, The Eukaryotic Linear Motif resource: 2022 release, Nucleic Acids Research, Volume 50, Issue D1, 7 January 2022, Pages D497–D508, [https://doi.org/10.1093/nar/gkab975](https://doi.org/10.1093/nar/gkab975)
-
diff --git a/docs/src/es/enrichr.md b/docs/src/es/enrichr.md
index 7ab6c6bc2..f633e76f7 100644
--- a/docs/src/es/enrichr.md
+++ b/docs/src/es/enrichr.md
@@ -4,7 +4,7 @@
# gget enrichr 💰
Realice un análisis de enriquecimiento de una lista de genes utilizando [Enrichr](https://maayanlab.cloud/Enrichr/).
Produce: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
-
+
**Parámetro posicional**
`genes`
Lista de nombres cortos (símbolos) de los genes de interés para realizar el análisis de enriquecimiento, p. ej. PHF14 RBM3 MSL1 PHF21A.
@@ -17,10 +17,10 @@ Admite cualquier base de datos enumerada [aquí](https://maayanlab.cloud/Enrichr
'pathway' (KEGG_2021_Human)
'transcription' (ChEA_2016)
'ontology' (GO_Biological_Process_2021)
-'diseases_drugs' (GWAS_Catalog_2019)
+'diseases_drugs' (GWAS_Catalog_2019)
'celltypes' (PanglaoDB_Augmented_2021)
'kinase_interactions' (KEA_2015)
-
+
**Parámetros opcionales**
`-s` `--species`
Especies a utilizar como referencia para el análisis de enriquecimiento. (Por defecto: human)
@@ -39,12 +39,12 @@ Opciones:
Lista de nombres cortos (símbolos) de genes de 'background' (de fondo/control), p. ej. NSUN3 POLRMT NLRX1.
Alternativamente: usa la bandera `--ensembl_background` para ingresar IDs tipo Ensembl.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
`-ko` `--kegg_out`
-Ruta al archivo png en el que se guardará la imágen de la vía de señalización celular KEGG, p. ej. ruta/al/directorio/KEGG.png. (Por defecto: None)
+Ruta al archivo png en el que se guardará la imagen de la vía de señalización celular KEGG, p. ej. ruta/al/directorio/KEGG.png. (Por defecto: None)
`-kr` `--kegg_rank`
Rango de la ruta KEGG que se va a trazar. (Por defecto: 1)
@@ -52,33 +52,33 @@ Rango de la ruta KEGG que se va a trazar. (Por defecto: 1)
`figsize`
Solo para Python. (ancho, alto) de la visualización en pulgadas. (Por defecto: (10,10))
-`ax`
+`ax`
Solo para Python. Ingresa un objeto de ejes matplotlib para personalizar la visualización. (Por defecto: None)
-
+
**Banderas**
-`-e` `--ensembl`
-Usa esta bandera si `genes` se ingresa como una lista de IDs tipo Ensembl.
+`-e` `--ensembl`
+Usa esta bandera si `genes` se ingresa como una lista de IDs tipo Ensembl.
`-e_b` `--ensembl_bkg`
Usa esta bandera si `background_list` se ingresa como una lista de IDs tipo Ensembl.
`-bkg` `--background`
-Use un conjunto de 20,625 genes 'background'
+Use un conjunto de 20,625 genes 'background'
listados [aquí](https://github.com/pachterlab/gget/blob/main/gget/constants/enrichr_bkg_genes.txt).
-
+
`-csv` `--csv`
-Solo para Terminal. Produce los resultados en formato CSV.
-Para Python, usa `json=True` produce los resultados en formato JSON.
+Solo para Terminal. Produce los resultados en formato CSV.
+Para Python, usa `json=True` para producir los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para Terminal. Impide la información de progreso de ser exhibida durante la ejecución del programa.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la ejecución del programa.
-
+
`plot`
Solo para Python. `plot=True` provee la visualización de los primeros 15 resultados (por defecto: False).
-
-
+
+
### Ejemplo
```bash
gget enrichr -db ontology ACE2 AGT AGTR1
@@ -107,10 +107,10 @@ gget.enrichr(
genes = [
"PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2", "P2RX7",
"LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1", "ANAPC16", "TMCC1",
- "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2",
+ "CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2", "PLBD2", "LARP7", "TECPR2",
"ZNF302", "CUX1", "MOB2", "CYTH2", "SEC22C", "EIF4E3", "ROBO2",
"ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7", "ATP5F1"
- ],
+ ],
database = "ChEA_2022",
background_list = [
"NSUN3","POLRMT","NLRX1","SFXN5","ZC3H12C","SLC25A39","ARSG",
@@ -125,11 +125,11 @@ gget.enrichr(
"ZFP787","ZFP655","RABEPK","ZFP650","4732466D17RIK","EXOSC4",
"WDR42A","GPHN","2610528J11RIK","1110003E01RIK","MDH1","1200014M14RIK",
"AW209491","MUT","1700123L14RIK","2610036D13RIK",
- "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2",
- "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1",
- "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2",
- "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2",
- "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7",
+ "PHF14", "RBM3", "MSL1", "PHF21A", "ARL10", "INSR", "JADE2",
+ "P2RX7", "LINC00662", "CCDC101", "PPM1B", "KANSL1L", "CRYZL1",
+ "ANAPC16", "TMCC1","CDH8", "RBM11", "CNPY2", "HSPA1L", "CUL2",
+ "PLBD2", "LARP7", "TECPR2", "ZNF302", "CUX1", "MOB2", "CYTH2",
+ "SEC22C", "EIF4E3", "ROBO2", "ADAMTS9-AS2", "CXXC1", "LINC01314", "ATF7",
"ATP5F1","COX15","TMEM30A","NSMCE4A","TM2D2","RHBDD3","ATXN2","NFS1",
"3110001I20RIK","BC038156","C330002I19RIK","ZFYVE20","POLI","TOMM70A",
"LOC100047782","2410012H22RIK","RILP","A230062G08RIK",
@@ -164,7 +164,7 @@ gget.enrichr(["ZBP1", "IRF3", "RIPK1"], database="pathway", kegg_out="kegg.png",
El siguiente ejemplo fue enviado por [Dylan Lawless](https://github.com/DylanLawless) a través de un [PR](https://github.com/pachterlab/gget/pull/54) (con ajustes de [Laura Luebbert](https://github.com/lauraluebbert)):
**Use `gget enrichr` en R y cree una visualización similar usando [ggplot](https://ggplot2.tidyverse.org/reference/ggplot.html).**
-TENGA EN CUENTA el cambio de ejes en comparación con la visualización en Python.
+TENGA EN CUENTA el cambio de ejes en comparación con la visualización en Python.
```r
system("pip install gget")
install.packages("reticulate")
@@ -221,16 +221,16 @@ df |>
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget enrichr` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
-- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128 ](https://doi.org/10.1186/1471-2105-14-128)
+- Chen EY, Tan CM, Kou Y, Duan Q, Wang Z, Meirelles GV, Clark NR, Ma'ayan A. Enrichr: interactive and collaborative HTML5 gene list enrichment analysis tool. BMC Bioinformatics. 2013; 128(14). [https://doi.org/10.1186/1471-2105-14-128](https://doi.org/10.1186/1471-2105-14-128)
-- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
+- Kuleshov MV, Jones MR, Rouillard AD, Fernandez NF, Duan Q, Wang Z, Koplev S, Jenkins SL, Jagodnik KM, Lachmann A, McDermott MG, Monteiro CD, Gundersen GW, Ma'ayan A. Enrichr: a comprehensive gene set enrichment analysis web server 2016 update. Nucleic Acids Research. 2016; gkw377. doi: [10.1093/nar/gkw377](https://doi.org/10.1093/nar/gkw377)
- Xie Z, Bailey A, Kuleshov MV, Clarke DJB., Evangelista JE, Jenkins SL, Lachmann A, Wojciechowicz ML, Kropiwnicki E, Jagodnik KM, Jeon M, & Ma’ayan A. Gene set knowledge discovery with Enrichr. Current Protocols, 1, e90. 2021. doi: [10.1002/cpz1.90](https://doi.org/10.1002/cpz1.90).
-
+
Si trabaja con conjuntos de datos no humanos/ratón, cite también:
- Kuleshov MV, Diaz JEL, Flamholz ZN, Keenan AB, Lachmann A, Wojciechowicz ML, Cagan RL, Ma'ayan A. modEnrichr: a suite of gene set enrichment analysis tools for model organisms. Nucleic Acids Res. 2019 Jul 2;47(W1):W183-W190. doi: [10.1093/nar/gkz347](https://doi.org/10.1093/nar/gkz347). PMID: 31069376; PMCID: PMC6602483.
diff --git a/docs/src/es/gpt.md b/docs/src/es/gpt.md
index 49e87693b..e9b9ba30e 100644
--- a/docs/src/es/gpt.md
+++ b/docs/src/es/gpt.md
@@ -27,33 +27,33 @@ Su clave API de OpenAI (str) ([obtenga su clave API](https://platform.openai.com
El nombre del algoritmo GPT que se usará para generar el texto (str). Por defecto: "gpt-3.5-turbo".
Consulte https://platform.openai.com/docs/models/gpt-4 para obtener más información sobre los modelos disponibles.
-`-temp` `--temperature`
+`-temp` `--temperature`
Valor entre 0 y 2 que controla el nivel de aleatoriedad y creatividad en el texto generado (float).
Los valores más altos resultan en un texto más creativo y variado. Por defecto: 1.
-`-tp` `--top_p`
+`-tp` `--top_p`
Controla la diversidad del texto generado como alternativa al muestreo con `--temperature` (float).
Los valores más altos resultan en un texto más diverso e inesperado. Por defecto: 1.
Tenga en cuenta que OpenAI recomienda modificar `--top_p` o el parámetro `--temperature`, pero no ambos.
-`-s` `--stop`
+`-s` `--stop`
Una secuencia de tokens para marcar el final del texto generado (str). Por defecto: None.
-`-mt` `--max_tokens`
+`-mt` `--max_tokens`
Controla la longitud máxima del texto generado, en tokens (int). Por defecto: 200.
-`-pp` `--presence_penalty`
+`-pp` `--presence_penalty`
Número entre -2.0 y 2.0. Los valores más altos aumentan la probabilidad de que el modelo hable sobre temas nuevos (float). Por defecto: 0.
-`-fp` `--frequency_penalty`
+`-fp` `--frequency_penalty`
Número entre -2.0 y 2.0. Los valores más altos reducen la probabilidad de que el modelo repita la misma línea palabra por palabra (float). Por defecto: 0.
-`-lb` `--logit_bias`
+`-lb` `--logit_bias`
Un diccionario que especifica un sesgo hacia ciertos tokens en el texto generado (dict). Por defecto: None.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.txt. Por defecto: salida estándar (STDOUT).
-
+
### Por ejemplo
```bash
gget gpt "Cómo estás hoy GPT?" su_clave_api
diff --git a/docs/src/es/info.md b/docs/src/es/info.md
index d2009c868..b2a90a62f 100644
--- a/docs/src/es/info.md
+++ b/docs/src/es/info.md
@@ -6,33 +6,33 @@ Obtenga información detallada sobre genes y transcripciones de [Ensembl](https:
Regresa: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
**Parámetro posicional**
-`ens_ids`
+`ens_ids`
Uno o más ID del tipo Ensembl.
-NOTA: Proporcionar una lista de más de 1000 ID de Ensembl a la vez puede provocar un error del servidor (para procesar más de 1000 ID, divida la lista de ID en fragmentos de 1000 ID y ejecútelos por separado).
+NOTA: Proporcionar una lista de más de 1000 ID de Ensembl a la vez puede provocar un error del servidor (para procesar más de 1000 ID, divida la lista de ID en fragmentos de 1000 ID y ejecútelos por separado).
**Parámetros opcionales**
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
**Banderas**
`-n` `--ncbi`
DESACTIVA los resultados de [NCBI](https://www.ncbi.nlm.nih.gov/).
-Para Python: `ncbi=False` evita la incluida de datos de NCBI (por defecto: True).
+Para Python: `ncbi=False` evita la inclusión de datos de NCBI (por defecto: True).
`-u` `--uniprot`
DESACTIVA los resultados de [UniProt](https://www.uniprot.org/).
-Para Python: `uniprot=False` evita la incluida de datos de UniProt (por defecto: True).
+Para Python: `uniprot=False` evita la inclusión de datos de UniProt (por defecto: True).
`-pdb` `--pdb`
INCLUYE [PDB](https://www.ebi.ac.uk/pdbe/) IDs en los resultados (podría aumentar el tiempo de ejecución).
-Para Python: `pdb=True` incluye IDs de PDB en los resultados (por defecto: False).
+Para Python: `pdb=True` incluye IDs de PDB en los resultados (por defecto: False).
`-csv` `--csv`
-Solo para la Terminal. Regresa los resultados en formato CSV.
+Solo para la Terminal. Regresa los resultados en formato CSV.
Para Python, usa `json=True` para regresar los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para la Terminal. Impide la información de progreso de ser exhibida durante la corrida.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la corrida.
@@ -54,16 +54,16 @@ gget.info(["ENSG00000034713", "ENSG00000104853", "ENSG00000170296"])
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|----|----|----|----|----|----|
| ENSG00000034713| P60520 | 11345 | GABARAPL2 | [ATG8, ATG8C, FLC3A, GABARAPL2, GATE-16, GATE16, GEF-2, GEF2] | Gamma-aminobutyric acid receptor-associated protein like 2 (GABA(A) receptor-associated protein-like 2)... | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | FUNCTION: Ubiquitin-like modifier involved in intra- Golgi traffic (By similarity). Modulates intra-Golgi transport through coupling between NSF activity and ... | Enables ubiquitin protein ligase binding activity. Involved in negative regulation of proteasomal protein catabolic process and protein... | protein_coding | ENST00000037243.7 |... |
| . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | . . . | ... |
-
+
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget info` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
- Martin FJ, Amode MR, Aneja A, Austine-Orimoloye O, Azov AG, Barnes I, Becker A, Bennett R, Berry A, Bhai J, Bhurji SK, Bignell A, Boddu S, Branco Lins PR, Brooks L, Ramaraju SB, Charkhchi M, Cockburn A, Da Rin Fiorretto L, Davidson C, Dodiya K, Donaldson S, El Houdaigui B, El Naboulsi T, Fatima R, Giron CG, Genez T, Ghattaoraya GS, Martinez JG, Guijarro C, Hardy M, Hollis Z, Hourlier T, Hunt T, Kay M, Kaykala V, Le T, Lemos D, Marques-Coelho D, Marugán JC, Merino GA, Mirabueno LP, Mushtaq A, Hossain SN, Ogeh DN, Sakthivel MP, Parker A, Perry M, Piližota I, Prosovetskaia I, Pérez-Silva JG, Salam AIA, Saraiva-Agostinho N, Schuilenburg H, Sheppard D, Sinha S, Sipos B, Stark W, Steed E, Sukumaran R, Sumathipala D, Suner MM, Surapaneni L, Sutinen K, Szpak M, Tricomi FF, Urbina-Gómez D, Veidenberg A, Walsh TA, Walts B, Wass E, Willhoft N, Allen J, Alvarez-Jarreta J, Chakiachvili M, Flint B, Giorgetti S, Haggerty L, Ilsley GR, Loveland JE, Moore B, Mudge JM, Tate J, Thybert D, Trevanion SJ, Winterbottom A, Frankish A, Hunt SE, Ruffier M, Cunningham F, Dyer S, Finn RD, Howe KL, Harrison PW, Yates AD, Flicek P. Ensembl 2023. Nucleic Acids Res. 2023 Jan 6;51(D1):D933-D941. doi: [10.1093/nar/gkac958](https://doi.org/10.1093/nar/gkac958). PMID: 36318249; PMCID: PMC9825606.
-
+
- Sayers EW, Beck J, Bolton EE, Brister JR, Chan J, Comeau DC, Connor R, DiCuccio M, Farrell CM, Feldgarden M, Fine AM, Funk K, Hatcher E, Hoeppner M, Kane M, Kannan S, Katz KS, Kelly C, Klimke W, Kim S, Kimchi A, Landrum M, Lathrop S, Lu Z, Malheiro A, Marchler-Bauer A, Murphy TD, Phan L, Prasad AB, Pujar S, Sawyer A, Schmieder E, Schneider VA, Schoch CL, Sharma S, Thibaud-Nissen F, Trawick BW, Venkatapathi T, Wang J, Pruitt KD, Sherry ST. Database resources of the National Center for Biotechnology Information. Nucleic Acids Res. 2024 Jan 5;52(D1):D33-D43. doi: [10.1093/nar/gkad1044](https://doi.org/10.1093/nar/gkad1044). PMID: 37994677; PMCID: PMC10767890.
-
+
- The UniProt Consortium, UniProt: the Universal Protein Knowledgebase in 2023, Nucleic Acids Research, Volume 51, Issue D1, 6 January 2023, Pages D523–D531, [https://doi.org/10.1093/nar/gkac1052](https://doi.org/10.1093/nar/gkac1052)
diff --git a/docs/src/es/installation.md b/docs/src/es/installation.md
index 26a9899a4..f2e94bf7b 100644
--- a/docs/src/es/installation.md
+++ b/docs/src/es/installation.md
@@ -74,4 +74,3 @@ pip install .
```
o elimina el ejecutable de tu `PATH` del sistema.
- Si sigues teniendo problemas, por favor [contáctanos](https://github.com/pachterlab/gget/issues).
-
diff --git a/docs/src/es/introduction.md b/docs/src/es/introduction.md
index 599fbd508..e032e7078 100644
--- a/docs/src/es/introduction.md
+++ b/docs/src/es/introduction.md
@@ -8,7 +8,7 @@
[
](https://raw.githubusercontent.com/pachterlab/gget/main/figures/gget_overview.png)
# ¡Bienvenidos!
-
+
`gget` es un programa gratuito de código fuente abierto de Terminal y Python que permite la consulta eficiente de bases de datos genómicas.
`gget` consiste en un conjunto de módulos separados pero interoperables, cada uno diseñado para facilitar un tipo de consulta de base de datos en una sola línea de código.
@@ -65,7 +65,7 @@ Estos son los módulos principales de `gget`. Haga clic en cualquier módulo par
-Si usa `gget` en una publicación, por favor [cite*](cite.md):
+Si usa `gget` en una publicación, por favor [cite*](cite.md):
```
Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. https://doi.org/10.1093/bioinformatics/btac836
```
diff --git a/docs/src/es/muscle.md b/docs/src/es/muscle.md
index b83e4b644..d3c33b304 100644
--- a/docs/src/es/muscle.md
+++ b/docs/src/es/muscle.md
@@ -6,11 +6,11 @@ Alinea múltiples secuencias de nucleótidos o aminoácidos usando el algoritmo
Regresa: Salida estándar (STDOUT) en formato ClustalW o archivo de tipo 'aligned FASTA' (.afa).
**Parámetro posicional**
-`fasta`
+`fasta`
Lista de secuencias o ruta al archivo FASTA o .txt que contiene las secuencias de nucleótidos o aminoácidos que se van a alinear.
**Parámetros opcionales**
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.afa. Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
@@ -19,11 +19,11 @@ Para Python, usa `save=True` para guardar los resultados en el directorio de tra
Alinea las secuencias usando el algoritmo [Super5](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) en lugar del algoritmo [Parallel Perturbed Probcons (PPP)](https://drive5.com/muscle5/Muscle5_SuppMat.pdf) para disminuir el tiempo y la memoria usada durante la corrida.
Use para ingresos grandes (unos cientos de secuencias).
-`-q` `--quiet`
+`-q` `--quiet`
Solo para la Terminal. Impide la información de progreso de ser exhibida durante la corrida.
Para Python, usa `verbose=False` para impedir la información de progreso de ser exhibida durante la corrida.
-
-
+
+
### Por ejemplo
```bash
gget muscle MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS MSSSSWLLLSLVEVTAAQSTIEQQAKTFLDKFHEAEDLFYQSLLAS
@@ -32,7 +32,7 @@ gget muscle MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS MSSSSWLLLSLVEVTAAQST
# Python
gget.muscle(["MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLAS", "MSSSSWLLLSLVEVTAAQSTIEQQAKTFLDKFHEAEDLFYQSLLAS"])
```
-
+
```bash
gget muscle fasta.fa
```
@@ -59,7 +59,7 @@ alv.view(msa)
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget muscle` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/mutate.md b/docs/src/es/mutate.md
index 8b2d58022..9bb0656f9 100644
--- a/docs/src/es/mutate.md
+++ b/docs/src/es/mutate.md
@@ -1,6 +1,6 @@
[ Ver el código fuente de la página en GitHub ](https://github.com/pachterlab/gget/blob/main/docs/src/es/mutate.md)
-> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
+> Parámetros de Python son iguales a los parámetros largos (`--parámetro`) de Terminal, si no se especifica de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manual para cualquier módulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget mutate 🧟
Recibe secuencias de nucleótidos y mutaciones (en [anotación de mutación estándar](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1867422/)) y devuelve versiones mutadas de las secuencias según las mutaciones proporcionadas.
Resultado: Guarda las secuencias mutadas en formato FASTA (o devuelve una lista que contiene las secuencias mutadas si `out=None`).
@@ -71,53 +71,53 @@ Si k > longitud total de la secuencia, se mantendrá toda la secuencia.
`-msl` `--min_seq_len`
Longitud mínima de la secuencia de salida mutante, por ejemplo, 100. Las secuencias mutantes más pequeñas que esto serán descartadas. Predeterminado: Ninguno
-`-ma` `--max_ambiguous`
+`-ma` `--max_ambiguous`
Número máximo de caracteres 'N' (o 'n') permitidos en la secuencia de salida, por ejemplo, 10. Predeterminado: Ninguno (no se aplicará filtro de caracteres ambiguos)
**Banderas opcionales para la generación/filtrado de secuencias mutantes**
`-ofr` `--optimize_flanking_regions`
-Elimina nucleótidos de cualquiera de los extremos de la secuencia mutante para asegurar (cuando sea posible) que la secuencia mutante no contenga ningún k-mer que también se encuentre en la secuencia de tipo salvaje/entrada.
+Elimina nucleótidos de cualquiera de los extremos de la secuencia mutante para asegurar (cuando sea posible) que la secuencia mutante no contenga ningún k-mer que también se encuentre en la secuencia de tipo salvaje/entrada.
`-rswk` `--remove_seqs_with_wt_kmers`
Elimina las secuencias de salida donde al menos un k-mer también está presente en la secuencia de tipo salvaje/entrada en la misma región.
Cuando se utiliza con `--optimize_flanking_regions`, solo se eliminarán las secuencias para las cuales un k-mer de tipo salvaje aún está presente después de la optimización.
-`-mio` `--merge_identical_off`
+`-mio` `--merge_identical_off`
No fusionar secuencias mutantes idénticas en la salida (por defecto, las secuencias idénticas se fusionarán concatenando los encabezados de secuencia para todas las secuencias idénticas).
-**Argumentos opcionales para generar salida adicional**
+**Argumentos opcionales para generar salida adicional**
Esta salida se activa utilizando la bandera `--update_df` y se almacenará en una copia del DataFrame `mutations`.
-`-udf_o` `--update_df_out`
+`-udf_o` `--update_df_out`
Ruta al archivo csv de salida que contiene el DataFrame actualizado, por ejemplo, 'path/to/mutations_updated.csv'. Solo válido cuando se usa con `--update_df`.
Predeterminado: Ninguno -> el nuevo archivo csv se guardará en el mismo directorio que el DataFrame `mutations` con el apéndice '_updated'
-`-ts` `--translate_start`
+`-ts` `--translate_start`
(int o str) La posición en la secuencia de nucleótidos de entrada para comenzar a traducir, por ejemplo, 5. Si se proporciona una cadena, debe corresponder a un nombre de columna en `mutations` que contenga las posiciones de inicio del marco de lectura abierto para cada secuencia/mutación. Solo válido cuando se usa con `--translate`.
Predeterminado: traduce desde el principio de cada secuencia
-`-te` `--translate_end`
+`-te` `--translate_end`
(int o str) La posición en la secuencia de nucleótidos de entrada para finalizar la traducción, por ejemplo, 35. Si se proporciona una cadena, debe corresponder a un nombre de columna en `mutations` que contenga las posiciones de fin del marco de lectura abierto para cada secuencia/mutación. Solo válido cuando se usa con `--translate`.
Predeterminado: traduce hasta el final de cada secuencia
**Banderas opcionales para modificar salida adicional**
-`-udf` `--update_df`
+`-udf` `--update_df`
Actualiza el DataFrame de entrada `mutations` para incluir columnas adicionales con el tipo de mutación, la secuencia de nucleótidos de tipo salvaje y la secuencia de nucleótidos mutante (solo válido si `mutations` es un archivo .csv o .tsv).
-`-sfs` `--store_full_sequences`
-Incluye las secuencias completas de tipo salvaje y mutantes en el DataFrame actualizado `mutations` (no solo la sub-secuencia con flancos de longitud k). Solo válido cuando se usa con `--update_df`.
+`-sfs` `--store_full_sequences`
+Incluye las secuencias completas de tipo salvaje y mutantes en el DataFrame actualizado `mutations` (no solo la sub-secuencia con flancos de longitud k). Solo válido cuando se usa con `--update_df`.
+
+`-tr` `--translate`
+Agrega columnas adicionales al DataFrame actualizado `mutations` que contienen las secuencias de aminoácidos de tipo salvaje y mutantes. Solo válido cuando se usa con `--store_full_sequences`.
-`-tr` `--translate`
-Agrega columnas adicionales al DataFrame actualizado `mutations` que contienen las secuencias de aminoácidos de tipo salvaje y mutantes. Solo válido cuando se usa con `--store_full_sequences`.
-
**Argumentos generales opcionales**
-`-o` `--out`
+`-o` `--out`
Ruta al archivo FASTA de salida que contiene las secuencias mutadas, por ejemplo, 'path/to/output_fasta.fa'.
-Predeterminado: Ninguno -> devuelve una lista de las secuencias mutadas a la salida estándar.
-Los identificadores (que siguen al '>') de las secuencias mutadas en el FASTA de salida serán '>[seq_ID]_[mut_ID]'.
+Predeterminado: Ninguno -> devuelve una lista de las secuencias mutadas a la salida estándar.
+Los identificadores (que siguen al '>') de las secuencias mutadas en el FASTA de salida serán '>[seq_ID]_[mut_ID]'.
**Banderas generales opcionales**
-`-q` `--quiet`
+`-q` `--quiet`
Solo en línea de comandos. Previene que se muestre información de progreso.
Python: Usa `verbose=False` para prevenir que se muestre información de progreso.
@@ -221,7 +221,7 @@ gget.mutate(
| 1 | g.224411A>C | ENST00000193812 | 0 | 100 |
| 8 | g.25111del | ENST00000174411 | 0 | 294 |
| X | g.1011_1012insAA | ENST00000421914 | 9 | 1211 |
-```
+```
→ Guarda el archivo 'mut_fasta.fa' que contiene:
```
>1:g.224411A>C
@@ -230,7 +230,7 @@ TGCTCTGCT
GAGTCGAT
>X:g.1011_1012insAA
TTAGAACTT
-```
+```
→ Guarda el archivo 'mutations_updated.csv' que contiene:
```
@@ -242,8 +242,7 @@ TTAGAACTT
```
-# Citar
+# Citar
Si utiliza `gget mutate` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
-
diff --git a/docs/src/es/opentargets.md b/docs/src/es/opentargets.md
index 799b6c400..0c72d3066 100644
--- a/docs/src/es/opentargets.md
+++ b/docs/src/es/opentargets.md
@@ -12,8 +12,8 @@ Este módulo fue escrito por [Sam Wagenaar](https://github.com/techno-sam).
ID de gen Ensembl, por ejemplo, ENSG00000169194.
**Argumentos opcionales**
-`-r` `--resource`
-Define el tipo de información a devolver en la salida. Predeterminado: 'diseases' (enfermedades).
+`-r` `--resource`
+Define el tipo de información a devolver en la salida. Predeterminado: 'diseases' (enfermedades).
Los recursos posibles son:
| Recurso | Valor devuelto | Filtros válidos | Fuentes |
@@ -27,24 +27,24 @@ Los recursos posibles son:
| `interactions` | Interacciones proteína⇄proteína | `protein_a_id`
`protein_b_id`
`gene_b_id` | - [Open Targets](https://platform-docs.opentargets.org/target/molecular-interactions)
- [IntAct](https://platform-docs.opentargets.org/target/molecular-interactions#intact)
- [Signor](https://platform-docs.opentargets.org/target/molecular-interactions#signor)
- [Reactome](https://platform-docs.opentargets.org/target/molecular-interactions#reactome)
- [String](https://platform-docs.opentargets.org/target/molecular-interactions#string)
|
`-l` `--limit`
-Limitar el número de resultados, por ejemplo, 10. Predeterminado: Sin límite.
+Limitar el número de resultados, por ejemplo, 10. Predeterminado: Sin límite.
Nota: No es compatible con los recursos `tractability` y `depmap`.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo JSON donde se guardarán los resultados, por ejemplo, path/to/directory/results.json. Predeterminado: Salida estándar.
Python: `save=True` guardará la salida en el directorio de trabajo actual.
`--filters`
Filtrar resultados por igualdad exacta usando nombres de columnas de OpenTargets devueltos. Pase múltiples filtros repitiendo la bandera, p. ej. '--filter disease.id=EFO_0000274 --filter drug.id=CHEMBL1743081'. Los campos anidados usan notación de punto, coincidiendo con los nombres de columna devueltos por la API.
-**Banderas**
+**Banderas**
`-csv` `--csv`
Solo en línea de comandos. Devuelve la salida en formato CSV, en lugar de formato JSON.
Python: Use `json=True` para devolver la salida en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo en línea de comandos. Evita que se muestre la información de progreso.
-Python: Use `verbose=False` para evitar que se muestre la información de progreso.
+Python: Use `verbose=False` para evitar que se muestre la información de progreso.
`-or` `--or`
Solo en línea de comandos. Los filtros se combinan con lógica OR. Predeterminado: lógica AND.
@@ -54,7 +54,7 @@ Solo para Python. `wrap_text=True` muestra el marco de datos con texto ajustado
### Ejemplos
-**Obtenga enfermedades asociadas a un gen específico:**
+**Obtenga enfermedades asociadas a un gen específico:**
```bash
gget opentargets ENSG00000169194 -r diseases -l 1
```
@@ -71,7 +71,7 @@ gget.opentargets('ENSG00000169194', resource='diseases', limit=1)
-**Obtener medicamentos asociados para un gen específico:**
+**Obtener medicamentos asociados para un gen específico:**
```bash
gget opentargets ENSG00000169194 -r drugs -l 2
```
@@ -92,7 +92,7 @@ gget.opentargets('ENSG00000169194', resource='drugs', limit=2)
-**Obtenga datos de trazabilidad para un gen específico:**
+**Obtenga datos de trazabilidad para un gen específico:**
```bash
gget opentargets ENSG00000169194 -r tractability
```
@@ -235,10 +235,10 @@ gget.opentargets(
| 0.400 | 1 | intact | P35225 | ENSG00000169194 | IL13 | unspecified role | 9606 | Q86XT9 | ENSG00000149932 | TMEM219 | stimulator | 9606 |
-
+
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget opentargets` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/pdb.md b/docs/src/es/pdb.md
index 3469577a6..459d79f42 100644
--- a/docs/src/es/pdb.md
+++ b/docs/src/es/pdb.md
@@ -3,7 +3,7 @@
> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget pdb 🔮
Obtenga la estructura o los metadatos de una proteína usando data de [RCSB Protein Data Bank (PDB)](https://www.rcsb.org/).
-Regresa: El archivo 'pdb' se regresa en formato PDB. Todos los demás datos se regresan en formato JSON.
+Regresa: El archivo 'pdb' se regresa en formato PDB. Todos los demás datos se regresan en formato JSON.
**Parámetro posicional**
`pdb_id`
@@ -12,7 +12,7 @@ ID del tipo PDB, p. ej. '7S7U'.
**Parámetros optionales**
`-r` `--resource`
Define el tipo de información a regresar. Uno de los siguientes:
- 'pdb': Regresa la estructura de la proteína en formato PDB (regresa por defecto).
+ 'pdb': Regresa la estructura de la proteína en formato PDB (regresa por defecto).
'entry': Regresa información sobre las estructuras PDB en el nivel superior de la organización de datos PDB jerárquicos.
'pubmed': Regresa anotaciones de PubMed (datos integrados de PubMed) para la cita principal de un ID PDB.
'assembly': Regresa información sobre estructuras PDB en el nivel de estructura cuaternaria.
@@ -22,15 +22,15 @@ Define el tipo de información a regresar. Uno de los siguientes:
'uniprot': Regresa anotaciones UniProt para una entidad macromolecular (defina el ID de la entidad como `identifier`).
'branched_entity_instance': Regresa la descripción de instancia de entidad ramificada (defina el ID de cadena como `identifier`).
'polymer_entity_instance': Regresa datos de instancia de entidad polimérica (también conocida como cadena) (defina el ID de cadena como `identifier`).
- 'nonpolymer_entity_instance': Regresa datos de instancia de entidad no polimérica (defina el ID de cadena como `identifier`).
-
+ 'nonpolymer_entity_instance': Regresa datos de instancia de entidad no polimérica (defina el ID de cadena como `identifier`).
+
`-i` `--identifier`
Este parámetro se puede utilizar para definir el ID de ensamblaje, entidad o cadena (po defecto: None). Los IDs de ensamblaje/entidad son números (p. ej., 1) y los IDs de cadena son letras (p. ej., 'A').
-
-`-o` `--out`
+
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/7S7U.pdb (o 7S7U_entry.json). Por defecto: salida estándar (STDOUT).
-Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
-
+Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
+
### Por ejemplo
```bash
gget pdb 7S7U -o 7S7U.pdb
@@ -77,10 +77,10 @@ gget.pdb("7DQA", save=True)
gget.pdb("7CT5", save=True)
```
→ Este caso de uso ejemplifica cómo encontrar archivos PDB para un análisis comparativo de la estructura de las proteínas asociado con IDs de Ensembl o secuencias de aminoácidos. Los archivos PDB obtenidos también se pueden comparar con las estructuras predichas generadas por [`gget alphafold`](alphafold.md). Los archivos PDB se pueden ver de forma interactiva en 3D [aquí](https://rcsb.org/3d-view), o usando programas como [PyMOL](https://pymol.org/) o [Blender](https://www.blender.org/). Múltiple archivos PDB se pueden visualizar para comparación [aquí](https://rcsb.org/alignment).
-
+
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget pdb` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/ref.md b/docs/src/es/ref.md
index 9b8744a59..f25a2edd1 100644
--- a/docs/src/es/ref.md
+++ b/docs/src/es/ref.md
@@ -8,44 +8,44 @@ Regresa: Resultados en formato JSON.
**Parámetro posicional**
`species`
La especie por la cual que se buscará los FTP en el formato género_especies, p. ej. homo_sapiens.
-Nota: No se requiere cuando se llama a la bandera `--list_species`.
+Nota: No se requiere cuando se llama a la bandera `--list_species`.
Accesos directos: 'human', 'mouse', 'human_grch37' (accede al ensamblaje del genoma GRCh37)
**Parámetros optionales**
`-w` `--which`
-Define qué resultados devolver. Por defecto: 'all' -> Regresa todos los resultados disponibles.
-Las entradas posibles son uno solo o una combinación de las siguientes (como lista separada por comas):
+Define qué resultados devolver. Por defecto: 'all' -> Regresa todos los resultados disponibles.
+Las entradas posibles son una sola o una combinación de las siguientes (como lista separada por comas):
'gtf' - Regresa la anotación (GTF).
'cdna' - Regresa el transcriptoma (cDNA).
'dna' - Regresa el genoma (DNA).
'cds' - Regresa las secuencias codificantes correspondientes a los genes Ensembl. (No contiene UTR ni secuencia intrónica).
-'cdrna' - Regresa secuencias de transcripción correspondientes a genes de ARN no codificantes (ncRNA).
-'pep' - Regresa las traducciones de proteínas de los genes Ensembl.
+'cdrna' - Regresa secuencias de transcripción correspondientes a genes de ARN no codificantes (ncRNA).
+'pep' - Regresa las traducciones de proteínas de los genes Ensembl.
`-r` `--release`
Define el número de versión de Ensembl desde el que se obtienen los archivos, p. ej. 104. Default: latest Ensembl release.
-`-od` `--out_dir`
+`-od` `--out_dir`
Ruta al directorio donde se guardarán los archivos FTP, p. ruta/al/directorio/. Por defecto: directorio de trabajo actual.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.json. Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
**Banderas**
-`-l` `--list_species`
+`-l` `--list_species`
Enumera todas las especies disponibles. (Para Python: combina con `species=None`.)
-`-ftp` `--ftp`
+`-ftp` `--ftp`
Regresa solo los enlaces FTP solicitados.
-`-d` `--download`
+`-d` `--download`
Solo para Terminal. Descarga los FTP solicitados al directorio actual (requiere [curl](https://curl.se/docs/) para ser instalado).
-`-q` `--quiet`
+`-q` `--quiet`
Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida.
Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida.
-
+
### Por ejemplo
**Use `gget ref` en combinación con [kallisto | bustools](https://www.kallistobus.tools/kb_usage/kb_ref/) para construir un índice de referencia:**
```bash
@@ -67,8 +67,8 @@ gget.ref(species=None, list_species=True, release=103)
(Si no se especifica ninguna versión, `gget ref` siempre devolverá información de la última versión de Ensembl).
-
-**Obtenga la referencia del genoma para una especie específica:**
+
+**Obtenga la referencia del genoma para una especie específica:**
```bash
gget ref -w gtf,dna homo_sapiens
```
@@ -100,7 +100,7 @@ gget.ref("homo_sapiens", which=["gtf", "dna"])
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget ref` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/search.md b/docs/src/es/search.md
index a41ee730a..8f824de67 100644
--- a/docs/src/es/search.md
+++ b/docs/src/es/search.md
@@ -2,36 +2,36 @@
> Parámetros de Python són iguales a los parámetros largos (`--parámetro`) de Terminal, si no especificado de otra manera. Banderas son parámetros de verdadero o falso (True/False) en Python. El manuál para cualquier modulo de gget se puede llamar desde la Terminal con la bandera `-h` `--help`.
# gget search 🔎
-Obtenga genes y transcripciones de [Ensembl](https://www.ensembl.org/) usando términos de búsqueda de forma libre.
-Los resultados se comparan según las secciones "nombre del gen" y "descripción" en la base de datos de Ensembl. `gget` versión >= 0.27.9 también incluye resultados que coinciden con la sección "sinónimo" de Ensembl.
+Obtenga genes y transcripciones de [Ensembl](https://www.ensembl.org/) usando términos de búsqueda de forma libre.
+Los resultados se comparan según las secciones "nombre del gen" y "descripción" en la base de datos de Ensembl. `gget` versión >= 0.27.9 también incluye resultados que coinciden con la sección "sinónimo" de Ensembl.
Regresa: Resultados en formato JSON (Terminal) o Dataframe/CSV (Python).
**Parámetro posicional**
-`searchwords`
+`searchwords`
Una o más palabras de búsqueda de forma libre, p. ej. gaba nmda. (Nota: la búsqueda no distingue entre mayúsculas y minúsculas).
-**Otros parámetros requeridos**
+**Otros parámetros requeridos**
`-s` `--species`
-Especies o base de datos a buscar.
+Especies o base de datos a buscar.
Una especie se puede pasar en el formato 'género_especie', p. ej. 'homo_sapiens' o 'arabidopsis_thaliana'.
Para pasar una base de datos específica, pase el nombre de la base de datos CORE, p. ej. 'mus_musculus_dba2j_core_105_1'.
-
+
Todas las bases de datos disponibles para cada versión de Ensembl se pueden encontrar aquí:
Vertebrados: [http://ftp.ensembl.org/pub/current/mysql/](http://ftp.ensembl.org/pub/current/mysql/)
Invertebrados: [http://ftp.ensemblgenomes.org/pub/current/](http://ftp.ensemblgenomes.org/pub/current/) + selecciona reino animal + selecciona mysql/
-
+
Accesos directos: 'human', 'mouse'
**Parámetros optionales**
-`-r` `--release`
+`-r` `--release`
Define el número de versión de Ensembl desde el que se obtienen los archivos, p. ej. 104. Por defecto: None -> se usa la última versión de Ensembl.
-
-Nota: *No se aplica a las especies invertebrados* (en su lugar, puede pasar una base de datos de una especies específica (incluyen un número de versión) al argumento `species`). Para especies de invertebrados, Ensembl solo almacena bases de datos de 10 versiones anteriores a la versión actual.
-
+
+Nota: *No se aplica a las especies de invertebrados* (en su lugar, puede pasar una base de datos de una especie específica (que incluya un número de versión) al argumento `species`). Para especies de invertebrados, Ensembl solo almacena bases de datos de 10 versiones anteriores a la versión actual.
+
Este argumento se sobrescribe si se pasa una base de datos específica (que incluye un número de publicación) al argumento `species`.
`-t` `--id_type`
-'gene' (esto se use por defecto) o 'transcript'
+'gene' (esto se usa por defecto) o 'transcript'
Regesa genes o transcripciones, respectivamente.
`-ao` `--andor`
@@ -39,26 +39,26 @@ Regesa genes o transcripciones, respectivamente.
'or' ('o'): Regresa todos los genes que INCLUYEN AL MENOS UNA de las palabras de búsqueda en su nombre/descripción.
'and' ('y'): Regresa solo los genes que INCLUYEN TODAS las palabras de búsqueda en su nombre/descripción.
-`-l` `--limit`
+`-l` `--limit`
Limita el número de resultados de búsqueda, p. ej. 10. Por defecto: None.
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ej. ruta/al/directorio/resultados.csv (o .json). Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
**Banderas**
`-csv` `--csv`
-Solo para la Terminal. Regresa los resultados en formato CSV.
+Solo para la Terminal. Regresa los resultados en formato CSV.
Para Python, usa `json=True` para regresar los resultados en formato JSON.
-`-q` `--quiet`
+`-q` `--quiet`
Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida.
Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida.
`wrap_text`
-Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False).
+Solo para Python. `wrap_text=True` muestra los resultados con texto envuelto para facilitar la lectura (por defecto: False).
+
-
### Por ejemplo
```bash
gget search -s human gaba gamma-aminobutyric
@@ -73,10 +73,10 @@ gget.search(["gaba", "gamma-aminobutyric"], "homo_sapiens")
| -------------- |-------------------------| ------------------------| -------------- | ----------|-----|
| ENSG00000034713| GABARAPL2 | GABA type A receptor associated protein like 2 [Source:HGNC Symbol;Acc:HGNC:13291] | GABA type A receptor associated protein like 2 | protein_coding | https://uswest.ensembl.org/homo_sapiens/Gene/Summary?g=ENSG00000034713 |
| . . . | . . . | . . . | . . . | . . . | . . . |
-
+
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget search` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/seq.md b/docs/src/es/seq.md
index d8495123f..8b4badc93 100644
--- a/docs/src/es/seq.md
+++ b/docs/src/es/seq.md
@@ -6,11 +6,11 @@ Obtenga la(s) secuencia(s) nucleótidos o aminoácidos de un gen (y todas sus is
Regresa: Archivo de tipo FASTA.
**Parámetro posicional**
-`ens_ids`
+`ens_ids`
One or more Ensembl IDs.
**Parámetros optionales**
-`-o` `--out`
+`-o` `--out`
Ruta al archivo en el que se guardarán los resultados, p. ruta/al/directorio/resultados.fa. Por defecto: salida estándar (STDOUT).
Para Python, usa `save=True` para guardar los resultados en el directorio de trabajo actual.
@@ -20,11 +20,11 @@ Regresa secuencias de aminoácidos (en lugar de nucleótidos).
Las secuencias de nucleótidos se obtienen de [Ensembl](https://www.ensembl.org/).
Las secuencias de aminoácidos se obtienen de [UniProt](https://www.uniprot.org/).
-`-iso` `--isoforms`
+`-iso` `--isoforms`
Regresa las secuencias de todas las transcripciones conocidas.
(Solo para IDs de genes).
-`-q` `--quiet`
+`-q` `--quiet`
Solo para la Terminal. Impide la informacion de progreso de ser exhibida durante la corrida.
Para Python, usa `verbose=False` para imipidir la informacion de progreso de ser exhibida durante la corrida.
@@ -52,7 +52,7 @@ gget.seq("ENSG00000034713", translate=True, isoforms=True)
#### [Más ejemplos](https://github.com/pachterlab/gget_examples)
-# Citar
+# Citar
Si utiliza `gget seq` en una publicación, favor de citar los siguientes artículos:
- Luebbert, L., & Pachter, L. (2023). Efficient querying of genomic reference databases with gget. Bioinformatics. [https://doi.org/10.1093/bioinformatics/btac836](https://doi.org/10.1093/bioinformatics/btac836)
diff --git a/docs/src/es/setup.md b/docs/src/es/setup.md
index 954308947..bbe52f131 100644
--- a/docs/src/es/setup.md
+++ b/docs/src/es/setup.md
@@ -8,7 +8,7 @@ Función para instalar/descargar dependencias de terceros para un módulo de gge
> **Nota:** Algunas dependencias (por ejemplo, `cellxgene-census`) pueden no ser compatibles con las versiones más recientes de Python. Si encuentras errores durante la instalación, intenta usar un entorno con una versión anterior de Python.
**Parámetro posicional**
-`module`
+`module`
Módulo gget para el que se deben instalar las dependencias.
### Por ejemplo
@@ -20,4 +20,3 @@ gget setup alphafold
gget.setup("alphafold")
```
→ Instala todas las dependencias de terceros (modificadas) y descarga los parámetros del algoritmo (~4 GB) necesarios para ejecutar [`gget alphafold`](alphafold.md).
-
diff --git a/docs/src/es/updates.md b/docs/src/es/updates.md
index 9f044b2ef..c9559b7c5 100644
--- a/docs/src/es/updates.md
+++ b/docs/src/es/updates.md
@@ -64,7 +64,7 @@
- [`gget pdb`](pdb.md): Agregado el sitio web de `wwpdb`, retrocede a `rcsb` si las solicitudes fallan.
- [`gget cellxgene`](cellxgene.md): Mejora el manejo de argumentos; el frontend no cambia.
- [`gget setup`](setup.md)/[`gget alphafold`](alphafold.md): Corrige el error pip_cmd en `gget.setup("alphafold")`.
-
+
**Versión ≥ 0.29.2** (03 de julio de 2025):
- Ahora se puede instalar `gget` usando `uv pip install gget`
- Toda la metadata del paquete (versión, autor, descripción, etc.) ahora se gestiona en `setup.cfg` para una compatibilidad total con herramientas modernas como `uv`, `pip` y PyPI
@@ -89,7 +89,7 @@
- Se permite la consulta de múltiples genes a la vez.
- [`gget diamond`](diamond.md):
- Ahora soporta alineamiento traducido de secuencias nucleotídicas contra secuencias de referencia de aminoácidos usando la opción `--translated`.
-- [`gget elm`](elm.md):
+- [`gget elm`](elm.md):
- Mejorado el manejo de errores del servidor.
**Versión ≥ 0.29.0** (25 de septiembre de 2024):
@@ -110,14 +110,14 @@
- Pruebas unitarias reorganizadas para aumentar la velocidad y disminuir el código
- Requisitos actualizados para [permitir versiones más nuevas de mysql-connector](https://github.com/pachterlab/gget/pull/159)
- [Soporte para Numpy>= 2.0](https://github.com/pachterlab/gget/issues/157)
-
+
**Versión ≥ 0.28.6 (2 de junio de 2024):**
- **Nuevo módulo: [`gget mutate`](./mutate.md)**
- [`gget cosmic`](./cosmic.md): Ahora puedes descargar bases de datos completas de COSMIC utilizando el argumento `download_cosmic`
- [`gget ref`](./ref.md): Ahora puede obtener la ensambladura del genoma GRCh27 usando `species='human_grch37'`
- [`gget search`](./search.md): Ajusta el acceso a los datos humanos a la estructura de la versión 112 de Ensembl (corrige [issue 129](https://github.com/pachterlab/gget/issues/129))
-~~**Version ≥ 0.28.5** (May 29, 2024):~~
+~~**Version ≥ 0.28.5** (May 29, 2024):~~
- Retirado debido a un error con 'logging' en `gget.setup("alphafold")` + mutaciones de inversión en `gget mutate` solo invierten la cadena en lugar de también calcular la hebra complementaria
**Versión ≥ 0.28.4** (31 de enero de 2024):
@@ -141,13 +141,13 @@
- [`gget ref`](./ref.md):
- Cambios de back-end para aumentar la velocidad.
- Nuevo argumento: `list_iv_species` para enumerar todas las especies de invertebrados disponibles (se puede combinar con el argumento `release` para obtener todas las especies disponibles de una liberación específica de Ensembl)
-
+
**Versión ≥ 0.28.2** (15 de noviembre de 2023):
- [`gget info`](./info.md): devuelve un mensaje de error cuando el servidor NCBI falla por un motivo distinto a un error de recuperación (esto es un error en el lado del servidor en lugar de un error con `gget`)
- Reemplace el argumento obsoleto 'texto' para los métodos de tipo find() siempre que se usen con la dependencia `BeautifulSoup`
- [`gget elm`](elm.md): Elimina instancias de falsos positivos y verdaderos negativos de los resultados devueltos.
- [`gget elm`](elm.md): agrega el argumento `expand`
-
+
**Versión ≥ 0.28.0** (5 de noviembre de 2023):
- Documentación actualizada de [`gget muscle`](./muscle.md) para agregar un tutorial sobre cómo visualizar secuencias con diferentes longitudes de nombres de secuencia + ligero cambio en la visualización devuelta para que sea un poco más sólida ante diferentes nombres de secuencia
- [`gget muscle`](./muscle.md) ahora también permite una lista de secuencias como entrada (como alternativa a proporcionar la ruta a un archivo FASTA)
@@ -155,11 +155,11 @@
- [`gget seq`](./seq.md): permite nombres de genes faltantes (correccione [https://github.com/pachterlab/gget/issues/107](https://github.com/pachterlab/gget /números/107))
- Nuevos argumentos para [`gget enrichr`](enrichr.md): use el argumento `kegg_out` y `kegg_rank` para crear una imagen de la vía KEGG con los genes del análisis de enriquecimiento resaltados (gracias a [este PR](https ://github.com/pachterlab/gget/pull/106) por [Noriaki Sato](https://github.com/noriakis))
- Nuevos módulos: [`gget elm`](elm.md) y [`gget Diamond`](diamond.md)
-
+
**Versión ≥ 0.27.9** (7 de agosto de 2023):
- Nuevos argumentos para [`gget enrichr`](enrichr.md): use el argumento `background_list` para proporcionar una lista de genes 'background'
- [`gget search`](search.md) ahora también busca sinónimos [Ensembl](https://ensembl.org/) (además de nombres y descripciones de genes) para obtener resultados de búsqueda más completos (gracias a [Samuel Klein](https://github.com/KleinSamuel) por la [sugerencia](https://github.com/pachterlab/gget/issu90))
-
+
**Versión ≥ 0.27.8** (12 de julio de 2023):
- Nuevo argumento para [`gget search`](search.md): especifique la versión de Ensembl desde la cual se obtiene la información con `-r` `--release`
- Se corrigió un [error](https://github.com/pachterlab/gget/issu91) en [`gget pdb`](pdb.md) (este error se introdujo en la versión 0.27.5)
@@ -179,7 +179,7 @@
- Todos los módulos gget ahora tienen una bandera `-q / --quiet` (para Python: `verbose=False`) para desactivar la información de progreso
**Versión ≥ 0.27.4** (19 de marzo de 2023):
-- Nuevo módulo: [`gget gpt`](gpt.md)
+- Nuevo módulo: [`gget gpt`](gpt.md)
**Versión ≥ 0.27.3** (11 de marzo de 2023):
- [`gget info`](info.md) excluye los ID de PDB de forma predeterminada para aumentar la velocidad (los resultados de PDB se pueden incluir usando la marca `--pdb` / `pdb=True`).
diff --git a/docs/src/es/virus.md b/docs/src/es/virus.md
index bf18e444e..c4caafb44 100644
--- a/docs/src/es/virus.md
+++ b/docs/src/es/virus.md
@@ -27,7 +27,7 @@ Para descargas en caché de SARS-CoV-2 y Alphainfluenza, se admite:
Use la opción `--download_all_accessions` para aplicar filtros sin buscar un virus específico.
-**Argumentos opcionales**
+**Argumentos opcionales**
_Filtros de hospedador_
@@ -279,8 +279,8 @@ gget virus "SARS-CoV-2" --host human --nuc_completeness complete --min_seq_lengt
import gget
gget.virus(
- "SARS-CoV-2",
- host="human",
+ "SARS-CoV-2",
+ host="human",
nuc_completeness="complete",
min_seq_length=29000,
genbank_metadata=True,
@@ -303,8 +303,8 @@ gget virus "Influenza A virus" --host human --nuc_completeness complete --max_se
import gget
gget.virus(
- "Influenza A virus",
- host="human",
+ "Influenza A virus",
+ host="human",
nuc_completeness="complete",
max_seq_length=15000,
genbank_metadata=True,
diff --git a/gget/__init__.py b/gget/__init__.py
index 3c65663f0..506788c94 100644
--- a/gget/__init__.py
+++ b/gget/__init__.py
@@ -1,44 +1,39 @@
-from .gget_ref import ref
-from .gget_search import search
-from .gget_info import info
-from .gget_seq import seq
-from .gget_muscle import muscle
+"""gget: efficient querying of genomic databases."""
+
+import logging
+from importlib.metadata import PackageNotFoundError, version
+
+from .gget_8cube import gene_expression, psi_block, specificity
+from .gget_alphafold import alphafold
+from .gget_archs4 import archs4
+from .gget_bgee import bgee
from .gget_blast import blast
from .gget_blat import blat
-from .gget_enrichr import enrichr
-from .gget_archs4 import archs4
-from .gget_alphafold import alphafold
-from .gget_setup import setup
-from .gget_pdb import pdb
-from .gget_gpt import gpt
+from .gget_cbio import cbio_plot, cbio_search
from .gget_cellxgene import cellxgene
-from .gget_elm import elm
-from .gget_diamond import diamond
from .gget_cosmic import cosmic
+from .gget_diamond import diamond
+from .gget_elm import elm
+from .gget_enrichr import enrichr
+from .gget_gpt import gpt
+from .gget_info import info
+from .gget_muscle import muscle
from .gget_mutate import mutate
from .gget_opentargets import opentargets
-from .gget_cbio import cbio_plot, cbio_search
-from .gget_bgee import bgee
-from .gget_8cube import specificity, psi_block, gene_expression
+from .gget_pdb import pdb
+from .gget_ref import ref
+from .gget_search import search
+from .gget_seq import seq
+from .gget_setup import setup
from .gget_virus import virus
-import logging
-
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)
-
-# Get version number from the config file
-try:
- from importlib.metadata import version, PackageNotFoundError
-except ImportError:
- from importlib_metadata import version, PackageNotFoundError # For Python <3.8
-
try:
__version__ = version("gget")
except PackageNotFoundError:
__version__ = "unknown"
-
__author__ = "Laura Luebbert"
__email__ = "lauralubbert@gmail.com"
diff --git a/gget/compile.py b/gget/compile.py
index 4cd028c98..bfd8f6c4c 100644
--- a/gget/compile.py
+++ b/gget/compile.py
@@ -1,7 +1,7 @@
import os
+import platform
import subprocess
import sys
-import platform
from .constants import MUSCLE_GITHUB_LINK
from .utils import set_up_logger
@@ -11,21 +11,16 @@
# Get absolute package path
PACKAGE_PATH = os.path.abspath(os.path.dirname(__file__))
# Path to muscle binary (only exists after 'compile_muscle' was executed)
-MUSCLE_PATH = os.path.join(
- PACKAGE_PATH, f"bins/compiled/muscle/src/{platform.system()}/muscle"
-)
+MUSCLE_PATH = os.path.join(PACKAGE_PATH, f"bins/compiled/muscle/src/{platform.system()}/muscle")
def compile_muscle():
- """
- Compiles MUSCLE from source.
+ """Compiles MUSCLE from source.
+
Currently only supports Linux and Darwin.
"""
-
if platform.system() != "Linux" and platform.system() != "Darwin":
- raise OSError(
- f"Muscle compiler currently only supports Linux and Darwin, not {platform.system()}.\n"
- )
+ raise OSError(f"Muscle compiler currently only supports Linux and Darwin, not {platform.system()}.\n")
logger.info("Compiling MUSCLE binary from source... ")
@@ -56,16 +51,10 @@ def compile_muscle():
# Run make command
if platform.system() == "Linux":
- logger.warning(
- "Compiling MUSCLE requires that g++, make, sed and git are installed."
- )
+ logger.warning("Compiling MUSCLE requires that g++, make, sed and git are installed.")
if platform.system() == "Darwin":
- logger.warning(
- "Compiling MUSCLE requires that gcc v11, make, sed and git are installed."
- )
- logger.warning(
- "Please run 'brew install gcc' to install gcc v11 if the compile fails."
- )
+ logger.warning("Compiling MUSCLE requires that gcc v11, make, sed and git are installed.")
+ logger.warning("Please run 'brew install gcc' to install gcc v11 if the compile fails.")
command2 = "make -s"
diff --git a/gget/constants.py b/gget/constants.py
index 463987f77..1ea0aa663 100644
--- a/gget/constants.py
+++ b/gget/constants.py
@@ -47,13 +47,11 @@
GET_BACKGROUND_ENRICHR_URL = "https://maayanlab.cloud/speedrichr/api/backgroundenrich"
POST_ENRICHR_URLS = {
- f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/addList"
- for typ in ["fly", "yeast", "worm", "fish"]
+ f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/addList" for typ in ["fly", "yeast", "worm", "fish"]
}
POST_ENRICHR_URLS["human"] = POST_ENRICHR_URL
GET_ENRICHR_URLS = {
- f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/enrich"
- for typ in ["fly", "yeast", "worm", "fish"]
+ f"{typ}": f"https://maayanlab.cloud/{typ.capitalize()}Enrichr/enrich" for typ in ["fly", "yeast", "worm", "fish"]
}
GET_ENRICHR_URLS["human"] = GET_ENRICHR_URL
@@ -62,12 +60,8 @@
EXPRESSION_URL = "https://maayanlab.cloud/archs4/search/loadExpressionTissue.php?"
# Download links for ELM database
-ELM_INSTANCES_FASTA_DOWNLOAD = (
- "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic="
-)
-ELM_INSTANCES_TSV_DOWNLOAD = (
- "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic="
-)
+ELM_INSTANCES_FASTA_DOWNLOAD = "http://elm.eu.org/instances.fasta?q=*&taxon=&instance_logic="
+ELM_INSTANCES_TSV_DOWNLOAD = "http://elm.eu.org/instances.tsv?q=*&taxon=&instance_logic="
ELM_CLASSES_TSV_DOWNLOAD = "http://elm.eu.org/elms/elms_index.tsv"
ELM_INTDOMAINS_TSV_DOWNLOAD = "http://elm.eu.org/interactiondomains.tsv"
diff --git a/gget/gget_8cube.py b/gget/gget_8cube.py
index f22036caf..1f685bb00 100644
--- a/gget/gget_8cube.py
+++ b/gget/gget_8cube.py
@@ -1,10 +1,11 @@
-import requests
-import pandas as pd
-import json as json_package
import io
+import json as json_package
+
+import pandas as pd
+import requests
-from .utils import set_up_logger
from .constants import DEFAULT_REQUESTS_TIMEOUT
+from .utils import set_up_logger
logger = set_up_logger()
@@ -25,9 +26,7 @@ def _convert_to_df(response_text, endpoint_name):
try:
return pd.read_csv(io.StringIO(response_text))
except Exception as e:
- raise RuntimeError(
- f"API '{endpoint_name}' returned non-CSV data: {e}\nResponse:\n{response_text}"
- )
+ raise RuntimeError(f"API '{endpoint_name}' returned non-CSV data: {e}\nResponse:\n{response_text}") from e
def _save_output(df_or_json, name, json=False, verbose=True):
@@ -58,8 +57,8 @@ def specificity(
save=False,
verbose=True,
):
- """
- Retrieve gene-level specificity statistics from the 8cubeDB
+ """Retrieve gene-level specificity statistics from the 8cubeDB.
+
(https://eightcubedb.onrender.com/).
This endpoint returns ψ (psi) and ζ (zeta) specificity metrics for one
@@ -76,7 +75,8 @@ def specificity(
gget_8cube_specificity.csv (or .json if json=True).
- verbose If True, print progress information. Default: True.
- Returns:
+ Returns
+ -------
A pandas DataFrame or JSON list containing:
- gene_name
- ensembl_id
@@ -85,11 +85,12 @@ def specificity(
- Psi_mean, Psi_std
- Zeta_mean, Zeta_std
- Raises:
+ Raises
+ ------
- ValueError If gene_list is not a list.
- RuntimeError If the API request fails or returns invalid data.
- """
+ """
if not isinstance(gene_list, (list, tuple)):
raise ValueError("`gene_list` must be a list.")
@@ -130,8 +131,7 @@ def psi_block(
save=False,
verbose=True,
):
- """
- Retrieve ψ_block (psi-block) specificity scores from the 8cubeDB.
+ """Retrieve ψ_block (psi-block) specificity scores from the 8cubeDB.
ψ_block quantifies the specificity of a gene to a particular block
within a partition. This endpoint supports block-wise
@@ -146,15 +146,17 @@ def psi_block(
or .json if json=True.
- verbose If True, print progress information. Default: True.
- Returns:
+ Returns
+ -------
A pandas DataFrame or JSON list containing ψ_block scores for each block
label in the partition (e.g., "Male:NZOJ", "Female:B6J", etc.).
- Raises:
+ Raises
+ ------
- ValueError If gene_list is not a list.
- RuntimeError If the API request fails.
- """
+ """
if not isinstance(gene_list, (list, tuple)):
raise ValueError("`gene_list` must be a list.")
@@ -166,10 +168,7 @@ def psi_block(
] + [("gene_list", g) for g in processed]
if verbose:
- logger.info(
- f"Fetching ψ-block scores for {len(processed)} genes "
- f"({analysis_level}, {analysis_type})…"
- )
+ logger.info(f"Fetching ψ-block scores for {len(processed)} genes ({analysis_level}, {analysis_type})…")
r = requests.get(PSI_BLOCK_URL, params=params, timeout=DEFAULT_REQUESTS_TIMEOUT)
if not r.ok:
@@ -200,8 +199,7 @@ def gene_expression(
save=False,
verbose=True,
):
- """
- Retrieve normalized gene expression values from 8cubeDB.
+ """Retrieve normalized gene expression values from 8cubeDB.
This endpoint returns mean and variance of normalized expression for the
specified gene(s), computed over the selected partition. For example:
@@ -217,15 +215,17 @@ def gene_expression(
or .json if json=True.
- verbose If True, print progress information.
- Returns:
+ Returns
+ -------
A pandas DataFrame or JSON list with expression values and metadata for
each partition block (columns vary depending on analysis_type).
- Raises:
+ Raises
+ ------
- ValueError If gene_list is not a list.
- RuntimeError If the API request fails or returns invalid/empty data.
- """
+ """
if not isinstance(gene_list, (list, tuple)):
raise ValueError("`gene_list` must be a list.")
@@ -237,16 +237,11 @@ def gene_expression(
] + [("gene_list", g) for g in processed]
if verbose:
- logger.info(
- f"Fetching expression data for {len(processed)} genes "
- f"({analysis_level}, {analysis_type})…"
- )
+ logger.info(f"Fetching expression data for {len(processed)} genes ({analysis_level}, {analysis_type})…")
r = requests.get(GENE_EXPR_URL, params=params, timeout=DEFAULT_REQUESTS_TIMEOUT)
if not r.ok:
- raise RuntimeError(
- f"Gene expression request failed ({r.status_code}): {r.text}"
- )
+ raise RuntimeError(f"Gene expression request failed ({r.status_code}): {r.text}")
df = _convert_to_df(r.text, "gene_expression")
diff --git a/gget/gget_alphafold.py b/gget/gget_alphafold.py
index 6a574cb01..5455bbfc3 100644
--- a/gget/gget_alphafold.py
+++ b/gget/gget_alphafold.py
@@ -11,49 +11,43 @@
# Get current date and time for default foldername
dt_string = datetime.now().strftime("%Y_%m_%d-%H%M")
-from tqdm import tqdm
-import os
-import shutil
-import sys
-import enum
-import glob
-import json
-import subprocess
-import platform
-import collections
-import copy
-from concurrent import futures
-import random
-from urllib import request
-import matplotlib.pyplot as plt
-import numpy as np
-from IPython import display
-from ipywidgets import GridspecLayout
-from ipywidgets import Output
-
-from .utils import set_up_logger
+import collections # noqa: E402
+import copy # noqa: E402
+import enum # noqa: E402
+import glob # noqa: E402
+import json # noqa: E402
+import os # noqa: E402
+import platform # noqa: E402
+import random # noqa: E402
+import shutil # noqa: E402
+import subprocess # noqa: E402
+import sys # noqa: E402
+from concurrent import futures # noqa: E402
+from urllib import request # noqa: E402
+
+import matplotlib.pyplot as plt # noqa: E402
+import numpy as np # noqa: E402
+from IPython import display # noqa: E402
+from ipywidgets import GridspecLayout, Output # noqa: E402
+from tqdm import tqdm # noqa: E402
+
+from .utils import set_up_logger # noqa: E402
logger = set_up_logger()
-TQDM_BAR_FORMAT = (
- "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
-)
+TQDM_BAR_FORMAT = "{l_bar}{bar}| {n_fmt}/{total_fmt} [elapsed: {elapsed} remaining: {remaining}]"
-from .compile import PACKAGE_PATH
+from .compile import PACKAGE_PATH # noqa: E402
# from .gget_setup import TMP_DISK
-from .gget_setup import UUID, PARAMS_DIR
+from .gget_setup import PARAMS_DIR, UUID # noqa: E402
STEREO_CHEM_DIR = os.path.join(PARAMS_DIR, "stereo_chemical_props.txt")
# Path to jackhmmer binary
-JACKHMMER_BINARY_PATH = os.path.join(
- PACKAGE_PATH, f"bins/{platform.system()}/jackhmmer"
-)
+JACKHMMER_BINARY_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/jackhmmer")
# Test pattern to find closest source
-test_url_pattern = (
- "https://storage.googleapis.com/alphafold-colab{:s}/latest/uniref90_2022_01.fasta.1"
-)
+test_url_pattern = "https://storage.googleapis.com/alphafold-colab{:s}/latest/uniref90_2022_01.fasta.1"
# Sequence validation parameters
MIN_PER_SEQUENCE_LENGTH = 16
@@ -80,9 +74,7 @@
def plot_plddt_legend():
- """
- Function to plot the legend for pLDDT.
- """
+ """Function to plot the legend for pLDDT."""
thresh = [
"Very low (pLDDT < 50)",
"Low (70 > pLDDT > 50)",
@@ -109,17 +101,13 @@ def plot_plddt_legend():
def fetch(source):
- """
- Support function for finding closest source.
- """
+ """Support function for finding closest source."""
request.urlretrieve(test_url_pattern.format(source))
return source
def get_msa(fasta_path, msa_databases, total_jackhmmer_chunks):
- """
- Function to search for MSA for the given sequence using chunked Jackhmmer search.
- """
+ """Function to search for MSA for the given sequence using chunked Jackhmmer search."""
from alphafold.data.tools import jackhmmer
## Run the search against chunks of genetic databases to save disk space
@@ -150,9 +138,7 @@ def jackhmmer_chunk_callback(i):
def clean_up():
- """
- Function to clean up temporary files after running gget alphafold.
- """
+ """Function to clean up temporary files after running gget alphafold."""
# # Remove fasta files with input sequences
# files = glob.glob("target_*.fasta")
# for f in files:
@@ -196,8 +182,8 @@ def alphafold(
show_sidechains=True,
verbose=True,
):
- """
- Predicts the structure of a protein using a slightly simplified version of AlphaFold v2.3.0 (https://doi.org/10.1038/s41586-021-03819-2)
+ """Predicts the structure of a protein using a slightly simplified version of AlphaFold v2.3.0 (https://doi.org/10.1038/s41586-021-03819-2).
+
published in the AlphaFold Colab notebook (https://colab.research.google.com/github/deepmind/alphafold/blob/main/notebooks/AlphaFold.ipynb).
Args:
@@ -229,32 +215,29 @@ def alphafold(
If you use this function, please cite the gget (https://doi.org/10.1101/2022.05.17.492392) and AphaFold (https://doi.org/10.1038/s41586-021-03819-2) papers
and, if applicable, the AlphaFold-Multimer paper (https://www.biorxiv.org/content/10.1101/2021.10.04.463034v1).
"""
-
if platform.system() == "Windows":
- logger.warning(
- "gget setup alphafold and gget alphafold are not supported on Windows OS."
- )
+ logger.warning("gget setup alphafold and gget alphafold are not supported on Windows OS.")
## Check if third-party dependencies are installed
# Check if openmm is installed
try:
- import simtk.openmm as openmm
+ import simtk.openmm as openmm # noqa: F401
except ImportError as e:
raise ImportError(
f"""
Importing openmm resulted in the following error:
{e}
- Please install AlphaFold third-party dependency openmm by running the following command from the command line:
- For Python version < 3.10:
- 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
- For Python version 3.10:
- 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
- For Python version 3.11:
- 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
+ Please install AlphaFold third-party dependency openmm by running the following command from the command line:
+ For Python version < 3.10:
+ 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
+ For Python version 3.10:
+ 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
+ For Python version 3.11:
+ 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
(Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.)
"""
- )
+ ) from e
# Check if AlphaFold is installed
try:
@@ -262,7 +245,7 @@ def alphafold(
except ImportError:
logger.error(
"""
- Some third-party dependencies are missing. Please run the following command:
+ Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
"""
)
@@ -276,7 +259,7 @@ def alphafold(
if pdb_out.decode() == "":
logger.error(
"""
- Some third-party dependencies are missing. Please run the following command:
+ Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
"""
)
@@ -286,7 +269,7 @@ def alphafold(
if not os.path.exists(os.path.join(PARAMS_DIR, "params/")):
logger.error(
"""
- The AlphaFold model parameters are missing. Please run the following command:
+ The AlphaFold model parameters are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
"""
)
@@ -295,24 +278,17 @@ def alphafold(
if len(os.listdir(os.path.join(PARAMS_DIR, "params/"))) < 12:
logger.error(
"""
- The AlphaFold model parameters are missing. Please run the following command:
+ The AlphaFold model parameters are missing. Please run the following command:
>>> gget.setup('alphafold') or $ gget setup alphafold
"""
)
return
## Import AlphaFold functions
- from alphafold.notebooks import notebook_utils
- from alphafold.model import model
- from alphafold.model import config
- from alphafold.model import data
-
- from alphafold.data import feature_processing
- from alphafold.data import msa_pairing
- from alphafold.data import pipeline
- from alphafold.data import pipeline_multimer
-
from alphafold.common import protein
+ from alphafold.data import feature_processing, msa_pairing, pipeline, pipeline_multimer
+ from alphafold.model import config, data, model
+ from alphafold.notebooks import notebook_utils
try:
from alphafold.relax import utils
@@ -323,16 +299,16 @@ def alphafold(
Importing openmm resulted in the following error:
{e}
- Please install AlphaFold third-party dependency openmm by running the following command from the command line:
- For Python version < 3.10:
- 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
- For Python version 3.10:
- 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
- For Python version 3.11:
- 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
+ Please install AlphaFold third-party dependency openmm by running the following command from the command line:
+ For Python version < 3.10:
+ 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
+ For Python version 3.10:
+ 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
+ For Python version 3.11:
+ 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
(Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.)
"""
- )
+ ) from e
if relax:
# Import AlphaFold relax package
@@ -345,16 +321,16 @@ def alphafold(
Importing openmm resulted in the following error:
{e}
- Please install AlphaFold third-party dependency openmm by running the following command from the command line:
- For Python version < 3.10:
- 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
- For Python version 3.10:
- 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
- For Python version 3.11:
- 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
+ Please install AlphaFold third-party dependency openmm by running the following command from the command line:
+ For Python version < 3.10:
+ 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
+ For Python version 3.10:
+ 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
+ For Python version 3.11:
+ 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
(Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.)
"""
- )
+ ) from e
## Move stereo_chemical_props.txt from gget bins to Alphafold package so it can be found
# logger.info("Locate files containing stereochemical properties.")
@@ -367,7 +343,7 @@ def alphafold(
## Validate input sequence(s)
if verbose:
- logger.info(f"Validating input sequence(s).")
+ logger.info("Validating input sequence(s).")
# Handle command line passing path to FASTA as a list
if isinstance(sequence, list) and len(sequence) == 1:
@@ -381,7 +357,7 @@ def alphafold(
titles = []
seqs = []
with open(sequence) as text_file:
- for i, line in enumerate(text_file):
+ for i, line in enumerate(text_file): # noqa: B007
# Recognize a title line by the '>' character
if line[0] == ">":
# Append title line to titles list
@@ -398,9 +374,7 @@ def alphafold(
# Each second line will be a title line
if i % 2 == 0:
if line[0] != ">":
- raise ValueError(
- "Expected FASTA to start with a '>' character. "
- )
+ raise ValueError("Expected FASTA to start with a '>' character. ")
else:
# Append title line to titles list
titles.append(line.strip())
@@ -413,10 +387,8 @@ def alphafold(
else:
seqs.append(line.strip())
else:
- raise ValueError(
- "File format not recognized. gget alphafold only supports '.txt' or '.fa' files. "
- )
- elif type(sequence) == str and not "." in sequence:
+ raise ValueError("File format not recognized. gget alphafold only supports '.txt' or '.fa' files. ")
+ elif isinstance(sequence, str) and "." not in sequence:
# Convert string to list
seqs = [sequence]
else:
@@ -435,9 +407,7 @@ class ModelType(enum.Enum):
if len(seqs) == 1:
if multimer_for_monomer:
if verbose:
- logger.info(
- "Using the multimer model for a single chain, as requested."
- )
+ logger.info("Using the multimer model for a single chain, as requested.")
model_type_to_use = ModelType.MULTIMER
else:
if verbose:
@@ -460,7 +430,7 @@ class ModelType(enum.Enum):
if len(seqs[0]) > MAX_MONOMER_MODEL_LENGTH:
raise ValueError(
f"""
- Input sequence is too long: {len(sequences[0])} amino acids, while the maximum for the monomer model is {MAX_MONOMER_MODEL_LENGTH}.
+ Input sequence is too long: {len(sequences[0])} amino acids, while the maximum for the monomer model is {MAX_MONOMER_MODEL_LENGTH}.
You can try to run this sequence with the multimer model by using the flag [-mfm] ('multimer_for_monomer=True').
"""
)
@@ -472,7 +442,7 @@ class ModelType(enum.Enum):
## Find the closest source
if verbose:
- logger.info(f"Finding closest source for reference database.")
+ logger.info("Finding closest source for reference database.")
ex = futures.ThreadPoolExecutor(3)
fs = [ex.submit(fetch, source) for source in ["", "-europe", "-asia"]]
@@ -551,7 +521,7 @@ class ModelType(enum.Enum):
# Save the target sequence in a fasta file
fasta_path = os.path.join(abs_out_path, f"target_{sequence_index}.fasta")
- with open(fasta_path, "wt") as f:
+ with open(fasta_path, "w") as f:
f.write(f">query\n{sequence}")
# Don't do redundant work for multiple copies of the same chain in the multimer
@@ -570,45 +540,31 @@ class ModelType(enum.Enum):
single_chain_msas = []
uniprot_msa = None
for db_name, db_results in raw_msa_results.items():
- merged_msa = notebook_utils.merge_chunked_msa(
- results=db_results, max_hits=MAX_HITS.get(db_name)
- )
+ merged_msa = notebook_utils.merge_chunked_msa(results=db_results, max_hits=MAX_HITS.get(db_name))
if merged_msa.sequences and db_name != "uniprot":
single_chain_msas.append(merged_msa)
msa_size = len(set(merged_msa.sequences))
if verbose:
- logger.info(
- f"{msa_size} unique sequences found in {db_name} for sequence {sequence_index}."
- )
+ logger.info(f"{msa_size} unique sequences found in {db_name} for sequence {sequence_index}.")
elif merged_msa.sequences and db_name == "uniprot":
uniprot_msa = merged_msa
- notebook_utils.show_msa_info(
- single_chain_msas=single_chain_msas, sequence_index=sequence_index
- )
+ notebook_utils.show_msa_info(single_chain_msas=single_chain_msas, sequence_index=sequence_index)
# Turn the raw data into model features.
feature_dict = {}
feature_dict.update(
- pipeline.make_sequence_features(
- sequence=sequence, description="query", num_res=len(sequence)
- )
+ pipeline.make_sequence_features(sequence=sequence, description="query", num_res=len(sequence))
)
feature_dict.update(pipeline.make_msa_features(msas=single_chain_msas))
# Add empty placeholder features
- feature_dict.update(
- notebook_utils.empty_placeholder_template_features(
- num_templates=0, num_res=len(sequence)
- )
- )
+ feature_dict.update(notebook_utils.empty_placeholder_template_features(num_templates=0, num_res=len(sequence)))
# Construct the all_seq features only for heteromers, not homomers
if model_type_to_use == ModelType.MULTIMER and len(set(sequences)) > 1:
valid_feats = msa_pairing.MSA_FEATURES + ("msa_species_identifiers",)
all_seq_features = {
- f"{k}_all_seq": v
- for k, v in pipeline.make_msa_features([uniprot_msa]).items()
- if k in valid_feats
+ f"{k}_all_seq": v for k, v in pipeline.make_msa_features([uniprot_msa]).items() if k in valid_feats
}
feature_dict.update(all_seq_features)
@@ -621,15 +577,11 @@ class ModelType(enum.Enum):
elif model_type_to_use == ModelType.MULTIMER:
all_chain_features = {}
for chain_id, chain_features in features_for_chain.items():
- all_chain_features[chain_id] = pipeline_multimer.convert_monomer_features(
- chain_features, chain_id
- )
+ all_chain_features[chain_id] = pipeline_multimer.convert_monomer_features(chain_features, chain_id)
all_chain_features = pipeline_multimer.add_assembly_features(all_chain_features)
- np_example = feature_processing.pair_and_merge(
- all_chain_features=all_chain_features
- )
+ np_example = feature_processing.pair_and_merge(all_chain_features=all_chain_features)
# Pad MSA to avoid zero-sized extra_msa
np_example = pipeline_multimer.pad_msa(np_example, min_num_seq=512)
@@ -663,12 +615,8 @@ class ModelType(enum.Enum):
params = data.get_model_haiku_params(model_name, PARAMS_DIR)
model_runner = model.RunModel(cfg, params)
- processed_feature_dict = model_runner.process_features(
- np_example, random_seed=0
- )
- prediction = model_runner.predict(
- processed_feature_dict, random_seed=random.randrange(sys.maxsize)
- )
+ processed_feature_dict = model_runner.process_features(np_example, random_seed=0)
+ prediction = model_runner.predict(processed_feature_dict, random_seed=random.randrange(sys.maxsize))
if model_type_to_use == ModelType.MONOMER:
if "predicted_aligned_error" in prediction:
@@ -697,9 +645,7 @@ class ModelType(enum.Enum):
processed_feature_dict,
prediction,
b_factors=b_factors,
- remove_leading_feature_dimension=(
- model_type_to_use == ModelType.MONOMER
- ),
+ remove_leading_feature_dimension=(model_type_to_use == ModelType.MONOMER),
)
unrelaxed_proteins[model_name] = unrelaxed_protein
@@ -711,12 +657,10 @@ class ModelType(enum.Enum):
## AMBER relax the best model
# Find the best model according to the mean pLDDT.
- best_model_name = max(
- ranking_confidences.keys(), key=lambda x: ranking_confidences[x]
- )
+ best_model_name = max(ranking_confidences.keys(), key=lambda x: ranking_confidences[x])
if relax:
- pbar.set_description(f"AMBER relaxation")
+ pbar.set_description("AMBER relaxation")
amber_relaxer = run_relax.AmberRelaxation(
max_iterations=0,
@@ -726,9 +670,7 @@ class ModelType(enum.Enum):
max_outer_iterations=3,
use_gpu=False,
)
- relaxed_pdb, _, _ = amber_relaxer.process(
- prot=unrelaxed_proteins[best_model_name]
- )
+ relaxed_pdb, _, _ = amber_relaxer.process(prot=unrelaxed_proteins[best_model_name])
else:
logger.warning(
"\nRunning model without relaxation stage. Use flag [--relax] ('relax=True') to include AMBER relaxation."
diff --git a/gget/gget_archs4.py b/gget/gget_archs4.py
index e03fd43e9..2ec2b948c 100644
--- a/gget/gget_archs4.py
+++ b/gget/gget_archs4.py
@@ -1,17 +1,17 @@
-import requests
-import pandas as pd
-import json as json_package
import io
+import json as json_package
+
+import pandas as pd
+import requests
from .utils import set_up_logger
logger = set_up_logger()
# Custom functions
-from .gget_info import info
-
# Constants
-from .constants import GENECORR_URL, EXPRESSION_URL
+from .constants import EXPRESSION_URL, GENECORR_URL # noqa: E402
+from .gget_info import info # noqa: E402
def archs4(
@@ -24,9 +24,9 @@ def archs4(
save=False,
verbose=True,
):
- """
- Find the most correlated genes or the tissue expression atlas
- of a gene of interest using data from the human and mouse RNA-seq
+ """Find the most correlated genes or the tissue expression atlas of a gene of interest.
+
+ Uses data from the human and mouse RNA-seq
database ARCHS4 (https://maayanlab.cloud/archs4/).
Args:
@@ -52,16 +52,12 @@ def archs4(
# Check if 'which' argument is valid
whichs = ["correlation", "tissue"]
if which not in whichs:
- raise ValueError(
- f"'which' argument specified as {which}. Expected one of: {', '.join(whichs)}"
- )
+ raise ValueError(f"'which' argument specified as {which}. Expected one of: {', '.join(whichs)}")
# Check if 'species' argument is valid
sps = ["human", "mouse"]
if species not in sps:
- raise ValueError(
- f"'species' argument specified as {species}. Expected one of: {', '.join(sps)}"
- )
+ raise ValueError(f"'species' argument specified as {species}. Expected one of: {', '.join(sps)}")
## Transform Ensembl IDs to gene symbols
if ensembl:
@@ -72,9 +68,7 @@ def archs4(
# Check if Ensembl ID was found
if isinstance(info_df, type(None)):
- logger.error(
- f"ID '{gene}' not found. Please double-check spelling/arguments and try again."
- )
+ logger.error(f"ID '{gene}' not found. Please double-check spelling/arguments and try again.")
return
gene_symbol = info_df.loc[gene]["ensembl_gene_name"]
@@ -90,9 +84,7 @@ def archs4(
if which == "correlation":
if verbose:
- logger.info(
- f"Fetching the {gene_count} most correlated genes to {gene} from ARCHS4."
- )
+ logger.info(f"Fetching the {gene_count} most correlated genes to {gene} from ARCHS4.")
## Find most similar genes based on co-expression
# Define number of correlated genes to return (+1 to account for Python indexing)
@@ -120,9 +112,7 @@ def archs4(
)
return
else:
- logger.error(
- f"Gene correlation request for search term '{gene}' returned error: {corr_data['error']}"
- )
+ logger.error(f"Gene correlation request for search term '{gene}' returned error: {corr_data['error']}")
return
else:
@@ -136,9 +126,7 @@ def archs4(
if json:
results_dict = json_package.loads(corr_df.to_json(orient="records"))
if save:
- with open(
- f"gget_archs4_gene-correlation_{gene}.json", "w", encoding="utf-8"
- ) as f:
+ with open(f"gget_archs4_gene-correlation_{gene}.json", "w", encoding="utf-8") as f:
json_package.dump(results_dict, f, ensure_ascii=False, indent=4)
return results_dict
@@ -151,9 +139,7 @@ def archs4(
if which == "tissue":
if verbose:
- logger.info(
- f"Fetching the tissue expression atlas of {gene} from {species} ARCHS4 data."
- )
+ logger.info(f"Fetching the tissue expression atlas of {gene} from {species} ARCHS4 data.")
## Find tissue expression data
## Define API query
@@ -195,17 +181,13 @@ def archs4(
if json:
results_dict = json_package.loads(tissue_exp_df.to_json(orient="records"))
if save:
- with open(
- f"gget_archs4_tissue-expression_{gene}.json", "w", encoding="utf-8"
- ) as f:
+ with open(f"gget_archs4_tissue-expression_{gene}.json", "w", encoding="utf-8") as f:
json_package.dump(results_dict, f, ensure_ascii=False, indent=4)
return results_dict
else:
if save:
- tissue_exp_df.to_csv(
- f"gget_archs4_tissue-expression_{gene}.csv", index=False
- )
+ tissue_exp_df.to_csv(f"gget_archs4_tissue-expression_{gene}.csv", index=False)
return tissue_exp_df
diff --git a/gget/gget_bgee.py b/gget/gget_bgee.py
index 166d902fb..37398495b 100644
--- a/gget/gget_bgee.py
+++ b/gget/gget_bgee.py
@@ -1,19 +1,17 @@
-import pandas as pd
import json as json_
-from .utils import set_up_logger, json_list_to_df, http_json, dig
+from .utils import dig, http_json, json_list_to_df, set_up_logger
logger = set_up_logger()
def _bgee_species(gene_id: str, verbose=True):
- """
- Get species ID from Bgee
+ """Get species ID from Bgee.
+
:param gene_id: Ensembl gene ID
:param verbose: log progress
- :return: species ID
+ :return: species ID.
"""
-
if verbose:
logger.info(f"Getting species ID for gene {gene_id} from Bgee")
@@ -38,8 +36,7 @@ def _bgee_species(gene_id: str, verbose=True):
def _bgee_orthologs(gene_id, json=False, verbose=True):
- """
- Get orthologs for a gene from Bgee
+ """Get orthologs for a gene from Bgee.
Args:
@@ -51,9 +48,7 @@ def _bgee_orthologs(gene_id, json=False, verbose=True):
"""
# if single Ensembl ID passed as string, convert to list
if isinstance(gene_id, list):
- raise ValueError(
- "One a single gene ID can be passed at a time for ortholog searches."
- )
+ raise ValueError("Only a single gene ID can be passed at a time for ortholog searches.")
# must first obtain species
species = _bgee_species(gene_id, verbose=verbose)
@@ -96,8 +91,7 @@ def _bgee_orthologs(gene_id, json=False, verbose=True):
def _bgee_expression(gene_id, json=False, verbose=True):
- """
- Get expression data from Bgee
+ """Get expression data from Bgee.
Args:
@@ -143,7 +137,10 @@ def _bgee_expression(gene_id, json=False, verbose=True):
)
expression_data = dig(
- payload, "data", "expressionData", "expressionCalls",
+ payload,
+ "data",
+ "expressionData",
+ "expressionCalls",
context="Bgee API (expression)",
)
@@ -173,8 +170,7 @@ def bgee(
json=False,
verbose=True,
):
- """
- Get orthologs/expression data for a gene from Bgee (https://www.bgee.org/).
+ """Get orthologs/expression data for a gene from Bgee (https://www.bgee.org/).
Args:
type type of data to retrieve ('expression' or 'orthologs')
@@ -189,6 +185,4 @@ def bgee(
elif type == "orthologs":
return _bgee_orthologs(gene_id, json=json, verbose=verbose)
else:
- raise ValueError(
- f"Argument type should be 'expression' or 'orthologs', not '{type}'"
- )
+ raise ValueError(f"Argument type should be 'expression' or 'orthologs', not '{type}'")
diff --git a/gget/gget_blast.py b/gget/gget_blast.py
index 7afc48396..eb0183d81 100644
--- a/gget/gget_blast.py
+++ b/gget/gget_blast.py
@@ -1,24 +1,24 @@
-from io import StringIO
-
-import pandas as pd
import json as json_package
import time
-from bs4 import BeautifulSoup
+from io import StringIO
+from urllib.parse import urlencode
# Using urllib instead of requests here because requests does not
# support long queries (queries very long here due to input sequence)
-from urllib.request import urlopen, Request
-from urllib.parse import urlencode
+from urllib.request import Request, urlopen
+
+import pandas as pd
+from bs4 import BeautifulSoup
# Custom functions
-from .utils import parse_blast_ref_page, wrap_cols_func, read_fasta, set_up_logger
+from .utils import parse_blast_ref_page, read_fasta, set_up_logger, wrap_cols_func
logger = set_up_logger()
# Constants
-from .constants import (
- BLAST_URL,
+from .constants import ( # noqa: E402
BLAST_CLIENT,
+ BLAST_URL,
)
@@ -35,8 +35,8 @@ def blast(
json=False,
save=False,
):
- """
- BLAST a nucleotide or amino acid sequence against any BLAST DB.
+ """BLAST a nucleotide or amino acid sequence against any BLAST DB.
+
Args:
- sequence Sequence (str) or path to FASTA file.
(If more than one sequence in FASTA file, only the first will be submitted to BLAST.)
@@ -91,16 +91,12 @@ def blast(
_, seqs = read_fasta(sequence)
else:
- raise ValueError(
- "File format not recognized. gget BLAST currently only supports '.txt' or '.fa' files. "
- )
+ raise ValueError("File format not recognized. gget BLAST currently only supports '.txt' or '.fa' files. ")
# Set the first sequence from the fasta file as 'sequence'
sequence = seqs[0]
if len(seqs) > 1:
- logger.warning(
- "File contains more than one sequence. Only the first sequence will be submitted to BLAST."
- )
+ logger.warning("File contains more than one sequence. Only the first sequence will be submitted to BLAST.")
# Convert sequence to upper case
sequence = sequence.upper()
@@ -134,16 +130,12 @@ def blast(
else:
# Check if the user specified database is valid
if database not in dbs:
- raise ValueError(
- f"Database specified is {database}. Expected one of: {', '.join(dbs)}"
- )
+ raise ValueError(f"Database specified is {database}. Expected one of: {', '.join(dbs)}")
else:
if verbose:
logger.info("Sequence recognized as nucleotide sequence.")
- logger.info(
- "BLAST will use program 'blastn' with user-specified database."
- )
+ logger.info("BLAST will use program 'blastn' with user-specified database.")
# If sequence is an amino acid sequence, set program to blastp
elif set(sequence) <= amino_acids:
program = "blastp"
@@ -157,47 +149,39 @@ def blast(
else:
# Check if the user specified database is valid
if database not in dbs:
- raise ValueError(
- f"Database specified is {database}. Expected one of: {', '.join(dbs)}"
- )
+ raise ValueError(f"Database specified is {database}. Expected one of: {', '.join(dbs)}")
else:
if verbose:
logger.info("Sequence recognized as amino acid sequence.")
- logger.info(
- "BLAST will use program 'blastp' with user-specified database."
- )
+ logger.info("BLAST will use program 'blastp' with user-specified database.")
else:
raise ValueError(
f"""
Sequence not automatically recognized as a nucleotide or amino acid sequence.
Please specify 'program' and 'database'.
- Program options: {', '.join(programs)}
- Database options: {', '.join(dbs)}
+ Program options: {", ".join(programs)}
+ Database options: {", ".join(dbs)}
"""
)
else:
# Check if the user specified program is valid
if program not in programs:
- raise ValueError(
- f"Program specified is {program}. Expected one of: {', '.join(programs)}"
- )
+ raise ValueError(f"Program specified is {program}. Expected one of: {', '.join(programs)}")
# Ask user to also specify database
if database == "default":
raise ValueError(
f"""
- User-specified program requires user-specified database. Please also specify argument 'database'.
- Database options: {', '.join(dbs)}
+ User-specified program requires user-specified database. Please also specify argument 'database'.
+ Database options: {", ".join(dbs)}
"""
)
else:
# Check if the user specified database is valid
if database not in dbs:
- raise ValueError(
- f"Database specified is {database}. Expected one of: {', '.join(dbs)}"
- )
+ raise ValueError(f"Database specified is {database}. Expected one of: {', '.join(dbs)}")
## Translate filter arguments
if low_comp_filt is False:
@@ -246,14 +230,12 @@ def blast(
if RTOE < 11:
# Communicate RTOE
if verbose:
- logger.info(f"BLAST initiated. Estimated time to completion: 11 seconds.")
+ logger.info("BLAST initiated. Estimated time to completion: 11 seconds.")
time.sleep(11)
else:
# Communicate RTOE
if verbose:
- logger.info(
- f"BLAST initiated with search ID {RID}. Estimated time to completion: {RTOE} seconds."
- )
+ logger.info(f"BLAST initiated with search ID {RID}. Estimated time to completion: {RTOE} seconds.")
time.sleep(int(RTOE))
## Poll server for status and fetch search results
@@ -295,9 +277,7 @@ def blast(
continue
elif status == "FAILED":
- logger.error(
- f"Search {RID} failed; please try again and/or report to blast-help@ncbi.nlm.nih.gov."
- )
+ logger.error(f"Search {RID} failed; please try again and/or report to blast-help@ncbi.nlm.nih.gov.")
return
elif status == "UNKNOWN":
@@ -314,11 +294,7 @@ def blast(
# Parse HTML results
soup = BeautifulSoup(results, "html.parser")
# Get the descriptions table
- dsc_table = soup.find(
- lambda tag: tag.name == "table"
- and tag.has_attr("id")
- and tag["id"] == "dscTable"
- )
+ dsc_table = soup.find(lambda tag: tag.name == "table" and tag.has_attr("id") and tag["id"] == "dscTable")
if dsc_table is None:
logger.error(
diff --git a/gget/gget_blat.py b/gget/gget_blat.py
index 5d08200cc..0cb5a4aed 100644
--- a/gget/gget_blat.py
+++ b/gget/gget_blat.py
@@ -1,11 +1,12 @@
import json as json_package
import time
from json.decoder import JSONDecodeError
-import pandas as pd
from urllib import request
from urllib.error import HTTPError, URLError
-from .utils import set_up_logger, read_fasta
+import pandas as pd
+
+from .utils import read_fasta, set_up_logger
logger = set_up_logger()
@@ -22,8 +23,7 @@ def blat(
save=False,
verbose=True,
):
- """
- BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly.
+ """BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly.
Args:
- sequence Sequence (str) or path to fasta file containing one sequence.
@@ -38,7 +38,6 @@ def blat(
Returns a data frame with the BLAT results.
"""
-
## Clean up sequence
# If the path to a fasta file was provided instead of a nucleotide sequence,
# read the file and extract the first sequence
@@ -47,24 +46,18 @@ def blat(
_, seqs = read_fasta(sequence)
else:
- raise ValueError(
- "File format not recognized. gget BLAT currently only supports '.txt' or '.fa' files. "
- )
+ raise ValueError("File format not recognized. gget BLAT currently only supports '.txt' or '.fa' files. ")
# Set the first sequence from the fasta file as 'sequence'
sequence = seqs[0]
if len(seqs) > 1:
if verbose:
- logger.info(
- "File contains more than one sequence. Only the first sequence will be submitted to BLAT."
- )
+ logger.info("File contains more than one sequence. Only the first sequence will be submitted to BLAT.")
# Shorten sequence to length limit if necessary
if len(sequence) > 8000:
if verbose:
- logger.info(
- "Length of sequence is > 8000. Only the fist 8000 characters will be submitted to BLAT."
- )
+ logger.info("Length of sequence is > 8000. Only the first 8000 characters will be submitted to BLAT.")
sequence = sequence[:8000]
# Convert sequence to upper case
@@ -85,33 +78,27 @@ def blat(
if set(sequence) <= nucleotides:
seqtype = "DNA"
if verbose:
- logger.info(
- f"Sequence recognized as nucleotide sequence. 'seqtype' will be set as {seqtype}."
- )
+ logger.info(f"Sequence recognized as nucleotide sequence. 'seqtype' will be set as {seqtype}.")
# If sequence is an amino acid sequence, set seqtype to protein
elif set(sequence) <= amino_acids:
seqtype = "protein"
if verbose:
- logger.info(
- f"Sequence recognized as amino acid sequence. 'seqtype' will be set as {seqtype}."
- )
+ logger.info(f"Sequence recognized as amino acid sequence. 'seqtype' will be set as {seqtype}.")
else:
raise ValueError(
f"""
Sequence not automatically recognized as a nucleotide or amino acid sequence.
Please specify 'seqtype'.
- Seqtype options: {', '.join(seqtypes)}
+ Seqtype options: {", ".join(seqtypes)}
"""
)
else:
# Check if the user specified seqtype is valid
if seqtype not in seqtypes:
- raise ValueError(
- f"Seqtype specified is {seqtype}. Expected one of {', '.join(seqtypes)}"
- )
+ raise ValueError(f"Seqtype specified is {seqtype}. Expected one of {', '.join(seqtypes)}")
## Set assembly
# Note: If assembly not found, defaults to hg38
@@ -133,17 +120,13 @@ def blat(
if len(results["blat"]) == 0:
if verbose:
- logger.info(
- f"No {seqtype} BLAT matches were found for this sequence in genome {results['genome']}."
- )
+ logger.info(f"No {seqtype} BLAT matches were found for this sequence in genome {results['genome']}.")
return
# Let user know if assembly was not found
# If this is the case, BLAT automatically defaults to human (hg38)
if results["genome"] != database:
- logger.warning(
- f"Assembly {database} not recognized. Defaulted to {results['genome']} instead."
- )
+ logger.warning(f"Assembly {database} not recognized. Defaulted to {results['genome']} instead.")
## Build data frame to resemble BLAT web search results
# Define dataframe from dictionary
@@ -153,7 +136,7 @@ def blat(
df_dict.update({field: []})
for blat_result_list in results["blat"]:
- for field, (i, result) in zip(results["fields"], enumerate(blat_result_list)):
+ for field, (_i, result) in zip(results["fields"], enumerate(blat_result_list), strict=False):
df_dict[field].append(result)
df = pd.DataFrame(df_dict)
@@ -222,9 +205,9 @@ class _RetryableBlatError(Exception):
def _fetch_blat_results(url, seqtype, database):
- """
- Submit a BLAT request to UCSC and return the parsed JSON dict, or None
- on a non-recoverable failure. Retries transient failures (5xx, network
+ """Submit a BLAT request to UCSC and return the parsed JSON dict, or None on a non-recoverable failure.
+
+ Retries transient failures (5xx, network
errors, non-JSON responses from rate-limiting / HTML error pages) with
exponential backoff. The legacy "sequence too short or assembly invalid"
message is replaced with the actual server response so failures are
@@ -239,8 +222,7 @@ def _fetch_blat_results(url, seqtype, database):
if attempt < _BLAT_MAX_ATTEMPTS:
delay = _BLAT_BACKOFF_BASE_SECONDS * (2 ** (attempt - 1))
logger.warning(
- f"BLAT attempt {attempt}/{_BLAT_MAX_ATTEMPTS} failed ({last_error}). "
- f"Retrying in {delay:.1f}s."
+ f"BLAT attempt {attempt}/{_BLAT_MAX_ATTEMPTS} failed ({last_error}). Retrying in {delay:.1f}s."
)
time.sleep(delay)
@@ -278,10 +260,7 @@ def _fetch_blat_attempt(url, seqtype, database):
code = r.getcode()
if code != 200:
- raise RuntimeError(
- f"HTTP response status code {code}. "
- "Please double-check arguments and try again.\n"
- )
+ raise RuntimeError(f"HTTP response status code {code}. Please double-check arguments and try again.\n")
raw = r.read()
try:
@@ -290,13 +269,13 @@ def _fetch_blat_attempt(url, seqtype, database):
preview = _preview_bytes(raw)
# Non-JSON from a 200 response is almost always an HTML error / throttle
# page from UCSC, which is worth retrying.
- raise _RetryableBlatError(f"non-JSON response: {preview!r}")
+ raise _RetryableBlatError(f"non-JSON response: {preview!r}") from None
def _safe_read_preview(response, limit=300):
try:
return _preview_bytes(response.read(), limit=limit)
- except Exception:
+ except Exception: # noqa: BLE001
return ""
diff --git a/gget/gget_cbio.py b/gget/gget_cbio.py
index 427531656..ec70f7597 100644
--- a/gget/gget_cbio.py
+++ b/gget/gget_cbio.py
@@ -4,19 +4,18 @@
import math
import os
import subprocess
-import pandas as pd
+from collections import OrderedDict, defaultdict
+
+import matplotlib.pyplot as plt
import numpy as np
+import pandas as pd
import requests
+from matplotlib.colors import BoundaryNorm, ListedColormap, TwoSlopeNorm
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
-from collections import defaultdict, OrderedDict
-
-from .utils import set_up_logger
-
-import matplotlib.pyplot as plt
-from matplotlib.colors import ListedColormap, BoundaryNorm, TwoSlopeNorm
from .constants import CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY
+from .utils import set_up_logger
logger = set_up_logger()
@@ -29,8 +28,8 @@
def _ints_between(start, end, max_count, min_count, verbose=False):
- """
- Generate a list of integers between start and end (inclusive) with a maximum count of max_count and a minimum count min_count.
+ """Generate a list of integers between start and end (inclusive) with a maximum count of max_count and a minimum count min_count.
+
The list is guaranteed to contain start and end, and the spacing between the numbers will be as even as possible.
If a perfect spacing is not possible, the spacing will omit a number rather than overcrowding.
@@ -68,12 +67,10 @@ def _ints_between(start, end, max_count, min_count, verbose=False):
def _describe_bytes(size):
- """
- Describe a size in bytes in human-readable format.
+ """Describe a size in bytes in human-readable format.
:param size: size in bytes
"""
-
steps = ["bytes", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB"]
unit = steps.pop(0)
@@ -88,8 +85,7 @@ def _describe_bytes(size):
def _download_file_from_git_lfs(target_path: str, oid: str, size: int, verbose=False):
- """
- Download a single object from Git LFS.
+ """Download a single object from Git LFS.
:param target_path: path to save the downloaded object
:param oid: object ID
@@ -105,7 +101,7 @@ def _download_file_from_git_lfs(target_path: str, oid: str, size: int, verbose=F
lfs_metadata_json = json.dumps(lfs_metadata)
try:
- github_url = f"https://github.com/cBioPortal/datahub.git/info/lfs/objects/batch"
+ github_url = "https://github.com/cBioPortal/datahub.git/info/lfs/objects/batch"
curl_command = [
"curl",
@@ -142,7 +138,7 @@ def _download_file_from_git_lfs(target_path: str, oid: str, size: int, verbose=F
if verbose:
logger.info(f"Downloaded object {oid} to {target_path}")
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.error(f"Error downloading object {oid} to {target_path}: {e}")
return False
@@ -164,8 +160,7 @@ def add(self, target_path: str, oid: str, size: int):
self.objects.append((target_path, (oid, size)))
def download(self) -> bool:
- """
- Download all objects in the plan.
+ """Download all objects in the plan.
:return: True if all objects were downloaded successfully, False otherwise
"""
@@ -183,8 +178,7 @@ def download_cbioportal_data(
out_dir=None,
confirm_download=False,
) -> bool:
- """
- Download data from cBioPortal studies.
+ """Download data from cBioPortal studies.
Args:
@@ -197,7 +191,6 @@ def download_cbioportal_data(
:return: True if successfully downloaded all needed data, False otherwise
"""
-
actual_out_dir = os.path.abspath(out_dir or "gget_cbio_cache")
os.makedirs(actual_out_dir, exist_ok=True)
@@ -237,9 +230,7 @@ def download_cbioportal_data(
response = session.get(url, timeout=30)
if not response.ok:
- logger.error(
- f"Failed to download {file_type} data for study {study_id}"
- )
+ logger.error(f"Failed to download {file_type} data for study {study_id}")
if file_type not in optional_file_types:
success = False
continue
@@ -254,9 +245,9 @@ def download_cbioportal_data(
v = v.strip()
fields[k] = v
- assert (
- fields["version"] == "https://git-lfs.github.com/spec/v1"
- ), f"Cannot handle git-lfs version {fields['version']}"
+ assert fields["version"] == "https://git-lfs.github.com/spec/v1", (
+ f"Cannot handle git-lfs version {fields['version']}"
+ )
oid: str = fields["oid"].split(":")[1].strip()
size: int = int(fields["size"])
@@ -264,14 +255,10 @@ def download_cbioportal_data(
if plan:
plan.add(filename, oid, size)
else:
- success &= _download_file_from_git_lfs(
- filename, oid, size, verbose=verbose
- )
+ success &= _download_file_from_git_lfs(filename, oid, size, verbose=verbose)
- except Exception as e:
- logger.error(
- f"Error downloading {file_type} data for study {study_id}: {e}"
- )
+ except Exception as e: # noqa: BLE001
+ logger.error(f"Error downloading {file_type} data for study {study_id}: {e}")
success = False
if verbose and not confirm_download:
@@ -280,9 +267,7 @@ def download_cbioportal_data(
# If using a download plan AND there are actually objects to download, ask for confirmation
if plan and plan.objects:
do_download = (
- input(
- f"Do you want to download {_describe_bytes(plan.total_size)} to {actual_out_dir}? (y/n) "
- )
+ input(f"Do you want to download {_describe_bytes(plan.total_size)} to {actual_out_dir}? (y/n) ")
.lower()
.strip()
== "y"
@@ -306,8 +291,7 @@ def _extract_study_name(name: str) -> str:
def cbio_search(key_words):
- """
- Find cBioPortal study IDs by keyword.
+ """Find cBioPortal study IDs by keyword.
Args:
key_words list of keywords to search for - use tissues related to tissue or cancer type of interest (e.g., esophag, ovarian, etc)
@@ -316,13 +300,12 @@ def cbio_search(key_words):
:return: list of study IDs that match the keywords
"""
-
try:
from bravado.client import SwaggerClient
except ImportError:
logger.error(
"""
- Some third-party dependencies are missing. Please run the following command:
+ Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('cbio') or $ gget setup cbio
Alternative: Install the bravado package using pip (https://pypi.org/project/bravado).
@@ -348,18 +331,14 @@ def cbio_search(key_words):
studies = api.Studies.getAllStudiesUsingGET().result()
cancer_type_acronym_dict = {
- _extract_study_name(individual_study["name"]): individual_study["cancerTypeId"]
- for individual_study in studies
+ _extract_study_name(individual_study["name"]): individual_study["cancerTypeId"] for individual_study in studies
}
cancer_type_acronym_dict = OrderedDict(sorted(cancer_type_acronym_dict.items()))
cancer_id_list = [
cancer_type_acronym
for cancer_type, cancer_type_acronym in cancer_type_acronym_dict.items()
- if any(
- key_word in cancer_type.lower() or key_word in cancer_type_acronym.lower()
- for key_word in key_words
- )
+ if any(key_word in cancer_type.lower() or key_word in cancer_type_acronym.lower() for key_word in key_words)
and cancer_type_acronym.lower() != "mixed"
]
@@ -383,7 +362,7 @@ def _get_ensembl_gene_id(transcript_id: str, verbose=False):
data = response.json()
return data.get("Parent")
- except Exception as e:
+ except Exception: # noqa: BLE001
if verbose:
print(f"Error for: {transcript_id}")
return "Unknown"
@@ -394,7 +373,7 @@ def _get_ensembl_gene_id_bulk(transcript_ids):
return {}
try:
- url = f"https://rest.ensembl.org/lookup/id/"
+ url = "https://rest.ensembl.org/lookup/id/"
response = requests.post(
url,
json={"ids": transcript_ids},
@@ -407,9 +386,7 @@ def _get_ensembl_gene_id_bulk(transcript_ids):
data = response.json()
return {
- transcript_id: data[transcript_id].get("Parent")
- for transcript_id in transcript_ids
- if data[transcript_id]
+ transcript_id: data[transcript_id].get("Parent") for transcript_id in transcript_ids if data[transcript_id]
}
except Exception as e:
logger.error(f"Failed to fetch gene IDs from Ensembl: {e}")
@@ -421,29 +398,21 @@ def _get_ensembl_gene_name_bulk(gene_ids):
return {}
try:
- url = f"https://rest.ensembl.org/lookup/id/"
- response = requests.post(
- url, json={"ids": gene_ids}, headers={"Content-Type": "application/json"}
- )
+ url = "https://rest.ensembl.org/lookup/id/"
+ response = requests.post(url, json={"ids": gene_ids}, headers={"Content-Type": "application/json"})
if not response.ok:
response.raise_for_status()
data = response.json()
- return {
- gene_id: data[gene_id].get("display_name")
- for gene_id in gene_ids
- if data[gene_id]
- }
+ return {gene_id: data[gene_id].get("display_name") for gene_id in gene_ids if data[gene_id]}
except Exception as e:
logger.error(f"Failed to fetch gene names from Ensembl: {e}")
raise e
-def _get_valid_ensembl_gene_id(
- row, transcript_column: str = "seq_ID", gene_column: str = "gene_name"
-):
+def _get_valid_ensembl_gene_id(row, transcript_column: str = "seq_ID", gene_column: str = "gene_name"):
ensembl_gene_id = _get_ensembl_gene_id(row[transcript_column])
if ensembl_gene_id == "Unknown":
return row[gene_column]
@@ -451,7 +420,7 @@ def _get_valid_ensembl_gene_id(
def _get_valid_ensembl_gene_id_bulk(df: pd.DataFrame):
- map_: Optional[dict[str, str]] = None
+ map_: dict[str, str] | None = None
def f(
row: pd.Series,
@@ -495,17 +464,13 @@ def __init__(
ensembl_transcripts = [gene for gene in genes if gene.startswith("ENST")]
map_ = {
- k: v
- for k, v in _get_ensembl_gene_id_bulk(ensembl_transcripts).items()
- if v != "Unknown" and v is not None
+ k: v for k, v in _get_ensembl_gene_id_bulk(ensembl_transcripts).items() if v != "Unknown" and v is not None
}
genes = [map_.get(gene, gene) for gene in genes]
ensembl_gene_ids = [gene for gene in genes if gene.startswith("ENSG")]
map_ = {
- k: v
- for k, v in _get_ensembl_gene_name_bulk(ensembl_gene_ids).items()
- if v != "Unknown" and v is not None
+ k: v for k, v in _get_ensembl_gene_name_bulk(ensembl_gene_ids).items() if v != "Unknown" and v is not None
}
self.genes = [map_.get(gene, gene) for gene in genes]
@@ -529,9 +494,7 @@ def __init__(
"Entrez_Gene_Id",
"Consequence",
]
- self.column_for_merging: str = (
- "Hugo_Symbol" if self.merge_type == _SYMBOL else "Ensembl_Gene_ID"
- )
+ self.column_for_merging: str = "Hugo_Symbol" if self.merge_type == _SYMBOL else "Ensembl_Gene_ID"
self.df_collection = {}
self.big_combined_df = self._create_study_dataframes()
@@ -540,9 +503,7 @@ def _create_single_study_dataframe(self, study_id: str) -> pd.DataFrame:
data_folder = os.path.join(self.data_dir, study_id)
mutation_df = pd.read_csv(os.path.join(data_folder, "mutations.txt"), sep="\t")
- sample_df = pd.read_csv(
- os.path.join(data_folder, "clinical_sample.txt"), sep="\t"
- )
+ sample_df = pd.read_csv(os.path.join(data_folder, "clinical_sample.txt"), sep="\t")
self.df_collection[study_id]["mutations"] = mutation_df
self.df_collection[study_id]["samples"] = sample_df
@@ -556,13 +517,8 @@ def _create_single_study_dataframe(self, study_id: str) -> pd.DataFrame:
):
mutation_df.rename(columns={"Gene": "Ensembl_Gene_ID"}, inplace=True)
if self.remove_non_ensembl_genes:
- mutation_df = mutation_df[
- mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG")
- ]
- elif (
- "Transcript_ID" in mutation_df.columns
- and mutation_df["Transcript_ID"].str.startswith("ENST").any()
- ):
+ mutation_df = mutation_df[mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG")]
+ elif "Transcript_ID" in mutation_df.columns and mutation_df["Transcript_ID"].str.startswith("ENST").any():
logger.info("Fetching gene IDs from Ensembl")
mutation_df["Ensembl_Gene_ID"] = mutation_df.progress_apply(
_get_valid_ensembl_gene_id_bulk(mutation_df),
@@ -571,14 +527,10 @@ def _create_single_study_dataframe(self, study_id: str) -> pd.DataFrame:
gene_column="Hugo_Symbol",
)
if self.remove_non_ensembl_genes:
- mutation_df = mutation_df[
- mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG")
- ]
+ mutation_df = mutation_df[mutation_df["Ensembl_Gene_ID"].str.startswith("ENSG")]
else:
self.merge_type = _SYMBOL
- logger.warn(
- "No Ensembl gene IDs found in the mutation data. Merging on gene symbol instead."
- )
+ logger.warn("No Ensembl gene IDs found in the mutation data. Merging on gene symbol instead.")
def join_unique_string_values(series):
if series.isnull().all():
@@ -613,9 +565,7 @@ def join_unique_string_values(series):
self.columns_to_keep.remove("Entrez_Gene_Id")
aggregated_df = (
- mutation_df.groupby(["Tumor_Sample_Barcode", self.column_for_merging])
- .agg(aggregation_dict)
- .reset_index()
+ mutation_df.groupby(["Tumor_Sample_Barcode", self.column_for_merging]).agg(aggregation_dict).reset_index()
)
if self.column_for_merging not in self.columns_to_keep:
@@ -643,9 +593,7 @@ def join_unique_string_values(series):
self.df_collection[study_id]["cna"] = cna_df
# Exclude 'Hugo_Symbol' column
- columns_to_transform = self.df_collection[study_id][
- "cna"
- ].columns.difference(["Hugo_Symbol"])
+ columns_to_transform = self.df_collection[study_id]["cna"].columns.difference(["Hugo_Symbol"])
# Apply binary transformation to the selected columns
df_binary = self.df_collection[study_id]["cna"][columns_to_transform].map(
@@ -653,9 +601,7 @@ def join_unique_string_values(series):
)
# Add 'Hugo_Symbol' column back to the DataFrame
- df_binary.insert(
- 0, "Hugo_Symbol", self.df_collection[study_id]["cna"]["Hugo_Symbol"]
- )
+ df_binary.insert(0, "Hugo_Symbol", self.df_collection[study_id]["cna"]["Hugo_Symbol"])
# Reassign the transformed DataFrame to the collection
self.df_collection[study_id]["cna_binary"] = df_binary
@@ -690,22 +636,14 @@ def join_unique_string_values(series):
melted_sv = melted_sv.drop_duplicates(subset=["Sample_Id", "Hugo_Symbol"])
# Count the occurrences of each Hugo_Symbol in each Sample_Id
- sv_occurrences = (
- melted_sv.groupby(["Hugo_Symbol", "Sample_Id"])
- .size()
- .reset_index(name="sv_occurrences")
- )
+ sv_occurrences = melted_sv.groupby(["Hugo_Symbol", "Sample_Id"]).size().reset_index(name="sv_occurrences")
# Rename columns to match the desired output
- sv_occurrences = sv_occurrences.rename(
- columns={"Sample_Id": "Tumor_Sample_Barcode"}
- )
+ sv_occurrences = sv_occurrences.rename(columns={"Sample_Id": "Tumor_Sample_Barcode"})
final_df = pd.merge(
final_df,
- sv_occurrences[
- ["Hugo_Symbol", "Tumor_Sample_Barcode", "sv_occurrences"]
- ],
+ sv_occurrences[["Hugo_Symbol", "Tumor_Sample_Barcode", "sv_occurrences"]],
on=["Hugo_Symbol", "Tumor_Sample_Barcode"],
how="outer",
)
@@ -719,15 +657,13 @@ def join_unique_string_values(series):
elif "SAMPLE_ID" in sample_df.columns:
sample_identifier_column = "SAMPLE_ID"
else:
- raise AssertionError(
- "Sample Identifier column not found in the sample dataframe"
- )
+ raise AssertionError("Sample Identifier column not found in the sample dataframe")
columns_to_merge = [sample_identifier_column, "Cancer Type", "Cancer Type Detailed"]
for column in columns_to_merge:
if column not in sample_df.columns:
columns_to_merge.remove(column)
-
+
final_df = pd.merge(
final_df,
sample_df[columns_to_merge],
@@ -746,9 +682,7 @@ def join_unique_string_values(series):
)
final_df["tissue"] = (
- final_df["cancer_type"]
- .map(CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY)
- .fillna("unclassified")
+ final_df["cancer_type"].map(CBIO_CANCER_TYPE_TO_TISSUE_DICTIONARY).fillna("unclassified")
)
# Drop the redundant SAMPLE_ID column
@@ -766,7 +700,7 @@ def _create_study_dataframes(self) -> pd.DataFrame:
# clean up data just in case (cut out comments)
filename = f"{self.data_dir}/{study_id}/mutations.txt"
- with open(filename, "r") as file:
+ with open(filename) as file:
lines = file.readlines()
changed = False
@@ -782,7 +716,7 @@ def _create_study_dataframes(self) -> pd.DataFrame:
final_df = self._create_single_study_dataframe(study_id=study_id)
dataframes.append(final_df)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.error(f"Error processing study {study_id}: {e}")
continue
@@ -800,9 +734,9 @@ def plot_heatmap(
figure_title=None,
):
if variation_type == "cna_nonbinary" or variation_type == "Consequence":
- assert (
- stratification == "sample"
- ), "Stratification must be 'sample' for cna_nonbinary and Consequence variations"
+ assert stratification == "sample", (
+ "Stratification must be 'sample' for cna_nonbinary and Consequence variations"
+ )
if variation_type != "cna_nonbinary":
simple_merge_by_stratification: dict[str, list[str]] = {
@@ -823,9 +757,7 @@ def plot_heatmap(
if filter_category is None: # no filtering
final_df = self.big_combined_df
else:
- final_df = self.big_combined_df[
- self.big_combined_df[filter_category] == filter_value
- ]
+ final_df = self.big_combined_df[self.big_combined_df[filter_category] == filter_value]
merge_on = list(set(merge_on).intersection(final_df.columns))
@@ -843,9 +775,7 @@ def plot_heatmap(
unique_samples_info = final_df[available_cols].drop_duplicates()
- hugo_mask = final_df["Hugo_Symbol"].isin(
- [gene for gene in (self.genes) if not gene.startswith("ENSG")]
- )
+ hugo_mask = final_df["Hugo_Symbol"].isin([gene for gene in (self.genes) if not gene.startswith("ENSG")])
if self.merge_type == _ENSEMBL:
ensg_mask = final_df["Ensembl_Gene_ID"].isin(
@@ -868,16 +798,12 @@ def plot_heatmap(
else:
raise AssertionError(f"Invalid merge type: {self.merge_type}")
- unexpressed_genes = [
- gene for gene in (self.genes) if gene not in existing_genes
- ]
+ unexpressed_genes = [gene for gene in (self.genes) if gene not in existing_genes]
# Get all unique Tumor_Sample_Barcode from the original DataFrame
all_samples = final_df[merge_on].drop_duplicates()
- all_samples = pd.merge(
- all_samples, unique_samples_info, on=merge_on, how="left"
- )
+ all_samples = pd.merge(all_samples, unique_samples_info, on=merge_on, how="left")
if variation_type not in columns_to_keep_copy:
columns_to_keep_copy.append(variation_type)
@@ -889,10 +815,7 @@ def plot_heatmap(
"cancer_type_detailed",
]
for column_name in must_keep:
- if (
- column_name in merge_on
- and column_name not in columns_to_keep_copy
- ):
+ if column_name in merge_on and column_name not in columns_to_keep_copy:
columns_to_keep_copy.append(column_name)
# Merge the filtered genes DataFrame with all samples to ensure all samples are included
@@ -906,11 +829,7 @@ def plot_heatmap(
if stratification != "sample": # no filtering
df_for_heatmap_very_final: pd.DataFrame = (
- merged_df.groupby([self.column_for_merging, stratification])[
- variation_type
- ]
- .sum()
- .reset_index()
+ merged_df.groupby([self.column_for_merging, stratification])[variation_type].sum().reset_index()
)
else:
df_for_heatmap_very_final: pd.DataFrame = merged_df
@@ -951,9 +870,7 @@ def plot_heatmap(
pivot_df1 = pivot_df1[sorted_columns]
if unexpressed_genes:
- new_rows = pd.DataFrame(
- np.nan, index=unexpressed_genes, columns=pivot_df1.columns
- )
+ new_rows = pd.DataFrame(np.nan, index=unexpressed_genes, columns=pivot_df1.columns)
pivot_df1 = pd.concat([pivot_df1, new_rows])
title = f"Heatmap of Gene mutations per gene across {stratification}"
@@ -967,12 +884,8 @@ def plot_heatmap(
pivot_df1.rename(index=map_, inplace=True)
else: # variation_type == "cna_nonbinary"
- assert (
- stratification == "sample"
- ), "stratification must be 'sample' for CNA data"
- assert (
- filter_category == "study_id"
- ), "filter_category must be 'study_id' for CNA data"
+ assert stratification == "sample", "stratification must be 'sample' for CNA data"
+ assert filter_category == "study_id", "filter_category must be 'study_id' for CNA data"
pivot_df1 = self.df_collection[filter_value]["cna"].copy()
pivot_df1.set_index("Hugo_Symbol", inplace=True)
pivot_df1 = pivot_df1[pivot_df1.index.isin(self.genes)]
@@ -990,13 +903,11 @@ def plot_heatmap(
missing_genes = [g for g in self.genes if g not in existing]
if missing_genes:
new_rows = pd.DataFrame(
- {col: np.nan for col in pivot_df1.columns},
+ dict.fromkeys(pivot_df1.columns, np.nan),
index=missing_genes,
)
new_rows["Hugo_Symbol"] = missing_genes
- pivot_df1 = pd.concat(
- [pivot_df1, new_rows], ignore_index=True
- )
+ pivot_df1 = pd.concat([pivot_df1, new_rows], ignore_index=True)
# Set 'Hugo_Symbol' back as index if needed
pivot_df1 = pivot_df1.set_index("Hugo_Symbol")
@@ -1016,9 +927,7 @@ def plot_heatmap(
# limit to first 500 columns
render_divider_lines = True
render_column_ids = pivot_df1.shape[1] < 100
- if (
- pivot_df1.shape[1] > 372
- ): # 372 is fine, 373 is not. There's something wrong with pyplot...
+ if pivot_df1.shape[1] > 372: # 372 is fine, 373 is not. There's something wrong with pyplot...
print("Warning: Too many columns to plot. Limiting to first 372 columns")
pivot_df1 = pivot_df1.iloc[:, :372]
render_divider_lines = False
@@ -1030,12 +939,8 @@ def plot_heatmap(
levels = list(range(min_value + 1, max_value + 1))
pivot_df1 = pivot_df1.fillna(min_value)
- colors_list = plt.get_cmap("RdBu_r", max_value - min_value + 1)(
- range(max_value - min_value + 1)
- )
- colors_list = np.vstack(
- ([[0.5, 0.5, 0.5, 0.3]], colors_list[1:])
- ) # Grey color for -3
+ colors_list = plt.get_cmap("RdBu_r", max_value - min_value + 1)(range(max_value - min_value + 1))
+ colors_list = np.vstack(([[0.5, 0.5, 0.5, 0.3]], colors_list[1:])) # Grey color for -3
cmap = ListedColormap(colors_list)
# Define the norm with the diverging palette centered at 0
@@ -1044,16 +949,12 @@ def plot_heatmap(
elif variation_type == "Consequence":
consequences = list(self.big_combined_df["Consequence"].unique())
- colors_list = plt.get_cmap("tab20", len(consequences))(
- range(len(consequences))
- )
+ colors_list = plt.get_cmap("tab20", len(consequences))(range(len(consequences)))
# if consequences contains nan, ensure the nan value is at the beginning
if np.nan in consequences:
colors_list = np.vstack(([[1.0, 1.0, 1.0, 0.3]], colors_list[:-1]))
- consequences = [np.nan] + sorted(
- v for v in consequences if not isinstance(v, float)
- )
+ consequences = [np.nan] + sorted(v for v in consequences if not isinstance(v, float))
else:
consequences.sort()
nas_present = False
@@ -1068,9 +969,7 @@ def plot_heatmap(
)
levels = list(range(min_value, max_value))
- string_to_int = {
- consequence: i for i, consequence in enumerate(consequences)
- }
+ string_to_int = {consequence: i for i, consequence in enumerate(consequences)}
pivot_df1 = pivot_df1.map(lambda x: string_to_int[x])
@@ -1090,9 +989,7 @@ def plot_heatmap(
# Create a custom colormap
colors_list = plt.get_cmap("Reds", len(levels))(range(len(levels)))
if nas_present:
- colors_list = np.vstack(
- ([[0.5, 0.5, 0.5, 0.3]], colors_list)
- ) # Grey color for -1
+ colors_list = np.vstack(([[0.5, 0.5, 0.5, 0.3]], colors_list)) # Grey color for -1
cmap = ListedColormap(colors_list)
# Define the norm, with vmin set to -1 and vmax to max_value
@@ -1197,8 +1094,7 @@ def cbio_plot(
show=False,
figure_title=None,
):
- """
- Plot a heatmap of given genes in the given studies.
+ """Plot a heatmap of given genes in the given studies.
Args:
study_ids list of cBioPortal study IDs
@@ -1226,9 +1122,7 @@ def cbio_plot(
if verbose:
logger.info("Downloading data")
- if not download_cbioportal_data(
- study_ids, verbose=verbose, out_dir=data_dir, confirm_download=confirm_download
- ):
+ if not download_cbioportal_data(study_ids, verbose=verbose, out_dir=data_dir, confirm_download=confirm_download):
logger.error("Failed to download data. Continuing with available studies")
# return False
diff --git a/gget/gget_cellxgene.py b/gget/gget_cellxgene.py
index febaa9fc7..4b08bebad 100644
--- a/gget/gget_cellxgene.py
+++ b/gget/gget_cellxgene.py
@@ -16,12 +16,12 @@
def _listify(x):
- """
- Return x as a 1-D list suitable for SOMA `in [...]` filters.
+ """Return x as a 1-D list suitable for SOMA `in [...]` filters.
+
- None -> None
- "str" -> ["str"]
- iterables -> list(iterable)
- - scalars -> [scalar]
+ - scalars -> [scalar].
"""
if x is None:
return None
@@ -35,8 +35,8 @@ def _listify(x):
def _build_obs_filter(filters: dict, is_primary_data: bool):
- """
- Build a SOMA obs value_filter string like:
+ """Build a SOMA obs value_filter string like:
+
"is_primary_data == True and tissue in ['lung'] and cell_type in ['muscle cell']"
Only includes keys with non-empty values.
"""
@@ -81,8 +81,8 @@ def cellxgene(
verbose=True,
out=None,
):
- """
- Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/) using the
+ """Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/) via its Census.
+
CZ CELLxGENE Discover Census (https://github.com/chanzuckerberg/cellxgene-census).
NOTE: Querying large datasets requires a large amount of RAM. Use the cell metadata attributes
@@ -154,7 +154,7 @@ def cellxgene(
"tissue_general",
"tissue",
"cell_type",
- "disease"
+ "disease",
]
# Check dependency
@@ -163,7 +163,7 @@ def cellxgene(
except ImportError:
logger.error(
"""
- Some third-party dependencies are missing. Please run the following command:
+ Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('cellxgene') or $ gget setup cellxgene
Alternative: Install the cellxgene-census package using pip (https://pypi.org/project/cellxgene-census).
@@ -238,9 +238,7 @@ def cellxgene(
var_value_filter = None
if verbose:
- logger.info(
- "Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes..."
- )
+ logger.info("Fetching AnnData object from CZ CELLxGENE Discover. This might take a few minutes...")
with cellxgene_census.open_soma(census_version=census_version) as census:
adata = cellxgene_census.get_anndata(
census=census,
diff --git a/gget/gget_cosmic.py b/gget/gget_cosmic.py
index c96cc81a5..00bccf37f 100644
--- a/gget/gget_cosmic.py
+++ b/gget/gget_cosmic.py
@@ -1,32 +1,30 @@
-import requests
-import pandas as pd
-import subprocess
+import base64
+import getpass
+import gzip
+import json as json_package
import os
import re
-import json as json_package
-import base64
import shutil
+import subprocess
import tarfile
-import gzip
-import getpass
+
+import pandas as pd
# Constants
-from .constants import COSMIC_GET_URL
-from .utils import set_up_logger, get_latest_cosmic
+from .utils import get_latest_cosmic, set_up_logger
logger = set_up_logger()
def is_valid_email(email):
- """
- Check if an e-mail address is valid.
- """
+ """Check if an e-mail address is valid."""
email_pattern = re.compile(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)")
return re.match(email_pattern, email) is not None
-def download_reference(download_link, tar_folder_path, file_path, verbose, email = None, password = None, unzip = False):
+def download_reference(download_link, tar_folder_path, file_path, verbose, email=None, password=None, unzip=False):
+ """Download a COSMIC reference file using email/password authentication, extract the tar, and optionally unzip it."""
if not email:
email = input("Please enter your COSMIC email: ")
if not is_valid_email(email):
@@ -56,19 +54,17 @@ def download_reference(download_link, tar_folder_path, file_path, verbose, email
except json_package.JSONDecodeError:
raise RuntimeError(
"Failed to download file. Please double-check arguments (especially cosmic_version) and try again."
- )
+ ) from None
try:
true_download_url = response_data.get("url")
except AttributeError:
- raise AttributeError("Invalid username or password.")
+ raise AttributeError("Invalid username or password.") from None
curl_command2 = ["curl", true_download_url, "--output", f"{tar_folder_path}.tar"]
result2 = subprocess.run(curl_command2, capture_output=True, text=True)
if result2.returncode != 0:
- raise RuntimeError(
- f"Failed to download file. Return code: {result2.returncode}\n{result2.stderr}"
- )
+ raise RuntimeError(f"Failed to download file. Return code: {result2.returncode}\n{result2.stderr}")
with tarfile.open(f"{tar_folder_path}.tar", "r") as tar:
tar.extractall(path=tar_folder_path)
@@ -84,8 +80,17 @@ def download_reference(download_link, tar_folder_path, file_path, verbose, email
def select_reference(
- cosmic_project, reference_dir, grch_version, cosmic_version, verbose, email = None, password = None, unzip = True, overwrite = None
+ cosmic_project,
+ reference_dir,
+ grch_version,
+ cosmic_version,
+ verbose,
+ email=None,
+ password=None,
+ unzip=True,
+ overwrite=None,
):
+ """Resolve the download link and paths for the requested COSMIC project, then download and extract the database, returning the file path and overwrite flag."""
# if cosmic_project == "transcriptome":
# download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_Genes_Fasta_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads"
# tarred_folder = f"Cosmic_Genes_Fasta_v{cosmic_version}_GRCh{grch_version}"
@@ -93,20 +98,16 @@ def select_reference(
if cosmic_project == "cancer":
if grch_version == 38:
- logger.error(
- "CancerMutationCensus data is only available for GRCh37. Define grch_version=37."
- )
+ logger.error("CancerMutationCensus data is only available for GRCh37. Define grch_version=37.")
download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cmc/v{cosmic_version}/CancerMutationCensus_AllData_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads"
- tarred_folder = (
- f"CancerMutationCensus_AllData_Tsv_v{cosmic_version}_GRCh{grch_version}"
- )
- contained_file = (
- f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh{grch_version}.tsv"
- )
+ tarred_folder = f"CancerMutationCensus_AllData_Tsv_v{cosmic_version}_GRCh{grch_version}"
+ contained_file = f"CancerMutationCensus_AllData_v{cosmic_version}_GRCh{grch_version}.tsv"
if str(cosmic_version) == "100": # special treatment due to v2
download_link = download_link.replace(".tar&bucket=downloads", "_v2.tar&bucket=downloads")
tarred_folder += "_v2"
- if str(cosmic_version) == "99" or str(cosmic_version) == "100": # special treatment due to link difference - path=GRCh37 instead of path=grch37
+ if (
+ str(cosmic_version) == "99" or str(cosmic_version) == "100"
+ ): # special treatment due to link difference - path=GRCh37 instead of path=grch37
download_link = download_link.replace(f"path=grch{grch_version}", f"path=GRCh{grch_version}")
elif cosmic_project == "cell_line":
@@ -121,21 +122,13 @@ def select_reference(
elif cosmic_project == "resistance":
download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_ResistanceMutations_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads"
- tarred_folder = (
- f"Cosmic_ResistanceMutations_Tsv_v{cosmic_version}_GRCh{grch_version}"
- )
- contained_file = (
- f"Cosmic_ResistanceMutations_v{cosmic_version}_GRCh{grch_version}.tsv"
- )
+ tarred_folder = f"Cosmic_ResistanceMutations_Tsv_v{cosmic_version}_GRCh{grch_version}"
+ contained_file = f"Cosmic_ResistanceMutations_v{cosmic_version}_GRCh{grch_version}.tsv"
elif cosmic_project == "genome_screen":
download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_GenomeScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads"
- tarred_folder = (
- f"Cosmic_GenomeScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}"
- )
- contained_file = (
- f"Cosmic_GenomeScreensMutant_v{cosmic_version}_GRCh{grch_version}.tsv"
- )
+ tarred_folder = f"Cosmic_GenomeScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}"
+ contained_file = f"Cosmic_GenomeScreensMutant_v{cosmic_version}_GRCh{grch_version}.tsv"
elif cosmic_project == "targeted_screen":
download_link = f"https://cancer.sanger.ac.uk/api/mono/products/v1/downloads/scripted?path=grch{grch_version}/cosmic/v{cosmic_version}/Cosmic_CompleteTargetedScreensMutant_Tsv_v{cosmic_version}_GRCh{grch_version}.tar&bucket=downloads"
@@ -178,7 +171,7 @@ def select_reference(
f"{tar_folder_path}.tar",
download_link,
]
- result = subprocess.run(curl_command, capture_output=True, text=True)
+ subprocess.run(curl_command, capture_output=True, text=True)
with tarfile.open(f"{tar_folder_path}.tar", "r") as tar:
tar.extractall(path=tar_folder_path)
@@ -198,18 +191,20 @@ def select_reference(
.lower()
)
if proceed in ["yes", "y"]:
- download_reference(download_link, tar_folder_path, file_path, verbose, email = email, password = password, unzip = unzip)
+ download_reference(
+ download_link, tar_folder_path, file_path, verbose, email=email, password=password, unzip=unzip
+ )
else:
raise KeyboardInterrupt(
- f"Database download canceled. Learn more about COSMIC at https://cancer.sanger.ac.uk/cosmic/download/cosmic."
+ "Database download canceled. Learn more about COSMIC at https://cancer.sanger.ac.uk/cosmic/download/cosmic."
)
return file_path, overwrite
def make_exact_match_mask(df, searchterm_lower, cols_to_check):
- """
- Build a boolean mask for rows where any of the specified columns match the search term exactly.
+ """Build a boolean mask for rows where any of the specified columns match the search term exactly.
+
Handles special case for ACCESSION_NUMBER to match both with and without version.
Allows for columns in cols_to_check to be missing from the DataFrame.
"""
@@ -251,9 +246,7 @@ def make_exact_match_mask(df, searchterm_lower, cols_to_check):
def query_local_cosmic(cosmic_tsv_path, cosmic_project, searchterm, limit):
- """
- Search the local COSMIC mutation census file for matching entries.
- """
+ """Search the local COSMIC mutation census file for matching entries."""
df = pd.read_csv(cosmic_tsv_path, sep="\t", low_memory=False)
searchterm_lower = searchterm.lower()
results = []
@@ -261,25 +254,22 @@ def query_local_cosmic(cosmic_tsv_path, cosmic_project, searchterm, limit):
def match_and_limit(mask, extract_fn):
for _, row in df[mask].head(limit).iterrows():
results.append(extract_fn(row))
-
- if cosmic_project in ["cancer", "cancer_example"]:
+ if cosmic_project in ["cancer", "cancer_example"]:
# Columns to check for search term
cols_to_check = [
- "GENE_NAME",
- "ACCESSION_NUMBER",
- "LEGACY_MUTATION_ID",
- "Mutation CDS",
+ "GENE_NAME",
+ "ACCESSION_NUMBER",
+ "LEGACY_MUTATION_ID",
+ "Mutation CDS",
"Mutation AA",
- "GENOMIC_MUTATION_ID"
- ]
+ "GENOMIC_MUTATION_ID",
+ ]
mask = make_exact_match_mask(df, searchterm_lower, cols_to_check)
- match_and_limit(mask, lambda row: {
- col.replace(" ", "_"): row[col]
- for col in row.index
- if not col.startswith("__")
- })
+ match_and_limit(
+ mask, lambda row: {col.replace(" ", "_"): row[col] for col in row.index if not col.startswith("__")}
+ )
elif cosmic_project in ["census", "resistance", "cell_line", "genome_screen", "targeted_screen", "other"]:
# Columns to check for search term
@@ -292,24 +282,24 @@ def match_and_limit(mask, extract_fn):
"GENOMIC_MUTATION_ID",
"LEGACY_MUTATION_ID",
"SAMPLE_NAME",
- "MUTATION_CDS",
- "MUTATION_AA",
+ "MUTATION_CDS",
+ "MUTATION_AA",
"MUTATION_ID",
- "COSMIC_STUDY_ID"
- ]
+ "COSMIC_STUDY_ID",
+ ]
mask = make_exact_match_mask(df, searchterm_lower, cols_to_check)
- match_and_limit(mask, lambda row: {
- col.replace(" ", "_"): row[col]
- for col in row.index
- if not col.startswith("__")
- })
+ match_and_limit(
+ mask, lambda row: {col.replace(" ", "_"): row[col] for col in row.index if not col.startswith("__")}
+ )
else:
raise ValueError(f"Unsupported cosmic_project: {cosmic_project}")
-
+
if len(results) == 0:
- raise ValueError(f"No results were found for searchterm '{searchterm}' and cosmic_project '{cosmic_project}' in COSMIC database file (cosmic_tsv_path) '{cosmic_tsv_path}'.")
+ raise ValueError(
+ f"No results were found for searchterm '{searchterm}' and cosmic_project '{cosmic_project}' in COSMIC database file (cosmic_tsv_path) '{cosmic_tsv_path}'."
+ )
return results
@@ -332,10 +322,10 @@ def cosmic(
mutation_column="mutation",
mut_id_column="mutation_id",
out=None,
- verbose=True
+ verbose=True,
):
- """
- Search for genes, mutations, etc associated with cancers using the COSMIC
+ """Search for genes, mutations, etc associated with cancers using the COSMIC database.
+
- (Catalogue Of Somatic Mutations In Cancer) database
+ COSMIC is the Catalogue Of Somatic Mutations In Cancer
(https://cancer.sanger.ac.uk/cosmic).
NOTE: Licence fees apply for the commercial use of COSMIC (https://www.cosmickb.org/licensing).
@@ -373,7 +363,7 @@ def cosmic(
Examples: EGFR, ENST00000275493, c.650A>T, p.Q217L, COSV51765119, BT2012100223LNCTB (sample ID)
NOTE: Set to None when downloading COSMIC databases with download_cosmic=True.
- cosmic_tsv_path (str) Path to the COSMIC mutation tsv file, e.g. 'path/to/CancerMutationCensus_AllData_v101_GRCh37.tsv'.
- This file is downloaded when downloading COSMIC databases using the arguments described above.
+ This file is downloaded when downloading COSMIC databases using the arguments described above.
NOTE: This is a required argument when download_cosmic=False.
- limit (int) Number of hits to return. Default: 100
- json (True/False) If True, returns results in json format instead of data frame. Default: False
@@ -387,7 +377,6 @@ def cosmic(
- When download_cosmic=True: Database will be downloaded into current working directory
- verbose (True/False) whether to print progress information. Default: True
"""
-
if verbose:
logger.info("NOTE: Licence fees apply for the commercial use of COSMIC (https://www.cosmickb.org/licensing).")
@@ -396,7 +385,9 @@ def cosmic(
if not cosmic_project:
cosmic_project = "cancer"
if verbose:
- logger.info(f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (also works for 'cancer_example').")
+ logger.info(
+ f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (also works for 'cancer_example')."
+ )
mut_class_allowed = [
"cancer",
@@ -412,11 +403,9 @@ def cosmic(
f"Parameter 'cosmic_project' must be one of the following: {', '.join(mut_class_allowed)}.\n"
)
- grch_allowed = ['37', '38']
+ grch_allowed = ["37", "38"]
if str(grch_version) not in grch_allowed:
- raise ValueError(
- f"Parameter 'grch_version' must be one of the following: {', '.join(grch_allowed)}.\n"
- )
+ raise ValueError(f"Parameter 'grch_version' must be one of the following: {', '.join(grch_allowed)}.\n")
if not out:
out = os.getcwd()
@@ -427,21 +416,17 @@ def cosmic(
if not cosmic_version:
cosmic_version = get_latest_cosmic()
if verbose:
- logger.info(
- f"Downloading data from latest COSMIC version (v{cosmic_version})."
- )
+ logger.info(f"Downloading data from latest COSMIC version (v{cosmic_version}).")
## Download requested database
mutation_tsv_file, overwrite = select_reference(
- cosmic_project, out, grch_version, cosmic_version, verbose, email = email, password = password
+ cosmic_project, out, grch_version, cosmic_version, verbose, email=email, password=password
)
if gget_mutate and overwrite is not False:
## Create copy of results formatted for further use by gget mutate
if verbose:
- logger.info(
- "Creating modified mutations file for use with gget mutate..."
- )
+ logger.info("Creating modified mutations file for use with gget mutate...")
if cosmic_project == "cancer" or cosmic_project == "cancer_example":
relevant_cols = [
@@ -494,23 +479,16 @@ def cosmic(
# }
# )
- from gget.gget_mutate import mutation_pattern, convert_chromosome_value_to_int_when_possible
import numpy as np
+ from gget.gget_mutate import convert_chromosome_value_to_int_when_possible, mutation_pattern
+
# * uncomment to include strand information (tested not to be accurate for CMC)
- df[["chromosome", "GENOME_POS"]] = df[
- "Mutation genome position GRCh37"
- ].str.split(":", expand=True)
- df["chromosome"] = df["chromosome"].apply(
- convert_chromosome_value_to_int_when_possible
- )
- df[["GENOME_START", "GENOME_STOP"]] = df["GENOME_POS"].str.split(
- "-", expand=True
- )
+ df[["chromosome", "GENOME_POS"]] = df["Mutation genome position GRCh37"].str.split(":", expand=True)
+ df["chromosome"] = df["chromosome"].apply(convert_chromosome_value_to_int_when_possible)
+ df[["GENOME_START", "GENOME_STOP"]] = df["GENOME_POS"].str.split("-", expand=True)
- df[["nucleotide_positions", "actual_mutation"]] = df[
- "mutation"
- ].str.extract(mutation_pattern)
+ df[["nucleotide_positions", "actual_mutation"]] = df["mutation"].str.extract(mutation_pattern)
sub_mask = df["actual_mutation"].str.contains(">")
ins_mask = (df["actual_mutation"].str.contains("ins")) & (
@@ -520,16 +498,12 @@ def cosmic(
ins_delins_mask = ins_mask | delins_mask
sub_ins_delins_mask = sub_mask | ins_delins_mask
- df.loc[sub_mask, "wt_allele_cds"] = (
- df.loc[sub_mask, "actual_mutation"].str.split(">").str[0]
- )
- df.loc[sub_mask, "mut_allele_cds"] = (
- df.loc[sub_mask, "actual_mutation"].str.split(">").str[1]
- )
+ df.loc[sub_mask, "wt_allele_cds"] = df.loc[sub_mask, "actual_mutation"].str.split(">").str[0]
+ df.loc[sub_mask, "mut_allele_cds"] = df.loc[sub_mask, "actual_mutation"].str.split(">").str[1]
- df.loc[ins_delins_mask, "mut_allele_cds"] = df.loc[
- ins_delins_mask, "actual_mutation"
- ].str.extract(r"ins(.+)")[0]
+ df.loc[ins_delins_mask, "mut_allele_cds"] = df.loc[ins_delins_mask, "actual_mutation"].str.extract(
+ r"ins(.+)"
+ )[0]
df["strand"] = np.nan
@@ -545,13 +519,9 @@ def cosmic(
)
df.loc[sub_mask, "actual_mutation_updated"] = (
- df.loc[sub_mask, "GENOMIC_WT_ALLELE_SEQ"]
- + ">"
- + df.loc[sub_mask, "GENOMIC_MUT_ALLELE_SEQ"]
- )
- df.loc[ins_mask, "actual_mutation_updated"] = (
- "ins" + df.loc[ins_mask, "GENOMIC_MUT_ALLELE_SEQ"]
+ df.loc[sub_mask, "GENOMIC_WT_ALLELE_SEQ"] + ">" + df.loc[sub_mask, "GENOMIC_MUT_ALLELE_SEQ"]
)
+ df.loc[ins_mask, "actual_mutation_updated"] = "ins" + df.loc[ins_mask, "GENOMIC_MUT_ALLELE_SEQ"]
df.loc[delins_mask, "actual_mutation_updated"] = (
"delins" + df.loc[delins_mask, "GENOMIC_MUT_ALLELE_SEQ"]
)
@@ -577,14 +547,10 @@ def cosmic(
+ "_"
+ df["GENOME_STOP"].astype(str)
+ df["actual_mutation_final"],
- "g."
- + df["GENOME_START"].astype(str)
- + df["actual_mutation_final"],
+ "g." + df["GENOME_START"].astype(str) + df["actual_mutation_final"],
)
- df.loc[
- df["Mutation genome position GRCh37"].isna(), "mutation_genome"
- ] = np.nan
+ df.loc[df["Mutation genome position GRCh37"].isna(), "mutation_genome"] = np.nan
df.drop(
columns=[
@@ -635,9 +601,7 @@ def cosmic(
df = df.drop(columns=["GENE_NAME", "MUTATION_ID"])
if remove_duplicates:
- duplicate_count = (
- df.duplicated(subset=["seq_ID", "mutation"], keep=False).sum() // 2
- )
+ duplicate_count = df.duplicated(subset=["seq_ID", "mutation"], keep=False).sum() // 2
print(
f"Removing {duplicate_count} duplicate entries from the COSMIC csv for gget mutate: {duplicate_count}"
)
@@ -657,9 +621,7 @@ def cosmic(
df.to_csv(mutate_csv_out, index=False)
if verbose:
- logger.info(
- f"Modified mutations file for use with gget mutate created at {mutate_csv_out}"
- )
+ logger.info(f"Modified mutations file for use with gget mutate created at {mutate_csv_out}")
else:
- # Old code from when COSMIC was acccessible without an account:
+ # Old code from when COSMIC was accessible without an account:
@@ -677,7 +639,7 @@ def cosmic(
# raise ValueError(
# f"'entity' argument specified as {entity}. Expected one of: {', '.join(sps)}"
# )
-
+
# # Translate categories to match COSMIC data table IDs
# if entity == "cancer":
# entity = "disease"
@@ -847,7 +809,7 @@ def cosmic(
# counter = counter + 1
# if limit < counter:
# break
-
+
# Check if cosmic_tsv_path exists
if not cosmic_tsv_path or not os.path.exists(cosmic_tsv_path):
example_call_python = f"gget.cosmic(download_cosmic=True, searchterm=None, cosmic_project='{cosmic_project}', grch_version={grch_version}, cosmic_version={cosmic_version or get_latest_cosmic()})"
@@ -868,7 +830,9 @@ def cosmic(
else:
cosmic_project = "other"
if verbose:
- logger.info(f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (incapsulates all mutation classes except 'cancer' and 'cancer_example').")
+ logger.info(
+ f"No cosmic_project provided. Defaulting to cosmic_project '{cosmic_project}' (incapsulates all mutation classes except 'cancer' and 'cancer_example')."
+ )
# Query local COSMIC database
dicts = query_local_cosmic(cosmic_tsv_path, cosmic_project, searchterm, limit)
diff --git a/gget/gget_diamond.py b/gget/gget_diamond.py
index d5b1c6ab2..1dba1a60c 100644
--- a/gget/gget_diamond.py
+++ b/gget/gget_diamond.py
@@ -1,25 +1,20 @@
+import json as json_package
+import os
+import platform
import subprocess
import sys
-import platform
-import os
-import pandas as pd
import uuid
-import json as json_package
from .compile import PACKAGE_PATH
-from .utils import tsv_to_df, create_tmp_fasta, remove_temp_files, set_up_logger
+from .utils import create_tmp_fasta, remove_temp_files, set_up_logger, tsv_to_df
logger = set_up_logger()
# Path to precompiled diamond binary
if platform.system() == "Windows":
- PRECOMPILED_DIAMOND_PATH = os.path.join(
- PACKAGE_PATH, f"bins/{platform.system()}/diamond.exe"
- )
+ PRECOMPILED_DIAMOND_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/diamond.exe")
else:
- PRECOMPILED_DIAMOND_PATH = os.path.join(
- PACKAGE_PATH, f"bins/{platform.system()}/diamond"
- )
+ PRECOMPILED_DIAMOND_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/diamond")
def diamond(
@@ -34,8 +29,7 @@ def diamond(
json=False,
out=None,
):
- """
- Align multiple protein or translated DNA sequences using DIAMOND (https://www.nature.com/articles/nmeth.3176).
+ """Align multiple protein or translated DNA sequences using DIAMOND (https://www.nature.com/articles/nmeth.3176).
Args:
- query Sequences (str or list) or path to FASTA file containing sequences to be aligned against the reference.
@@ -128,14 +122,14 @@ def diamond(
if translated:
if verbose:
- logger.info(f"Aligning nucleotide query to amino acid reference (blastx mode).")
+ logger.info("Aligning nucleotide query to amino acid reference (blastx mode).")
diamond_program = "blastx"
else:
diamond_program = "blastp"
# Run DIAMOND commands as separate subprocess calls (avoids shell=True security issues)
if verbose:
- logger.info(f"Creating DIAMOND database and initiating alignment...")
+ logger.info("Creating DIAMOND database and initiating alignment...")
# Step 1: Check diamond version
version_cmd = [diamond_bin, "version"]
@@ -147,13 +141,7 @@ def diamond(
raise RuntimeError("DIAMOND version check failed.")
# Step 2: Create database
- makedb_cmd = [
- diamond_bin, "makedb",
- "--quiet",
- "--in", ref_file,
- "--db", db_path,
- "--threads", str(threads)
- ]
+ makedb_cmd = [diamond_bin, "makedb", "--quiet", "--in", ref_file, "--db", db_path, "--threads", str(threads)]
with subprocess.Popen(makedb_cmd, stderr=subprocess.PIPE) as process:
stderr = process.stderr.read().decode("utf-8")
if stderr:
@@ -163,17 +151,35 @@ def diamond(
# Step 3: Run alignment
align_cmd = [
- diamond_bin, diamond_program,
- "--outfmt", "6",
- "qseqid", "sseqid", "pident", "qlen", "slen", "length",
- "mismatch", "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore",
+ diamond_bin,
+ diamond_program,
+ "--outfmt",
+ "6",
+ "qseqid",
+ "sseqid",
+ "pident",
+ "qlen",
+ "slen",
+ "length",
+ "mismatch",
+ "gapopen",
+ "qstart",
+ "qend",
+ "sstart",
+ "send",
+ "evalue",
+ "bitscore",
"--quiet",
- "--query", in_file,
- "--db", ref_file,
- "--out", out_file,
+ "--query",
+ in_file,
+ "--db",
+ ref_file,
+ "--out",
+ out_file,
f"--{sensitivity}",
- "--threads", str(threads),
- "--ignore-warnings"
+ "--threads",
+ str(threads),
+ "--ignore-warnings",
]
with subprocess.Popen(align_cmd, stderr=subprocess.PIPE) as process:
stderr = process.stderr.read().decode("utf-8")
@@ -184,7 +190,7 @@ def diamond(
raise RuntimeError("DIAMOND alignment failed.")
else:
if verbose:
- logger.info(f"DIAMOND alignment complete.")
+ logger.info("DIAMOND alignment complete.")
df_diamond = tsv_to_df(
output,
diff --git a/gget/gget_elm.py b/gget/gget_elm.py
index cf766473d..806c2090c 100644
--- a/gget/gget_elm.py
+++ b/gget/gget_elm.py
@@ -1,26 +1,26 @@
-import pandas as pd
-import numpy as np
-import os
import json as json_package
+import os
import re
-from .utils import get_uniprot_seqs, tsv_to_df, set_up_logger
+import numpy as np
+import pandas as pd
+
+from .utils import get_uniprot_seqs, set_up_logger, tsv_to_df
logger = set_up_logger()
-from .constants import UNIPROT_REST_API
-from .gget_diamond import diamond
-from .gget_setup import (
- ELM_INSTANCES_FASTA,
+from .constants import UNIPROT_REST_API # noqa: E402
+from .gget_diamond import diamond # noqa: E402
+from .gget_setup import ( # noqa: E402
ELM_CLASSES_TSV,
+ ELM_INSTANCES_FASTA,
ELM_INSTANCES_TSV,
ELM_INTDOMAINS_TSV,
)
def motif_in_query(row):
- """
- Checks if motif is in the overlapping region with the query sequence
+ """Checks if motif is in the overlapping region with the query sequence.
Args:
row - row in dataframe
@@ -29,15 +29,13 @@ def motif_in_query(row):
"""
return (
True
- if (row["motif_start_in_subject"] >= row["subject_start"])
- & (row["motif_end_in_subject"] <= row["subject_end"])
+ if (row["motif_start_in_subject"] >= row["subject_start"]) & (row["motif_end_in_subject"] <= row["subject_end"])
else False
)
def get_elm_instances(UniProtID):
- """
- Get ELM instances and their information from local ELM tsv files.
+ """Get ELM instances and their information from local ELM tsv files.
Args:
- UniProtID UniProt Acc to search for in the accession column of ELM tsv files.
@@ -47,9 +45,7 @@ def get_elm_instances(UniProtID):
# Get matching rows from elm_instances.tsv
# ELM Instances.tsv file contains 5 lines before headers and data
df_full_instances = tsv_to_df(ELM_INSTANCES_TSV, skiprows=5)
- df_instances_matching = df_full_instances[
- df_full_instances["Primary_Acc"] == UniProtID
- ]
+ df_instances_matching = df_full_instances[df_full_instances["Primary_Acc"] == UniProtID]
# Rename columns
df_instances_matching = df_instances_matching.rename(
columns={
@@ -90,8 +86,8 @@ def seq_workflow(
verbose,
diamond_binary,
):
- """
- Alignment of sequence using DIAMOND to get UniProt Acc. Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow
+ """Alignment of sequence using DIAMOND to get UniProt Acc.
+
- except for additional columns for start, end and whether the motif overlaps the subject sequence.
+ Use the UniProt Acc to construct an ortholog dataframe similar to the UniProt workflow, except for additional columns for start, end and whether the motif overlaps the subject sequence.
Args:
@@ -133,10 +129,7 @@ def seq_workflow(
# Construct df with elm instances from UniProt Acc returned from diamond
# TODO double check that this gets info if more than one UniProt Acc matched
if verbose:
- uniprot_ids = [
- str(id).split("|")[1]
- for id in df_diamond["subject_accession"].values
- ]
+ uniprot_ids = [str(id).split("|")[1] for id in df_diamond["subject_accession"].values]
logger.info(
f"ORTHO Sequence {seq_number}/{len(sequences)}: DIAMOND found the following orthologous proteins: {', '.join(uniprot_ids)}. Retrieving ELMs for each UniProt Acc..."
)
@@ -147,20 +140,14 @@ def seq_workflow(
# missing motifs other than the first one
# df_elm["query_cover"] = df_diamond["length"].values[i] / seq_len * 100
df_elm["query_seq_length"] = df_diamond["query_seq_length"].values[i]
- df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[
- i
- ]
+ df_elm["subject_seq_length"] = df_diamond["subject_seq_length"].values[i]
df_elm["alignment_length"] = df_diamond["length"].values[i]
- df_elm["identity_percentage"] = df_diamond[
- "identity_percentage"
- ].values[i]
+ df_elm["identity_percentage"] = df_diamond["identity_percentage"].values[i]
df_elm["query_start"] = int(df_diamond["query_start"].values[i])
df_elm["query_end"] = int(df_diamond["query_end"].values[i])
df_elm["subject_start"] = int(df_diamond["subject_start"].values[i])
df_elm["subject_end"] = int(df_diamond["subject_end"].values[i])
- df_elm["motif_inside_subject_query_overlap"] = df_elm.apply(
- motif_in_query, axis=1
- )
+ df_elm["motif_inside_subject_query_overlap"] = df_elm.apply(motif_in_query, axis=1)
df = pd.concat([df, df_elm])
@@ -170,15 +157,16 @@ def seq_workflow(
def regex_match(sequence):
- """
- Compare ELM regex with input sequence and return all matching elms
+ """Compare ELM regex with input sequence and return all matching elms.
Args:
sequence - user input sequence (can be either amino acid seq or UniProt Acc)
- Returns:
+ Returns
+ -------
df_final - dataframe containing regex matches
TODO: Make sure this returns empty dataframe if no matches were found
+
"""
# Get all motif regex patterns from elm db local file
df_elm_classes = tsv_to_df(ELM_CLASSES_TSV, skiprows=5)
@@ -199,7 +187,7 @@ def regex_match(sequence):
df_final = pd.DataFrame()
# Compare ELM regex with input sequence and return all matching elms
- for elm_id, pattern in zip(elm_ids, regex_patterns):
+ for elm_id, pattern in zip(elm_ids, regex_patterns, strict=False):
regex_matches = re.finditer(f"(?=({pattern}))", sequence)
for match_string in regex_matches:
@@ -214,7 +202,7 @@ def regex_match(sequence):
elm_row.insert(loc=2, column="motif_start_in_query", value=int(start + 1))
elm_row.insert(loc=3, column="motif_end_in_query", value=int(end))
- elm_identifier = [str(x) for x in elm_row["ELMIdentifier"]][0]
+ [str(x) for x in elm_row["ELMIdentifier"]][0]
# df_instances_matching = df_full_instances.loc[
# df_full_instances["ELMIdentifier"] == elm_identifier
@@ -243,8 +231,8 @@ def elm(
json=False,
out=None,
):
- """
- Locally predicts Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc using
+ """Locally predicts Eukaryotic Linear Motifs from an amino acid sequence or UniProt Acc.
+
- data from the ELM database (http://elm.eu.org/).
+ Uses data from the ELM database (http://elm.eu.org/).
Args:
@@ -276,7 +264,7 @@ def elm(
or not os.path.exists(ELM_INTDOMAINS_TSV)
):
raise FileNotFoundError(
- f"Some or all ELM database files are missing. Please run 'gget setup elm' (Python: gget.setup('elm')) once to download the necessary files."
+ "Some or all ELM database files are missing. Please run 'gget setup elm' (Python: gget.setup('elm')) once to download the necessary files."
)
# Let users know when local ELM was last updated
@@ -299,12 +287,12 @@ def elm(
- # If sequence is not a valid amino sequence, raise error
+ # If sequence is not a valid amino acid sequence, log a warning
if not set(sequence) <= amino_acids:
logger.warning(
- f"Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)."
+ "Input amino acid sequence contains invalid characters. If the input is a UniProt Acc, please use flag --uniprot (Python: uniprot=True)."
)
# Build ortholog dataframe
if verbose:
- logger.info(f"ORTHO Compiling ortholog information...")
+ logger.info("ORTHO Compiling ortholog information...")
ortho_df = pd.DataFrame()
if uniprot:
ortho_df = get_elm_instances(sequence)
@@ -317,9 +305,7 @@ def elm(
if len(df_uniprot) > 0:
# Only grab sequences where IDs match exactly
- aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence][
- "sequence"
- ].values
+ aa_seqs = df_uniprot[df_uniprot["uniprot_id"] == sequence]["sequence"].values
if len(aa_seqs) == 0:
raise ValueError(
@@ -350,9 +336,7 @@ def elm(
)
if len(ortho_df) == 0:
- logger.warning(
- "ORTHO No ELM database orthologs found for input sequence or UniProt Acc."
- )
+ logger.warning("ORTHO No ELM database orthologs found for input sequence or UniProt Acc.")
# Reorder columns of ortholog data frame
ortho_cols = [
@@ -393,28 +377,25 @@ def elm(
ortho_df = ortho_df[ortho_cols]
# Remove false positives and true negatives
ortho_df = ortho_df[
- (ortho_df["InstanceLogic"] != "false positive")
- & (ortho_df["InstanceLogic"] != "true negative")
+ (ortho_df["InstanceLogic"] != "false positive") & (ortho_df["InstanceLogic"] != "true negative")
]
# Drop duplicate rows and reset the index
ortho_df = ortho_df.drop_duplicates().reset_index(drop=True)
# Build data frame containing regex motif matches
if verbose:
- logger.info(f"REGEX Finding regex motif matches...")
+ logger.info("REGEX Finding regex motif matches...")
fetch_aa_failed = False
if uniprot:
# use amino acid sequence associated with UniProt Acc to do regex match
# do not fetch sequence again if already done above
- if not "df_uniprot" in locals():
+ if "df_uniprot" not in locals():
df_uniprot = get_uniprot_seqs(UNIPROT_REST_API, sequence)
if len(df_uniprot) > 0:
# Only grab sequences where IDs match exactly
- sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence][
- "sequence"
- ].values
+ sequences = df_uniprot[df_uniprot["uniprot_id"] == sequence]["sequence"].values
if len(sequences) == 0:
logger.warning(
@@ -433,9 +414,7 @@ def elm(
df_regex_matches = regex_match(sequence)
if len(df_regex_matches) == 0:
- logger.warning(
- "REGEX No regex matches found for input sequence or UniProt Acc."
- )
+ logger.warning("REGEX No regex matches found for input sequence or UniProt Acc.")
# Reorder regex columns
if expand:
@@ -492,8 +471,7 @@ def elm(
df_regex_matches = df_regex_matches[regex_cols]
# Remove false positives and true negatives
df_regex_matches = df_regex_matches[
- (df_regex_matches["InstanceLogic"] != "false positive")
- & (df_regex_matches["InstanceLogic"] != "true negative")
+ (df_regex_matches["InstanceLogic"] != "false positive") & (df_regex_matches["InstanceLogic"] != "true negative")
]
# Drop duplicates and reset index
df_regex_matches = df_regex_matches.drop_duplicates().reset_index(drop=True)
diff --git a/gget/gget_enrichr.py b/gget/gget_enrichr.py
index 8b1050dc3..9e98c4121 100644
--- a/gget/gget_enrichr.py
+++ b/gget/gget_enrichr.py
@@ -1,32 +1,29 @@
-import requests
-import pandas as pd
import json as json_package
-import numpy as np
+import textwrap
# Plotting packages
import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import requests
from matplotlib.ticker import MaxNLocator
-import textwrap
+from .compile import PACKAGE_PATH
from .constants import (
- POST_ENRICHR_URLS,
+ DEFAULT_REQUESTS_TIMEOUT,
+ GET_BACKGROUND_ENRICHR_URL,
GET_ENRICHR_URLS,
POST_BACKGROUND_ID_ENRICHR_URL,
- GET_BACKGROUND_ENRICHR_URL,
- DEFAULT_REQUESTS_TIMEOUT,
+ POST_ENRICHR_URLS,
)
-from .compile import PACKAGE_PATH
from .gget_info import info
-
from .utils import set_up_logger
logger = set_up_logger()
def ensembl_to_gene_names(ensembl_ids):
- """
- Function to fetch gene names from a list of Ensembl IDs using gget info.
- """
+ """Function to fetch gene names from a list of Ensembl IDs using gget info."""
genes_v2 = []
# Remove version number if passed
@@ -37,9 +34,7 @@ def ensembl_to_gene_names(ensembl_ids):
for gene_id in ensembl_ids:
# Check if Ensembl ID was found
if gene_id not in info_df.index:
- logger.warning(
- f"ID '{gene_id}' not found. Please double-check spelling/arguments."
- )
+ logger.warning(f"ID '{gene_id}' not found. Please double-check spelling/arguments.")
continue
gene_symbol = info_df.loc[gene_id]["ensembl_gene_name"]
@@ -54,6 +49,7 @@ def ensembl_to_gene_names(ensembl_ids):
def clean_genes_list(genes_list):
+ """Remove NaNs, Nones, and 'nan' strings from a list of genes."""
# Remove any NaNs/Nones from the gene list
genes_clean = []
for gene in genes_list:
@@ -79,8 +75,7 @@ def enrichr(
save=False,
verbose=True,
):
- """
- Perform an enrichment analysis on a list of genes using Enrichr (https://maayanlab.cloud/Enrichr/).
+ """Perform an enrichment analysis on a list of genes using Enrichr (https://maayanlab.cloud/Enrichr/).
Args:
- genes List of Entrez gene symbols to perform enrichment analysis on, passed as a list of strings, e.g. ['PHF14', 'RBM3', 'MSL1', 'PHF21A'].
@@ -117,11 +112,8 @@ def enrichr(
Returns a data frame with the Enrichr results.
"""
-
if species not in ["human", "mouse", "fly", "yeast", "worm", "fish"]:
- raise ValueError(
- f"Argument 'species' must be one of 'human', 'mouse', 'fly', 'yeast', 'worm', or 'fish'."
- )
+ raise ValueError("Argument 'species' must be one of 'human', 'mouse', 'fly', 'yeast', 'worm', or 'fish'.")
if species == "mouse":
species = "human"
@@ -161,61 +153,49 @@ def enrichr(
# All available libraries: https://maayanlab.cloud/Enrichr/#libraries
if species == "human":
db_message = f"""
- Please note that there might be a more appropriate database for your application.
+ Please note that there might be a more appropriate database for your application.
Go to https://maayanlab.cloud/{species_enrichr}/#libraries for a full list of supported databases.
"""
else:
db_message = f"""
- Please note that there might be a more appropriate database for your application.
+ Please note that there might be a more appropriate database for your application.
Go to https://maayanlab.cloud/{species_enrichr}/#stats for a full list of supported databases.
"""
if not isinstance(background, bool):
raise ValueError(
- f"Argument`background` must be a boolean True/False. If you are adding a background list, use the argument `background_list` instead."
+ "Argument`background` must be a boolean True/False. If you are adding a background list, use the argument `background_list` instead."
)
# Handle database shortcuts
if database == "pathway":
database = "KEGG_2021_Human"
if verbose:
- logger.info(
- f"Performing Enrichr analysis using database {database}. " + db_message
- )
+ logger.info(f"Performing Enrichr analysis using database {database}. " + db_message)
elif database == "transcription":
database = "ChEA_2016"
if verbose:
- logger.info(
- f"Performing Enrichr analysis using database {database}. " + db_message
- )
+ logger.info(f"Performing Enrichr analysis using database {database}. " + db_message)
elif database == "ontology":
database = "GO_Biological_Process_2021"
if verbose:
- logger.info(
- f"Performing Enrichr analysis using database {database}. " + db_message
- )
+ logger.info(f"Performing Enrichr analysis using database {database}. " + db_message)
elif database == "diseases_drugs":
database = "GWAS_Catalog_2019"
if verbose:
- logger.info(
- f"Performing Enrichr analysis using database {database}. " + db_message
- )
+ logger.info(f"Performing Enrichr analysis using database {database}. " + db_message)
elif database == "celltypes":
database = "PanglaoDB_Augmented_2021"
if verbose:
- logger.info(
- f"Performing Enrichr analysis using database {database}. " + db_message
- )
+ logger.info(f"Performing Enrichr analysis using database {database}. " + db_message)
elif database == "kinase_interactions":
database = "KEA_2015"
if verbose:
- logger.info(
- f"Performing Enrichr analysis using database {database}. " + db_message
- )
+ logger.info(f"Performing Enrichr analysis using database {database}. " + db_message)
else:
database = database
@@ -225,9 +205,7 @@ def enrichr(
# To generate a KEGG pathway image, confirm that the database is a KEGG database and pykegg is installed
if kegg_out:
if not database.startswith("KEGG"):
- logger.error(
- "Please specify a KEGG database when generating a KEGG pathway image."
- )
+ logger.error("Please specify a KEGG database when generating a KEGG pathway image.")
return
try:
import pykegg
@@ -268,9 +246,7 @@ def enrichr(
if ensembl:
if verbose:
- logger.info(
- f"Performing Enrichr analysis on the following gene symbols: {', '.join(genes_clean)}"
- )
+ logger.info(f"Performing Enrichr analysis on the following gene symbols: {', '.join(genes_clean)}")
# Join genes from list
genes_clean_final = "\n".join(genes_clean)
@@ -303,9 +279,7 @@ def enrichr(
# If user gives a background list, use the user input instead of the default
if background_list:
if verbose:
- logger.info(
- f"Performing Enrichr analysis using user-defined background gene list."
- )
+ logger.info("Performing Enrichr analysis using user-defined background gene list.")
if background:
logger.warning(
@@ -409,14 +383,14 @@ def enrichr(
if species == "human":
logger.error(
f"""
- Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#libraries
+ Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#libraries
for a full list of supported databases.
"""
)
else:
logger.error(
f"""
- Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#stats
+ Database {database} not found. Go to https://maayanlab.cloud/{species_enrichr}/#stats
for a full list of supported databases.
"""
)
@@ -485,12 +459,8 @@ def enrichr(
# Plot barplot
# ax1.barh(np.arange(len(gene_counts)), gene_counts, color=cmap(c_values), align="center")
- ax1.barh(
- np.arange(len(gene_counts)), gene_counts, color=barcolor, align="center"
- )
- ax1.set_yticks(
- np.arange(len(gene_counts)), labels, linespacing=0.85, fontsize=fontsize
- )
+ ax1.barh(np.arange(len(gene_counts)), gene_counts, color=barcolor, align="center")
+ ax1.set_yticks(np.arange(len(gene_counts)), labels, linespacing=0.85, fontsize=fontsize)
ax1.invert_yaxis()
# Set x-limit to be gene count + 1
ax1.set_xlim(0, ax1.get_xlim()[1] + 1)
@@ -509,9 +479,7 @@ def enrichr(
s=20,
)
# Change label and color of p-value axis
- ax2.set_xlabel(
- "$-log_{10}$(adjusted P value)", fontsize=fontsize, color=p_val_color
- )
+ ax2.set_xlabel("$-log_{10}$(adjusted P value)", fontsize=fontsize, color=p_val_color)
ax2.spines["top"].set_color(p_val_color)
ax2.tick_params(axis="x", colors=p_val_color, labelsize=fontsize)
@@ -543,9 +511,7 @@ def enrichr(
ax1.tick_params(axis="y", labelsize=fontsize)
# Set title
- ax1.set_title(
- f"Enrichr results from database {database}", fontsize=fontsize + 2
- )
+ ax1.set_title(f"Enrichr results from database {database}", fontsize=fontsize + 2)
# Set axis margins
ax1.margins(y=0, x=0)
@@ -567,7 +533,7 @@ def enrichr(
# Generate KEGG pathway image
if kegg_out:
candidate_rank = df[df["rank"] == kegg_rank].iloc[0, :]
- kegg_img = pykegg.visualize(
+ pykegg.visualize(
candidate_rank["path_name"],
candidate_rank["overlapping_genes"],
db=database,
diff --git a/gget/gget_gpt.py b/gget/gget_gpt.py
index 463ae721c..e22a7622a 100644
--- a/gget/gget_gpt.py
+++ b/gget/gget_gpt.py
@@ -17,10 +17,10 @@ def gpt(
out=None,
verbose=True,
):
- """
- Generates natural language text based on a given prompt using the OpenAI API's 'openai.ChatCompletion.create' endpoint.
+ """Generates natural language text based on a given prompt using the OpenAI API's 'openai.ChatCompletion.create' endpoint.
- Parameters:
+ Parameters
+ ----------
- prompt (str): The input prompt to generate text from.
- api_key (str): Your OpenAI API key (see: https://platform.openai.com/account/api-keys).
- model (str): The name of the GPT model to use for generating the text. Default is "gpt-3.5-turbo".
@@ -40,7 +40,8 @@ def gpt(
- out (str) If provided, saves the generated text to a file with the specified path. Default is None.
- verbose True/False whether to print progress information. Default True.
- Returns:
+ Returns
+ -------
- A string containing the generated text.
NOTE: OpenAI API calls are only 'free' for the first three months after generating your OpenAI Account
@@ -49,6 +50,7 @@ def gpt(
See their pricing and FAQ here: https://openai.com/pricing
This module, including its source code, documentation and unittests, were partly written by OpenAI's Chat-GTP3.
+
"""
# Check if cellxgene_census is installed
try:
@@ -56,7 +58,7 @@ def gpt(
except ImportError:
logger.error(
"""
- Some third-party dependencies are missing. Please run the following command:
+ Some third-party dependencies are missing. Please run the following command:
>>> gget.setup('gpt') or $ gget setup gpt
Alternative: Install the openai package using pip (https://pypi.org/project/openai).
@@ -100,9 +102,7 @@ def gpt(
)
if verbose:
- logger.info(
- f"Total tokens used for API call to model '{model}': {response['usage']['total_tokens']}"
- )
+ logger.info(f"Total tokens used for API call to model '{model}': {response['usage']['total_tokens']}")
texts = response["choices"][0]["message"]["content"]
diff --git a/gget/gget_info.py b/gget/gget_info.py
index d7497aa90..ddf69b8f3 100644
--- a/gget/gget_info.py
+++ b/gget/gget_info.py
@@ -1,27 +1,27 @@
+import json as json_package
+
import numpy as np
import pandas as pd
-import json as json_package
import requests
from bs4 import BeautifulSoup
# Custom functions
from .utils import (
- rest_query,
- get_uniprot_info,
- wrap_cols_func,
get_pdb_ids,
- set_up_logger,
+ get_uniprot_info,
post_query,
+ set_up_logger,
+ wrap_cols_func,
)
logger = set_up_logger()
# Constants
-from .constants import (
+from .constants import ( # noqa: E402
+ DEFAULT_REQUESTS_TIMEOUT,
ENSEMBL_REST_API,
- UNIPROT_REST_API,
NCBI_URL,
- DEFAULT_REQUESTS_TIMEOUT,
+ UNIPROT_REST_API,
)
@@ -38,8 +38,7 @@ def info(
expand=False,
ensembl_only=False,
):
- """
- Fetch gene and transcript metadata using Ensembl IDs.
+ """Fetch gene and transcript metadata using Ensembl IDs.
Args:
- ens_ids One or more Ensembl IDs to look up (string or list of strings).
@@ -66,9 +65,7 @@ def info(
)
if ensembl_only:
if verbose:
- logger.warning(
- "'ensembl_only' argument deprecated! Please use arguments 'ncbi=False' and 'uniprot=False'."
- )
+ logger.warning("'ensembl_only' argument deprecated! Please use arguments 'ncbi=False' and 'uniprot=False'.")
# Set synonyms found by each database initially to none
ncbi_synonyms = None
@@ -84,11 +81,10 @@ def info(
# Define Ensembl REST API server
server = ENSEMBL_REST_API
# Define type of returned content from REST
- content_type = "application/json"
## Clean up Ensembl IDs
# If single Ensembl ID passed as string, convert to list
- if type(ens_ids) == str:
+ if isinstance(ens_ids, str):
ens_ids = [ens_ids]
# Remove Ensembl ID version if passed
ens_ids_clean = []
@@ -124,7 +120,7 @@ def info(
results_dict = post_query(server, endpoint, query)
results_dict = {k: v for k, v in results_dict.items() if v is not None}
- for ensembl_ID, df_temp in results_dict.items():
+ for ensembl_ID, df_temp in results_dict.items(): # noqa: B007
try:
# Add Ensembl ID with latest version number to df_temp
df_temp["ensembl_id"] = str(df_temp["id"]) + "." + str(df_temp["version"])
@@ -133,11 +129,7 @@ def info(
df_temp["ensembl_id"] = str(df_temp["id"])
# second pass for ids that were not found in the initial query
- ens_ids_clean_tmp = [
- ensembl_ID
- for ensembl_ID in ens_ids_clean
- if ensembl_ID not in results_dict.keys()
- ]
+ ens_ids_clean_tmp = [ensembl_ID for ensembl_ID in ens_ids_clean if ensembl_ID not in results_dict.keys()]
if len(ens_ids_clean_tmp) > 0:
# print(f"Second pass for ids: {ens_ids_clean_tmp}")
@@ -146,12 +138,10 @@ def info(
results_dict_new = post_query(server, endpoint, query)
results_dict_new = {k: v for k, v in results_dict_new.items() if v is not None}
- for ensembl_ID, df_temp in results_dict_new.items():
+ for ensembl_ID, df_temp in results_dict_new.items(): # noqa: B007
try:
# Add Ensembl ID with latest version number to df_temp
- df_temp["ensembl_id"] = (
- str(df_temp["id"]) + "." + str(df_temp["version"])
- )
+ df_temp["ensembl_id"] = str(df_temp["id"]) + "." + str(df_temp["version"])
except KeyError:
# Just add Ensembl ID if no version found
df_temp["ensembl_id"] = str(df_temp["id"])
@@ -164,14 +154,10 @@ def info(
ens_ids_clean_2.append(ensembl_ID)
else:
if verbose:
- logger.warning(
- f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments and try again."
- )
+ logger.warning(f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments and try again.")
# rewrite results to be in the input order
- results_dict = {
- ensembl_ID: results_dict[ensembl_ID] for ensembl_ID in ens_ids_clean_2
- }
+ results_dict = {ensembl_ID: results_dict[ensembl_ID] for ensembl_ID in ens_ids_clean_2}
master_dict.update(results_dict)
@@ -199,15 +185,11 @@ def info(
if fetch_uniprot is True:
try:
# Get gene names and descriptions from UniProt
- df_uniprot = get_uniprot_info(
- UNIPROT_REST_API, ens_id, verbose=verbose
- )
+ df_uniprot = get_uniprot_info(UNIPROT_REST_API, ens_id, verbose=verbose)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
if verbose:
- logger.warning(
- f"UniProt server request for ID '{ens_id}' returned the following error:\n{e}"
- )
+ logger.warning(f"UniProt server request for ID '{ens_id}' returned the following error:\n{e}")
continue
if not isinstance(df_uniprot, type(None)):
@@ -224,9 +206,7 @@ def info(
# Get uniprot synonyms and remove NaN values
uni_synonyms = df_uniprot["uni_synonyms"].values[0]
- uni_synonyms = [
- item for item in uni_synonyms if not (pd.isnull(item)) == True
- ]
+ uni_synonyms = [item for item in uni_synonyms if not (pd.isnull(item))]
# Transpose UniProt data frame and add Ensembl ID as column name
df_uniprot = df_uniprot.T
@@ -254,12 +234,9 @@ def info(
# Check for error message in NCBI return
if (
soup.find("li", class_="error icon") is not None
- and "An error has occured"
- in soup.find("li", class_="error icon").text.strip()
+ and "An error has occured" in soup.find("li", class_="error icon").text.strip()
):
- error_message = soup.find(
- "li", class_="error icon"
- ).text.strip()
+ error_message = soup.find("li", class_="error icon").text.strip()
logger.error(
f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{error_message}"
@@ -272,9 +249,7 @@ def info(
# Check if NCBI gene ID is available
try:
- ncbi_gene_id = soup.find("input", {"id": "gene-id-value"}).get(
- "value"
- )
+ ncbi_gene_id = soup.find("input", {"id": "gene-id-value"}).get("value")
except AttributeError:
ncbi_gene_id = np.nan
@@ -302,7 +277,7 @@ def info(
except AttributeError:
ncbi_synonyms = None
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.error(
f"The NCBI server request for Ensembl ID '{ens_id}' returned the following error:\n{e}"
)
@@ -329,7 +304,7 @@ def info(
try:
pdb_ids = get_pdb_ids(ens_id)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
if verbose:
logger.warning(
f"The PDBe server request for Ensembl ID '{ens_id}' returned the following error:\n{e}"
@@ -352,7 +327,7 @@ def info(
if ncbi_synonyms is not None and not isinstance(df_uniprot, type(None)):
synonyms = list(set().union(uni_synonyms, ncbi_synonyms))
# Remove nan values
- synonyms = [item for item in synonyms if not (pd.isnull(item)) == True]
+ synonyms = [item for item in synonyms if not (pd.isnull(item))]
# Add only UniProt synonyms if NCBI syns not available
elif ncbi_synonyms is None and not isinstance(df_uniprot, type(None)):
@@ -370,7 +345,7 @@ def info(
# Sort synonyms alphabetically (if sortable)
try:
synonyms = sorted(synonyms)
- except:
+ except Exception: # noqa: BLE001
pass
# Append dataframes with data from NCBI, UniProt and PDB from ens_id to df_temp
@@ -448,34 +423,32 @@ def info(
try:
try:
# Add Transcript ID with latest version if available
- versioned_trans_id = (
- str(trans_dict["id"]) + "." + str(trans_dict["version"])
- )
+ versioned_trans_id = str(trans_dict["id"]) + "." + str(trans_dict["version"])
all_transcripts.append(versioned_trans_id)
except KeyError:
# Just add ID if no version found
all_transcripts.append(trans_dict["id"])
- except:
+ except Exception: # noqa: BLE001
all_transcripts.append(np.nan)
try:
transcript_names.append(trans_dict["display_name"])
- except:
+ except Exception: # noqa: BLE001
transcript_names.append(np.nan)
try:
transcript_biotypes.append(trans_dict["biotype"])
- except:
+ except Exception: # noqa: BLE001
transcript_biotypes.append(np.nan)
try:
transcript_starts.append(trans_dict["start"])
- except:
+ except Exception: # noqa: BLE001
transcript_starts.append(np.nan)
try:
transcript_ends.append(trans_dict["end"])
- except:
+ except Exception: # noqa: BLE001
transcript_ends.append(np.nan)
try:
transcript_strands.append(trans_dict["strand"])
- except:
+ except Exception: # noqa: BLE001
transcript_strands.append(np.nan)
data["all_transcripts"].append(all_transcripts)
@@ -485,7 +458,7 @@ def info(
data["transcript_starts"].append(transcript_starts)
data["transcript_ends"].append(transcript_ends)
- except:
+ except Exception: # noqa: BLE001
data["all_transcripts"].append(np.nan)
data["transcript_biotypes"].append(np.nan)
data["transcript_names"].append(np.nan)
@@ -502,29 +475,27 @@ def info(
try:
try:
# Add ID with latest version if available
- versioned_id = (
- str(exon_dict["id"]) + "." + str(exon_dict["version"])
- )
+ versioned_id = str(exon_dict["id"]) + "." + str(exon_dict["version"])
all_exons.append(versioned_id)
except KeyError:
# Just add ID if no version found
all_exons.append(exon_dict["id"])
- except:
+ except Exception: # noqa: BLE001
all_exons.append(np.nan)
try:
exon_starts.append(exon_dict["start"])
- except:
+ except Exception: # noqa: BLE001
exon_starts.append(np.nan)
try:
exon_ends.append(exon_dict["end"])
- except:
+ except Exception: # noqa: BLE001
exon_ends.append(np.nan)
data["all_exons"].append(all_exons)
data["exon_starts"].append(exon_starts)
data["exon_ends"].append(exon_ends)
- except:
+ except Exception: # noqa: BLE001
data["all_exons"].append(np.nan)
data["exon_starts"].append(np.nan)
data["exon_ends"].append(np.nan)
@@ -538,37 +509,33 @@ def info(
try:
try:
# Add ID with latest version if available
- versioned_id = (
- str(transl_dict["id"]) + "." + str(transl_dict["version"])
- )
+ versioned_id = str(transl_dict["id"]) + "." + str(transl_dict["version"])
all_translations.append(versioned_id)
except KeyError:
# Just add ID if no version found
all_translations.append(transl_dict["id"])
- except:
+ except Exception: # noqa: BLE001
all_translations.append(np.nan)
try:
translation_starts.append(transl_dict["start"])
- except:
+ except Exception: # noqa: BLE001
translation_starts.append(np.nan)
try:
translation_ends.append(transl_dict["end"])
- except:
+ except Exception: # noqa: BLE001
translation_ends.append(np.nan)
data["all_translations"].append(all_translations)
data["translation_starts"].append(translation_starts)
data["translation_ends"].append(translation_ends)
- except:
+ except Exception: # noqa: BLE001
data["all_translations"].append(np.nan)
data["translation_starts"].append(np.nan)
data["translation_ends"].append(np.nan)
# Append cleaned up info to df_final
- df_final = pd.concat(
- [df_final, pd.DataFrame.from_dict(data, orient="index", columns=ens_ids)]
- )
+ df_final = pd.concat([df_final, pd.DataFrame.from_dict(data, orient="index", columns=ens_ids)])
## Transpose data frame so each row corresponds to one Ensembl ID
df_final = df_final.T
@@ -617,6 +584,7 @@ def info(
transcript_strands or [],
transcript_starts or [],
transcript_ends or [],
+ strict=False,
):
results_dict[ens_id]["all_transcripts"].append(
{
@@ -641,9 +609,7 @@ def info(
# Build new dictionary entries
results_dict[ens_id].update({"all_exons": []})
- for exon_id, exon_start, exon_end in zip(
- exon_ids or [], exon_starts or [], exon_ends or []
- ):
+ for exon_id, exon_start, exon_end in zip(exon_ids or [], exon_starts or [], exon_ends or [], strict=False):
results_dict[ens_id]["all_exons"].append(
{"exon_id": exon_id, "exon_start": exon_start, "exon_end": exon_end}
)
@@ -661,7 +627,7 @@ def info(
# Build new dictionary entries
results_dict[ens_id].update({"all_translations": []})
for translation_id, translation_start, translation_end in zip(
- translation_ids or [], translation_starts or [], translation_ends or []
+ translation_ids or [], translation_starts or [], translation_ends or [], strict=False
):
results_dict[ens_id]["all_translations"].append(
{
diff --git a/gget/gget_muscle.py b/gget/gget_muscle.py
index 26343376a..bd1f03a6f 100644
--- a/gget/gget_muscle.py
+++ b/gget/gget_muscle.py
@@ -1,31 +1,26 @@
+import itertools
import os
import platform
import subprocess
-import itertools
import sys
import time
import uuid
# Custom functions
-from .compile import compile_muscle, MUSCLE_PATH, PACKAGE_PATH
-from .utils import aa_colors, n_colors, create_tmp_fasta, set_up_logger
+from .compile import MUSCLE_PATH, PACKAGE_PATH, compile_muscle
+from .utils import aa_colors, create_tmp_fasta, n_colors, set_up_logger
logger = set_up_logger()
# Path to precompiled muscle binary
if platform.system() == "Windows":
- PRECOMPILED_MUSCLE_PATH = os.path.join(
- PACKAGE_PATH, f"bins/{platform.system()}/muscle.win64.exe"
- )
+ PRECOMPILED_MUSCLE_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/muscle.win64.exe")
else:
- PRECOMPILED_MUSCLE_PATH = os.path.join(
- PACKAGE_PATH, f"bins/{platform.system()}/muscle"
- )
+ PRECOMPILED_MUSCLE_PATH = os.path.join(PACKAGE_PATH, f"bins/{platform.system()}/muscle")
def muscle(fasta, super5=False, out=None, verbose=True):
- """
- Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm).
+ """Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm).
Args:
- fasta List of sequences or path to fasta file containing the sequences to be aligned.
@@ -63,7 +58,7 @@ def muscle(fasta, super5=False, out=None, verbose=True):
abs_out_path = os.path.abspath(out)
# Compile muscle if it is not already compiled
- if os.path.isfile(PRECOMPILED_MUSCLE_PATH) == False:
+ if not os.path.isfile(PRECOMPILED_MUSCLE_PATH):
# Compile muscle
compile_muscle()
muscle_path = MUSCLE_PATH
@@ -115,15 +110,14 @@ def muscle(fasta, super5=False, out=None, verbose=True):
return
else:
if verbose:
- logger.info(
- f"MUSCLE alignment complete. Alignment time: {round(time.time() - start_time, 2)} seconds"
- )
+ logger.info(f"MUSCLE alignment complete. Alignment time: {round(time.time() - start_time, 2)} seconds")
if out is None:
## Print cleaned up muscle output
# Get the titles and sequences from the generated .afa file
titles = []
seqs_master = []
+ seqs = []
with open(abs_out_path) as aln_file:
for i, line in enumerate(aln_file):
# Recognize title lines by the '>' character
diff --git a/gget/gget_mutate.py b/gget/gget_mutate.py
index ff68f4d1c..353b146eb 100644
--- a/gget/gget_mutate.py
+++ b/gget/gget_mutate.py
@@ -1,13 +1,13 @@
-import pandas as pd
+import os
import re
-from tqdm import tqdm
+
import numpy as np
-import os
-from typing import Union, List, Optional
+import pandas as pd
+from tqdm import tqdm
tqdm.pandas()
-from .utils import read_fasta, set_up_logger
+from .utils import read_fasta, set_up_logger # noqa: E402
logger = set_up_logger()
@@ -20,7 +20,9 @@
cosmic_incorrect_wt_base = 0
mut_idx_outside_seq = 0
-mutation_pattern = r"(?:c|g)\.([0-9_\-\+\*]+)([a-zA-Z>]+)" # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)'
+mutation_pattern = (
+ r"(?:c|g)\.([0-9_\-\+\*]+)([a-zA-Z>]+)" # more complex: r'c\.([0-9_\-\+\*\(\)\?]+)([a-zA-Z>\(\)0-9]+)'
+)
# Get complement
complement = {
@@ -110,6 +112,7 @@
def convert_chromosome_value_to_int_when_possible(val):
+ """Convert a chromosome value to an integer string when possible, otherwise return it as a string."""
try:
# Try to convert the value to a float, then to an int, and finally to a string
return str(int(float(val)))
@@ -118,9 +121,8 @@ def convert_chromosome_value_to_int_when_possible(val):
return str(val)
-def merge_gtf_transcript_locations_into_cosmic_csv(
- mutations, gtf_path, gtf_transcript_id_column
-):
+def merge_gtf_transcript_locations_into_cosmic_csv(mutations, gtf_path, gtf_transcript_id_column):
+ """Merge transcript start/end positions and strand from a GTF file into the mutations DataFrame."""
gtf_df = pd.read_csv(
gtf_path,
sep="\t",
@@ -146,9 +148,7 @@ def merge_gtf_transcript_locations_into_cosmic_csv(
gtf_df["transcript_id"] = gtf_df["attribute"].str.extract('transcript_id "([^"]+)"')
- assert len(gtf_df["transcript_id"]) == len(
- set(gtf_df["transcript_id"])
- ), "Duplicate transcript_id values found!"
+ assert len(gtf_df["transcript_id"]) == len(set(gtf_df["transcript_id"])), "Duplicate transcript_id values found!"
# Filter out rows where transcript_id is NaN
gtf_df = gtf_df.dropna(subset=["transcript_id"])
@@ -164,22 +164,20 @@ def merge_gtf_transcript_locations_into_cosmic_csv(
merged_df = pd.merge(mutations, gtf_df, on=gtf_transcript_id_column, how="left")
# Fill NaN values
- merged_df["start_transcript_position"] = merged_df[
- "start_transcript_position"
- ].fillna(0)
- merged_df["end_transcript_position"] = merged_df["end_transcript_position"].fillna(
- 9999999
- )
+ merged_df["start_transcript_position"] = merged_df["start_transcript_position"].fillna(0)
+ merged_df["end_transcript_position"] = merged_df["end_transcript_position"].fillna(9999999)
merged_df["strand"] = merged_df["strand"].fillna(".")
return merged_df
def get_sequence_length(seq_id, seq_dict):
+ """Return the length of the sequence stored under seq_id in seq_dict, or 0 if seq_id is not present."""
return len(seq_dict.get(seq_id, ""))
def get_nucleotide_at_position(seq_id, pos, seq_dict):
+ """Return the nucleotide at the given position in the sequence for seq_id, or None if out of range."""
full_seq = seq_dict.get(seq_id, "")
if pos < len(full_seq):
return full_seq[pos]
@@ -187,12 +185,11 @@ def get_nucleotide_at_position(seq_id, pos, seq_dict):
def translate_sequence(sequence, start, end):
+ """Translate a nucleotide sequence into amino acids from start to end, using 'X' for unknown or incomplete codons."""
amino_acid_sequence = ""
for i in range(start, end, 3):
codon = sequence[i : i + 3].upper()
- amino_acid = codon_to_amino_acid.get(
- codon, "X"
- ) # Use 'X' for unknown or incomplete codons
+ amino_acid = codon_to_amino_acid.get(codon, "X") # Use 'X' for unknown or incomplete codons
amino_acid_sequence += amino_acid
return amino_acid_sequence
@@ -203,15 +200,15 @@ def translate_sequence(sequence, start, end):
def remove_gt_after_semicolon(line):
+ """Remove leading '>' characters from each semicolon-separated part except the first."""
parts = line.split(";")
# Remove '>' from the beginning of each part except the first part
parts = [parts[0]] + [part.lstrip(">") for part in parts[1:]]
return ";".join(parts)
-def wt_fragment_and_mutant_fragment_share_kmer(
- mutated_fragment: str, wildtype_fragment: str, k: int
-) -> bool:
+def wt_fragment_and_mutant_fragment_share_kmer(mutated_fragment: str, wildtype_fragment: str, k: int) -> bool:
+ """Return True if the mutated fragment shares any k-mer with the wildtype fragment."""
if len(mutated_fragment) <= k:
if mutated_fragment in wildtype_fragment:
return True
@@ -227,9 +224,8 @@ def wt_fragment_and_mutant_fragment_share_kmer(
def add_mutation_type(mutations, mut_column):
- mutations["mutation_type_id"] = mutations[mut_column].str.extract(mutation_pattern)[
- 1
- ]
+ """Add a 'mutation_type' column to the mutations DataFrame based on the mutation notation."""
+ mutations["mutation_type_id"] = mutations[mut_column].str.extract(mutation_pattern)[1]
# Define conditions and choices for the mutation types
conditions = [
@@ -262,15 +258,15 @@ def add_mutation_type(mutations, mut_column):
def extract_sequence(row, seq_dict, seq_id_column="seq_ID"):
+ """Extract the subsequence spanning the mutation positions for a row, or None if positions are missing."""
if pd.isna(row["start_mutation_position"]) or pd.isna(row["end_mutation_position"]):
return None
- seq = seq_dict[row[seq_id_column]][
- int(row["start_mutation_position"]) : int(row["end_mutation_position"]) + 1
- ]
+ seq = seq_dict[row[seq_id_column]][int(row["start_mutation_position"]) : int(row["end_mutation_position"]) + 1]
return seq
def common_prefix_length(s1, s2):
+ """Return the length of the common prefix shared by s1 and s2."""
min_len = min(len(s1), len(s2))
for i in range(min_len):
if s1[i] != s2[i]:
@@ -280,6 +276,7 @@ def common_prefix_length(s1, s2):
# Function to find the length of the common suffix with the prefix
def common_suffix_length(s1, s2):
+ """Return the length of the common suffix shared by s1 and s2."""
min_len = min(len(s1), len(s2))
for i in range(min_len):
if s1[-(i + 1)] != s2[-(i + 1)]:
@@ -288,6 +285,7 @@ def common_suffix_length(s1, s2):
def count_repeat_right_flank(mut_nucleotides, right_flank_region):
+ """Count the total overlap length of repeated mut_nucleotides at the start of the right flank region."""
total_overlap_len = 0
while right_flank_region.startswith(mut_nucleotides):
total_overlap_len += len(mut_nucleotides)
@@ -297,6 +295,7 @@ def count_repeat_right_flank(mut_nucleotides, right_flank_region):
def count_repeat_left_flank(mut_nucleotides, left_flank_region):
+ """Count the total overlap length of repeated mut_nucleotides at the end of the left flank region."""
total_overlap_len = 0
while left_flank_region.endswith(mut_nucleotides):
total_overlap_len += len(mut_nucleotides)
@@ -306,6 +305,7 @@ def count_repeat_left_flank(mut_nucleotides, left_flank_region):
def beginning_mut_nucleotides_with_right_flank(mut_nucleotides, right_flank_region):
+ """Return the overlap length between mut_nucleotides and the beginning of the right flank region."""
if mut_nucleotides == right_flank_region[: len(mut_nucleotides)]:
return count_repeat_right_flank(mut_nucleotides, right_flank_region)
else:
@@ -314,6 +314,7 @@ def beginning_mut_nucleotides_with_right_flank(mut_nucleotides, right_flank_regi
# Comparing end of mut_nucleotides to the end of left_flank_region
def end_mut_nucleotides_with_left_flank(mut_nucleotides, left_flank_region):
+ """Return the overlap length between mut_nucleotides and the end of the left flank region."""
if mut_nucleotides == left_flank_region[-len(mut_nucleotides) :]:
return count_repeat_left_flank(mut_nucleotides, left_flank_region)
else:
@@ -321,6 +322,7 @@ def end_mut_nucleotides_with_left_flank(mut_nucleotides, left_flank_region):
def calculate_beginning_mutation_overlap_with_right_flank(row):
+ """Calculate the overlap between the beginning of a row's mutation and its right flank region."""
if row["mutation_type"] == "deletion":
sequence_to_check = row["wt_nucleotides_ensembl"]
else:
@@ -331,12 +333,11 @@ def calculate_beginning_mutation_overlap_with_right_flank(row):
else:
original_sequence = row["right_flank_region"]
- return beginning_mut_nucleotides_with_right_flank(
- sequence_to_check, original_sequence
- )
+ return beginning_mut_nucleotides_with_right_flank(sequence_to_check, original_sequence)
def calculate_end_mutation_overlap_with_left_flank(row):
+ """Calculate the overlap between the end of a row's mutation and its left flank region."""
if row["mutation_type"] == "deletion":
sequence_to_check = row["wt_nucleotides_ensembl"]
else:
@@ -351,30 +352,30 @@ def calculate_end_mutation_overlap_with_left_flank(row):
def mutate(
- sequences: Union[str, List[str]],
- mutations: Union[str, List[str]],
+ sequences: str | list[str],
+ mutations: str | list[str],
mut_column: str = "mutation",
seq_id_column: str = "seq_ID",
- mut_id_column: Optional[str] = None,
- gtf: Optional[str] = None,
- gtf_transcript_id_column: Optional[str] = None,
+ mut_id_column: str | None = None,
+ gtf: str | None = None,
+ gtf_transcript_id_column: str | None = None,
k: int = 30,
- min_seq_len: Optional[int] = None,
+ min_seq_len: int | None = None,
optimize_flanking_regions: bool = False,
remove_seqs_with_wt_kmers: bool = False,
- max_ambiguous: Optional[int] = None,
+ max_ambiguous: int | None = None,
merge_identical: bool = True,
update_df: bool = False,
- update_df_out: Optional[str] = None,
+ update_df_out: str | None = None,
store_full_sequences: bool = False,
translate: bool = False,
- translate_start: Union[int, str, None] = None,
- translate_end: Union[int, str, None] = None,
- out: Optional[str] = None,
+ translate_start: int | str | None = None,
+ translate_end: int | str | None = None,
+ out: str | None = None,
verbose: bool = True,
):
- """
- Takes in nucleotide sequences and mutations (in standard mutation annotation - see below)
+ """Takes in nucleotide sequences and mutations (in standard mutation annotation - see below)
+
and returns mutated versions of the input sequences according to the provided mutations.
Reuiqred input argument:
@@ -461,8 +462,14 @@ def mutate(
Saves mutated sequences in fasta format (or, if out=None: when update_df is True, returns the mutation dataframe, otherwise returns a list containing the mutated sequences).
"""
-
- global intronic_mutations, posttranslational_region_mutations, unknown_mutations, uncertain_mutations, ambiguous_position_mutations, cosmic_incorrect_wt_base, mut_idx_outside_seq
+ global \
+ intronic_mutations, \
+ posttranslational_region_mutations, \
+ unknown_mutations, \
+ uncertain_mutations, \
+ ambiguous_position_mutations, \
+ cosmic_incorrect_wt_base, \
+ mut_idx_outside_seq
columns_to_keep = [
"header",
@@ -472,7 +479,7 @@ def mutate(
"wt_sequence",
"mutant_sequence",
"start_mutation_position",
- "end_mutation_position"
+ "end_mutation_position",
]
# Load input sequences and their identifiers from fasta file
@@ -481,7 +488,7 @@ def mutate(
# Handle input sequences passed as a list
elif isinstance(sequences, list):
- titles = [f"seq{i+1}" for i in range(len(sequences))]
+ titles = [f"seq{i + 1}" for i in range(len(sequences))]
seqs = sequences
# Handle a single sequence passed as a string
@@ -492,7 +499,7 @@ def mutate(
else:
raise ValueError(
"""
- Format of the input to the 'sequences' argument not recognized.
+ Format of the input to the 'sequences' argument not recognized.
'sequences' must be one of the following:
- Path to the fasta file containing the sequences to be mutated (e.g. 'seqs.fa')
- A list of sequences to be mutated (e.g. ['ACTGCTAGCT', 'AGCTAGCT'])
@@ -508,18 +515,14 @@ def mutate(
mutations = pd.read_csv(mutations)
for col in mutations.columns:
if col not in columns_to_keep:
- columns_to_keep.append(
- col
- ) # append "mutation_aa", "gene_name", "mutation_id"
+ columns_to_keep.append(col) # append "mutation_aa", "gene_name", "mutation_id"
elif isinstance(mutations, str) and mutations.endswith(".tsv"):
mutations_path = mutations
mutations = pd.read_csv(mutations, sep="\t")
for col in mutations.columns:
if col not in columns_to_keep:
- columns_to_keep.append(
- col
- ) # append "mutation_aa", "gene_name", "mutation_id"
+ columns_to_keep.append(col) # append "mutation_aa", "gene_name", "mutation_id"
# Handle mutations passed as a list
elif isinstance(mutations, list):
@@ -531,14 +534,14 @@ def mutate(
temp = pd.DataFrame()
temp["mutation"] = mutations
- temp["mut_ID"] = [f"mut{i+1}" for i in range(len(mutations))]
- temp["seq_ID"] = [f"seq{i+1}" for i in range(len(mutations))]
+ temp["mut_ID"] = [f"mut{i + 1}" for i in range(len(mutations))]
+ temp["seq_ID"] = [f"seq{i + 1}" for i in range(len(mutations))]
mutations = temp
else:
temp = pd.DataFrame()
temp["mutation"] = [mutations[0]] * len(seqs)
- temp["mut_ID"] = [f"mut{i+1}" for i in range(len(seqs))]
- temp["seq_ID"] = [f"seq{i+1}" for i in range(len(seqs))]
+ temp["mut_ID"] = [f"mut{i + 1}" for i in range(len(seqs))]
+ temp["seq_ID"] = [f"seq{i + 1}" for i in range(len(seqs))]
mutations = temp
# Handle single mutation passed as a string
@@ -546,8 +549,8 @@ def mutate(
# This will work for one mutation for one sequence as well as one mutation for multiple sequences
temp = pd.DataFrame()
temp["mutation"] = [mutations] * len(seqs)
- temp["mut_ID"] = [f"mut{i+1}" for i in range(len(seqs))]
- temp["seq_ID"] = [f"seq{i+1}" for i in range(len(seqs))]
+ temp["mut_ID"] = [f"mut{i + 1}" for i in range(len(seqs))]
+ temp["seq_ID"] = [f"seq{i + 1}" for i in range(len(seqs))]
mutations = temp
elif isinstance(mutations, pd.DataFrame):
@@ -556,7 +559,7 @@ def mutate(
else:
raise ValueError(
"""
- Format of the input to the 'mutations' argument not recognized.
+ Format of the input to the 'mutations' argument not recognized.
'mutations' must be one of the following:
- Path to comma-separated csv file (e.g. 'mutations.csv')
- A pandas DataFrame object
@@ -570,7 +573,7 @@ def mutate(
seq_dict = {}
non_nuc_seqs = 0
- for title, seq in zip(titles, seqs):
+ for title, seq in zip(titles, seqs, strict=False):
# Check that sequences are nucleotide sequences
if not set(seq) <= nucleotides:
non_nuc_seqs += 1
@@ -583,7 +586,7 @@ def mutate(
logger.warning(
f"""
Non-nucleotide characters detected in {non_nuc_seqs} input sequences. gget mutate is currently only optimized for mutating nucleotide sequences.
- Specifically inversion mutations might not be performed correctly.
+ Specifically inversion mutations might not be performed correctly.
"""
)
@@ -600,9 +603,7 @@ def mutate(
mutations = mutations.dropna(subset=[seq_id_column])
# ensure seq_ID column is string type, and chromosome numbers don't have decimals
- mutations[seq_id_column] = mutations[seq_id_column].apply(
- convert_chromosome_value_to_int_when_possible
- )
+ mutations[seq_id_column] = mutations[seq_id_column].apply(convert_chromosome_value_to_int_when_possible)
mutations = add_mutation_type(mutations, mut_column)
@@ -615,16 +616,16 @@ def mutate(
if 0 < len(seqs_not_found) < 20:
logger.warning(
f"""
- The sequences with the following {len(seqs_not_found)} sequence ID(s) were not found: {", ".join(seqs_not_found[seq_id_column].values)}
- These sequences and their corresponding mutations will not be included in the output.
+ The sequences with the following {len(seqs_not_found)} sequence ID(s) were not found: {", ".join(seqs_not_found[seq_id_column].values)}
+ These sequences and their corresponding mutations will not be included in the output.
Ensure that the sequence IDs correspond to the string following the > character in the 'sequences' fasta file (do NOT include spaces or dots).
"""
)
elif len(seqs_not_found) > 0:
logger.warning(
f"""
- The sequences corresponding to {len(seqs_not_found)} sequence IDs were not found.
- These sequences and their corresponding mutations will not be included in the output.
+ The sequences corresponding to {len(seqs_not_found)} sequence IDs were not found.
+ These sequences and their corresponding mutations will not be included in the output.
Ensure that the sequence IDs correspond to the string following the > character in the 'sequences' fasta file (do NOT include spaces or dots).
"""
)
@@ -634,7 +635,7 @@ def mutate(
if len(mutations) < 1:
raise ValueError(
"""
- None of the input sequences match the sequence IDs provided in 'mutations'.
+ None of the input sequences match the sequence IDs provided in 'mutations'.
Ensure that the sequence IDs correspond to the string following the > character in the 'sequences' fasta file (do NOT include spaces or dots).
"""
)
@@ -645,9 +646,7 @@ def mutate(
mut_id_column = mut_column
mutations["mutant_sequence"] = ""
- mutations["header"] = (
- ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column]
- )
+ mutations["header"] = ">" + mutations[seq_id_column] + ":" + mutations[mut_id_column]
# Calculate number of bad mutations
uncertain_mutations = mutations[mut_column].str.contains(r"\?").sum()
@@ -664,9 +663,7 @@ def mutate(
mutations = mutations[~mask]
# Extract nucleotide positions and mutation info from Mutation CDS
- mutations[["nucleotide_positions", "actual_mutation"]] = mutations[
- mut_column
- ].str.extract(mutation_pattern)
+ mutations[["nucleotide_positions", "actual_mutation"]] = mutations[mut_column].str.extract(mutation_pattern)
# Filter out mutations that did not match the re
unknown_mutations = mutations["nucleotide_positions"].isna().sum()
@@ -681,15 +678,13 @@ def mutate(
mutations["start_mutation_position"] = split_positions[0]
if split_positions.shape[1] > 1:
- mutations["end_mutation_position"] = split_positions[1].fillna(
- split_positions[0]
- )
+ mutations["end_mutation_position"] = split_positions[1].fillna(split_positions[0])
else:
mutations["end_mutation_position"] = mutations["start_mutation_position"]
- mutations.loc[
- mutations["end_mutation_position"].isna(), "end_mutation_position"
- ] = mutations["start_mutation_position"]
+ mutations.loc[mutations["end_mutation_position"].isna(), "end_mutation_position"] = mutations[
+ "start_mutation_position"
+ ]
mutations[["start_mutation_position", "end_mutation_position"]] = mutations[
["start_mutation_position", "end_mutation_position"]
@@ -700,14 +695,12 @@ def mutate(
mutations["end_mutation_position"] -= 1 # don't forget to increment by 1 later
# Calculate sequence length
- mutations["sequence_length"] = mutations[seq_id_column].apply(
- lambda x: get_sequence_length(x, seq_dict)
- )
+ mutations["sequence_length"] = mutations[seq_id_column].apply(lambda x: get_sequence_length(x, seq_dict))
# Filter out mutations with positions outside the sequence
- index_error_mask = (
- mutations["start_mutation_position"] > mutations["sequence_length"]
- ) | (mutations["end_mutation_position"] > mutations["sequence_length"])
+ index_error_mask = (mutations["start_mutation_position"] > mutations["sequence_length"]) | (
+ mutations["end_mutation_position"] > mutations["sequence_length"]
+ )
mut_idx_outside_seq = index_error_mask.sum()
@@ -728,33 +721,15 @@ def mutate(
if remove_seqs_with_wt_kmers:
long_duplications = (
- (duplication_mask)
- & (
- (
- mutations["end_mutation_position"]
- - mutations["start_mutation_position"]
- )
- >= k
- )
+ (duplication_mask) & ((mutations["end_mutation_position"] - mutations["start_mutation_position"]) >= k)
).sum()
logger.info(f"Removing {long_duplications} duplications > k")
mutations = mutations[
- ~(
- (duplication_mask)
- & (
- (
- mutations["end_mutation_position"]
- - mutations["start_mutation_position"]
- )
- >= k
- )
- )
+ ~((duplication_mask) & ((mutations["end_mutation_position"] - mutations["start_mutation_position"]) >= k))
]
# Create a mask for all non-substitution mutations
- non_substitution_mask = (
- deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask
- )
+ non_substitution_mask = deletion_mask | delins_mask | insertion_mask | duplication_mask | inversion_mask
# Extract the WT nucleotides for the substitution rows from reference fasta (i.e., Ensembl)
start_positions = mutations.loc[substitution_mask, "start_mutation_position"].values
@@ -763,27 +738,19 @@ def mutate(
wt_nucleotides_substitution = np.array(
[
get_nucleotide_at_position(seq_id, pos, seq_dict)
- for seq_id, pos in zip(
- mutations.loc[substitution_mask, seq_id_column], start_positions
- )
+ for seq_id, pos in zip(mutations.loc[substitution_mask, seq_id_column], start_positions, strict=False)
]
)
- mutations.loc[substitution_mask, "wt_nucleotides_ensembl"] = (
- wt_nucleotides_substitution
- )
+ mutations.loc[substitution_mask, "wt_nucleotides_ensembl"] = wt_nucleotides_substitution
# Extract the WT nucleotides for the substitution rows from the Mutation CDS (i.e., COSMIC)
mutations["wt_nucleotides_cosmic"] = None
- mutations.loc[substitution_mask, "wt_nucleotides_cosmic"] = mutations[
- "actual_mutation"
- ].str[0]
+ mutations.loc[substitution_mask, "wt_nucleotides_cosmic"] = mutations["actual_mutation"].str[0]
- congruent_wt_bases_mask = (
- mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"]
- ) | mutations[["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]].isna().any(
- axis=1
- )
+ congruent_wt_bases_mask = (mutations["wt_nucleotides_cosmic"] == mutations["wt_nucleotides_ensembl"]) | mutations[
+ ["wt_nucleotides_cosmic", "wt_nucleotides_ensembl"]
+ ].isna().any(axis=1)
cosmic_incorrect_wt_base = (~congruent_wt_bases_mask).sum()
@@ -794,40 +761,33 @@ def mutate(
return mutations if update_df else []
# Adjust the start and end positions for insertions
- mutations.loc[
- insertion_mask, "start_mutation_position"
- ] += 1 # in other cases, we want left flank to exclude the start of mutation site; but with insertion, the start of mutation site as it is denoted still belongs in the flank region
- mutations.loc[
- insertion_mask, "end_mutation_position"
- ] -= 1 # in this notation, the end position is one before the start position
+ mutations.loc[insertion_mask, "start_mutation_position"] += (
+ 1 # in other cases, we want left flank to exclude the start of mutation site; but with insertion, the start of mutation site as it is denoted still belongs in the flank region
+ )
+ mutations.loc[insertion_mask, "end_mutation_position"] -= (
+ 1 # in this notation, the end position is one before the start position
+ )
# Extract the WT nucleotides for the non-substitution rows from the Mutation CDS (i.e., COSMIC)
- mutations.loc[non_substitution_mask, "wt_nucleotides_ensembl"] = mutations.loc[
- non_substitution_mask
- ].apply(lambda row: extract_sequence(row, seq_dict, seq_id_column), axis=1)
+ mutations.loc[non_substitution_mask, "wt_nucleotides_ensembl"] = mutations.loc[non_substitution_mask].apply(
+ lambda row: extract_sequence(row, seq_dict, seq_id_column), axis=1
+ )
# Apply mutations to the sequences
mutations["mut_nucleotides"] = None
- mutations.loc[substitution_mask, "mut_nucleotides"] = mutations.loc[
- substitution_mask, "actual_mutation"
- ].str[-1]
+ mutations.loc[substitution_mask, "mut_nucleotides"] = mutations.loc[substitution_mask, "actual_mutation"].str[-1]
mutations.loc[deletion_mask, "mut_nucleotides"] = ""
- mutations.loc[delins_mask, "mut_nucleotides"] = mutations.loc[
- delins_mask, "actual_mutation"
- ].str.extract(r"delins([A-Z]+)")[0]
- mutations.loc[insertion_mask, "mut_nucleotides"] = mutations.loc[
- insertion_mask, "actual_mutation"
- ].str.extract(r"ins([A-Z]+)")[0]
- mutations.loc[duplication_mask, "mut_nucleotides"] = mutations.loc[
- duplication_mask
- ].apply(lambda row: row["wt_nucleotides_ensembl"], axis=1)
- mutations.loc[inversion_mask, "mut_nucleotides"] = mutations.loc[
- inversion_mask
- ].apply(
- lambda row: "".join(
- complement.get(nucleotide, "N")
- for nucleotide in row["wt_nucleotides_ensembl"][::-1]
- ),
+ mutations.loc[delins_mask, "mut_nucleotides"] = mutations.loc[delins_mask, "actual_mutation"].str.extract(
+ r"delins([A-Z]+)"
+ )[0]
+ mutations.loc[insertion_mask, "mut_nucleotides"] = mutations.loc[insertion_mask, "actual_mutation"].str.extract(
+ r"ins([A-Z]+)"
+ )[0]
+ mutations.loc[duplication_mask, "mut_nucleotides"] = mutations.loc[duplication_mask].apply(
+ lambda row: row["wt_nucleotides_ensembl"], axis=1
+ )
+ mutations.loc[inversion_mask, "mut_nucleotides"] = mutations.loc[inversion_mask].apply(
+ lambda row: "".join(complement.get(nucleotide, "N") for nucleotide in row["wt_nucleotides_ensembl"][::-1]),
axis=1,
)
@@ -840,31 +800,22 @@ def mutate(
# Calculate the kmer bounds
mutations["start_kmer_position_min"] = mutations["start_mutation_position"] - k
- mutations["start_kmer_position"] = mutations["start_kmer_position_min"].combine(
- 0, max
- )
+ mutations["start_kmer_position"] = mutations["start_kmer_position_min"].combine(0, max)
mutations["end_kmer_position_max"] = mutations["end_mutation_position"] + k
- mutations["end_kmer_position"] = mutations[
- ["end_kmer_position_max", "sequence_length"]
- ].min(
+ mutations["end_kmer_position"] = mutations[["end_kmer_position_max", "sequence_length"]].min(
axis=1
) # don't forget to increment by 1 later on
if gtf is not None:
- assert mutations_path.endswith(".csv") or mutations_path.endswith(
- ".tsv"
- ), "Mutations must be a CSV or TSV file"
+ assert mutations_path.endswith(".csv") or mutations_path.endswith(".tsv"), "Mutations must be a CSV or TSV file"
if (
- "start_transcript_position" not in mutations.columns
- and "end_transcript_position" not in mutations.columns
+ "start_transcript_position" not in mutations.columns and "end_transcript_position" not in mutations.columns
): # * currently hard-coded column names, but optionally can be changed to arguments later
mutations = merge_gtf_transcript_locations_into_cosmic_csv(
mutations, gtf, gtf_transcript_id_column=gtf_transcript_id_column
)
- columns_to_keep.extend(
- ["start_transcript_position", "end_transcript_position", "strand"]
- )
+ columns_to_keep.extend(["start_transcript_position", "end_transcript_position", "strand"])
else:
logger.warning(
"Transcript positions already present in the input mutations file. Skipping GTF file merging."
@@ -873,18 +824,10 @@ def mutate(
# adjust start_transcript_position to be 0-index
mutations["start_transcript_position"] -= 1
- mutations["start_kmer_position"] = mutations[
- ["start_kmer_position", "start_transcript_position"]
- ].max(axis=1)
- mutations["end_kmer_position"] = mutations[
- ["end_kmer_position", "end_transcript_position"]
- ].min(axis=1)
-
- mut_apply = (
- (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs))
- if verbose
- else mutations.apply
- )
+ mutations["start_kmer_position"] = mutations[["start_kmer_position", "start_transcript_position"]].max(axis=1)
+ mutations["end_kmer_position"] = mutations[["end_kmer_position", "end_transcript_position"]].min(axis=1)
+
+ mut_apply = (lambda *args, **kwargs: mutations.progress_apply(*args, **kwargs)) if verbose else mutations.apply
if update_df and store_full_sequences:
# Extract flank sequences
@@ -892,9 +835,7 @@ def mutate(
tqdm.pandas(desc="Extracting full left flank sequences")
mutations["left_flank_region_full"] = mut_apply(
- lambda row: seq_dict[row[seq_id_column]][
- 0 : row["start_mutation_position"]
- ],
+ lambda row: seq_dict[row[seq_id_column]][0 : row["start_mutation_position"]],
axis=1,
) # ? vectorize
@@ -902,9 +843,7 @@ def mutate(
tqdm.pandas(desc="Extracting full right flank sequences")
mutations["right_flank_region_full"] = mut_apply(
- lambda row: seq_dict[row[seq_id_column]][
- row["end_mutation_position"] + 1 : row["sequence_length"]
- ],
+ lambda row: seq_dict[row[seq_id_column]][row["end_mutation_position"] + 1 : row["sequence_length"]],
axis=1,
) # ? vectorize
@@ -912,9 +851,7 @@ def mutate(
tqdm.pandas(desc="Extracting k-mer left flank sequences")
mutations["left_flank_region"] = mut_apply(
- lambda row: seq_dict[row[seq_id_column]][
- row["start_kmer_position"] : row["start_mutation_position"]
- ],
+ lambda row: seq_dict[row[seq_id_column]][row["start_kmer_position"] : row["start_mutation_position"]],
axis=1,
) # ? vectorize
@@ -922,9 +859,7 @@ def mutate(
tqdm.pandas(desc="Extracting k-mer right flank sequences")
mutations["right_flank_region"] = mut_apply(
- lambda row: seq_dict[row[seq_id_column]][
- row["end_mutation_position"] + 1 : row["end_kmer_position"] + 1
- ],
+ lambda row: seq_dict[row[seq_id_column]][row["end_mutation_position"] + 1 : row["end_kmer_position"] + 1],
axis=1,
) # ? vectorize
@@ -945,31 +880,25 @@ def mutate(
if optimize_flanking_regions:
# Apply the function for beginning of mut_nucleotides with right_flank_region
- mutations.loc[
- non_substitution_mask, "beginning_mutation_overlap_with_right_flank"
- ] = mutations.loc[non_substitution_mask].apply(
- calculate_beginning_mutation_overlap_with_right_flank, axis=1
- )
+ mutations.loc[non_substitution_mask, "beginning_mutation_overlap_with_right_flank"] = mutations.loc[
+ non_substitution_mask
+ ].apply(calculate_beginning_mutation_overlap_with_right_flank, axis=1)
# Apply the function for end of mut_nucleotides with left_flank_region
- mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = (
- mutations.loc[non_substitution_mask].apply(
- calculate_end_mutation_overlap_with_left_flank, axis=1
- )
- )
+ mutations.loc[non_substitution_mask, "end_mutation_overlap_with_left_flank"] = mutations.loc[
+ non_substitution_mask
+ ].apply(calculate_end_mutation_overlap_with_left_flank, axis=1)
# Calculate k-len(flank) (see above instructions)
- mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = (
- k - mutations.loc[non_substitution_mask, "left_flank_region"].apply(len)
- )
- mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = (
- k - mutations.loc[non_substitution_mask, "right_flank_region"].apply(len)
- )
+ mutations.loc[non_substitution_mask, "k_minus_left_flank_length"] = k - mutations.loc[
+ non_substitution_mask, "left_flank_region"
+ ].apply(len)
+ mutations.loc[non_substitution_mask, "k_minus_right_flank_length"] = k - mutations.loc[
+ non_substitution_mask, "right_flank_region"
+ ].apply(len)
mutations.loc[non_substitution_mask, "updated_left_flank_start"] = np.maximum(
- mutations.loc[
- non_substitution_mask, "beginning_mutation_overlap_with_right_flank"
- ]
+ mutations.loc[non_substitution_mask, "beginning_mutation_overlap_with_right_flank"]
- mutations.loc[non_substitution_mask, "k_minus_left_flank_length"],
0,
)
@@ -979,12 +908,8 @@ def mutate(
0,
)
- mutations["updated_left_flank_start"] = (
- mutations["updated_left_flank_start"].fillna(0).astype(int)
- )
- mutations["updated_right_flank_end"] = (
- mutations["updated_right_flank_end"].fillna(0).astype(int)
- )
+ mutations["updated_left_flank_start"] = mutations["updated_left_flank_start"].fillna(0).astype(int)
+ mutations["updated_right_flank_end"] = mutations["updated_right_flank_end"].fillna(0).astype(int)
else:
mutations["updated_left_flank_start"] = 0
@@ -998,14 +923,12 @@ def mutate(
)
# Create WT non-substitution k-mer sequences
- mutations.loc[non_substitution_mask, "wt_sequence"] = mutations.loc[
- non_substitution_mask
- ].apply(
- lambda row: row["left_flank_region"][row["updated_left_flank_start"] :]
- + row["wt_nucleotides_ensembl"]
- + row["right_flank_region"][
- : len(row["right_flank_region"]) - row["updated_right_flank_end"]
- ],
+ mutations.loc[non_substitution_mask, "wt_sequence"] = mutations.loc[non_substitution_mask].apply(
+ lambda row: (
+ row["left_flank_region"][row["updated_left_flank_start"] :]
+ + row["wt_nucleotides_ensembl"]
+ + row["right_flank_region"][: len(row["right_flank_region"]) - row["updated_right_flank_end"]]
+ ),
axis=1,
)
@@ -1017,22 +940,18 @@ def mutate(
)
# Create mutant non-substitution k-mer sequences
- mutations.loc[non_substitution_mask, "mutant_sequence"] = mutations.loc[
- non_substitution_mask
- ].apply(
- lambda row: row["left_flank_region"][row["updated_left_flank_start"] :]
- + row["mut_nucleotides"]
- + row["right_flank_region"][
- : len(row["right_flank_region"]) - row["updated_right_flank_end"]
- ],
+ mutations.loc[non_substitution_mask, "mutant_sequence"] = mutations.loc[non_substitution_mask].apply(
+ lambda row: (
+ row["left_flank_region"][row["updated_left_flank_start"] :]
+ + row["mut_nucleotides"]
+ + row["right_flank_region"][: len(row["right_flank_region"]) - row["updated_right_flank_end"]]
+ ),
axis=1,
)
if remove_seqs_with_wt_kmers:
if verbose:
- tqdm.pandas(
- desc="Removing mutant fragments that share a kmer with wt fragments"
- )
+ tqdm.pandas(desc="Removing mutant fragments that share a kmer with wt fragments")
mutations["wt_fragment_and_mutant_fragment_share_kmer"] = mut_apply(
lambda row: wt_fragment_and_mutant_fragment_share_kmer(
@@ -1043,9 +962,7 @@ def mutate(
axis=1,
)
- mutations_overlapping_with_wt = mutations[
- "wt_fragment_and_mutant_fragment_share_kmer"
- ].sum()
+ mutations_overlapping_with_wt = mutations["wt_fragment_and_mutant_fragment_share_kmer"].sum()
mutations = mutations[~mutations["wt_fragment_and_mutant_fragment_share_kmer"]]
@@ -1054,9 +971,7 @@ def mutate(
# Create full sequences (substitution and non-substitution)
mutations["mutant_sequence_full"] = (
- mutations["left_flank_region_full"]
- + mutations["mut_nucleotides"]
- + mutations["right_flank_region_full"]
+ mutations["left_flank_region_full"] + mutations["mut_nucleotides"] + mutations["right_flank_region_full"]
)
# Calculate k-mer lengths and report the distribution
@@ -1067,16 +982,12 @@ def mutate(
max_length = mutations["mutant_sequence_kmer_length"].max()
if min_seq_len:
- rows_less_than_minimum = (
- mutations["mutant_sequence_kmer_length"] < min_seq_len
- ).sum()
+ rows_less_than_minimum = (mutations["mutant_sequence_kmer_length"] < min_seq_len).sum()
mutations = mutations[mutations["mutant_sequence_kmer_length"] >= min_seq_len]
if verbose:
- logger.info(
- f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}..."
- )
+ logger.info(f"Removed {rows_less_than_minimum} mutant kmers with length less than {min_seq_len}...")
if max_ambiguous is not None:
- # Get number of 'N' or 'n' occuring in the sequence
+ # Get number of 'N' or 'n' occurring in the sequence
@@ -1085,9 +996,7 @@ def mutate(
mutations = mutations[mutations["num_N"] <= max_ambiguous]
if verbose:
- logger.info(
- f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's..."
- )
+ logger.info(f"Removed {num_rows_with_N} mutant kmers containing more than {max_ambiguous} 'N's...")
# Drop the 'num_N' column after filtering
mutations = mutations.drop(columns=["num_N"])
@@ -1097,16 +1006,14 @@ def mutate(
bins = range(0, max_length + 6, 5)
# Bin the lengths and count the number of elements in each bin
- binned_lengths = pd.cut(
- mutations["mutant_sequence_kmer_length"], bins=bins, right=False
- )
+ binned_lengths = pd.cut(mutations["mutant_sequence_kmer_length"], bins=bins, right=False)
bin_counts = binned_lengths.value_counts().sort_index()
# Display the report
if verbose:
logger.debug("Report of the number of elements in each bin of width 5:")
logger.debug(bin_counts)
- except Exception as e:
+ except Exception: # noqa: BLE001
pass
# split_cols = mutations[mut_id_column].str.split("_", n=1, expand=True)
@@ -1133,27 +1040,27 @@ def mutate(
# good_mutations = good_mutations - num_rows_with_N
report = f"""
- {good_mutations} mutations correctly recorded ({good_mutations/total_mutations*100:.2f}%)
- {intronic_mutations} intronic mutations found ({intronic_mutations/total_mutations*100:.2f}%)
- {posttranslational_region_mutations} posttranslational region mutations found ({posttranslational_region_mutations/total_mutations*100:.2f}%)
- {unknown_mutations} unknown mutations found ({unknown_mutations/total_mutations*100:.2f}%)
- {uncertain_mutations} mutations with uncertain mutation found ({uncertain_mutations/total_mutations*100:.2f}%)
- {ambiguous_position_mutations} mutations with ambiguous position found ({ambiguous_position_mutations/total_mutations*100:.2f}%)
- {cosmic_incorrect_wt_base} mutations with incorrect wildtype base found ({cosmic_incorrect_wt_base/total_mutations*100:.2f}%)
- {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq/total_mutations*100:.2f}%)
+ {good_mutations} mutations correctly recorded ({good_mutations / total_mutations * 100:.2f}%)
+ {intronic_mutations} intronic mutations found ({intronic_mutations / total_mutations * 100:.2f}%)
+ {posttranslational_region_mutations} posttranslational region mutations found ({posttranslational_region_mutations / total_mutations * 100:.2f}%)
+ {unknown_mutations} unknown mutations found ({unknown_mutations / total_mutations * 100:.2f}%)
+ {uncertain_mutations} mutations with uncertain mutation found ({uncertain_mutations / total_mutations * 100:.2f}%)
+ {ambiguous_position_mutations} mutations with ambiguous position found ({ambiguous_position_mutations / total_mutations * 100:.2f}%)
+ {cosmic_incorrect_wt_base} mutations with incorrect wildtype base found ({cosmic_incorrect_wt_base / total_mutations * 100:.2f}%)
+ {mut_idx_outside_seq} mutations with indices outside of the sequence length found ({mut_idx_outside_seq / total_mutations * 100:.2f}%)
"""
if remove_seqs_with_wt_kmers:
- report += f"""{long_duplications} duplications longer than k found ({long_duplications/total_mutations*100:.2f}%)
- {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt/total_mutations*100:.2f}%)
+ report += f"""{long_duplications} duplications longer than k found ({long_duplications / total_mutations * 100:.2f}%)
+ {mutations_overlapping_with_wt} mutations with overlapping kmers found ({mutations_overlapping_with_wt / total_mutations * 100:.2f}%)
"""
if min_seq_len:
- report += f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum/total_mutations*100:.2f}%)
+ report += f"""{rows_less_than_minimum} mutations with fragment length < k found ({rows_less_than_minimum / total_mutations * 100:.2f}%)
"""
if max_ambiguous is not None:
- report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N/total_mutations*100:.2f}%)
+ report += f"""{num_rows_with_N} mutations with Ns found ({num_rows_with_N / total_mutations * 100:.2f}%)
"""
if good_mutations != total_mutations:
@@ -1165,9 +1072,9 @@ def mutate(
columns_to_keep.extend(["wt_sequence_aa_full", "mutant_sequence_aa_full"])
if not mutations_path:
- assert (
- type(translate_start) != str and type(translate_end) != str
- ), "translate_start and translate_end must be integers when translating sequences (or default None)."
+ assert not isinstance(translate_start, str) and not isinstance(translate_end, str), (
+ "translate_start and translate_end must be integers when translating sequences (or default None)."
+ )
if translate_start is None:
translate_start = 0
if translate_end is None:
@@ -1177,38 +1084,24 @@ def mutate(
if verbose:
tqdm.pandas(desc="Translating WT amino acid sequences")
- mutations["wt_sequence_aa_full"] = mutations[
- "wt_sequence_full"
- ].progress_apply(
- lambda x: translate_sequence(
- x, start=translate_start, end=translate_end
- )
+ mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].progress_apply(
+ lambda x: translate_sequence(x, start=translate_start, end=translate_end)
)
else:
mutations["wt_sequence_aa_full"] = mutations["wt_sequence_full"].apply(
- lambda x: translate_sequence(
- x, start=translate_start, end=translate_end
- )
+ lambda x: translate_sequence(x, start=translate_start, end=translate_end)
)
if verbose:
tqdm.pandas(desc="Translating mutant amino acid sequences")
- mutations["mutant_sequence_aa_full"] = mutations[
- "mutant_sequence_full"
- ].progress_apply(
- lambda x: translate_sequence(
- x, start=translate_start, end=translate_end
- )
+ mutations["mutant_sequence_aa_full"] = mutations["mutant_sequence_full"].progress_apply(
+ lambda x: translate_sequence(x, start=translate_start, end=translate_end)
)
else:
- mutations["mutant_sequence_aa_full"] = mutations[
- "mutant_sequence_full"
- ].apply(
- lambda x: translate_sequence(
- x, start=translate_start, end=translate_end
- )
+ mutations["mutant_sequence_aa_full"] = mutations["mutant_sequence_full"].apply(
+ lambda x: translate_sequence(x, start=translate_start, end=translate_end)
)
print(f"Translated mutated sequences: {mutations['wt_sequence_aa_full']}")
@@ -1229,9 +1122,7 @@ def mutate(
tqdm.pandas(desc="Translating WT amino acid sequences")
mutations["wt_sequence_aa_full"] = mut_apply(
- lambda row: translate_sequence(
- row["wt_sequence_full"], row[translate_start], row[translate_end]
- ),
+ lambda row: translate_sequence(row["wt_sequence_full"], row[translate_start], row[translate_end]),
axis=1,
)
@@ -1257,17 +1148,13 @@ def mutate(
)
mutations = (
mutations.groupby("mutant_sequence", sort=False)
- .agg(
- lambda x: ";".join(x.astype(str))
- ) # Concatenate values with semicolons
+ .agg(lambda x: ";".join(x.astype(str))) # Concatenate values with semicolons
.reset_index()
)
else:
mutations = (
- mutations.groupby("mutant_sequence", sort=False, group_keys=False)[
- "header"
- ]
+ mutations.groupby("mutant_sequence", sort=False, group_keys=False)["header"]
.apply(";".join)
.reset_index()
)
@@ -1296,9 +1183,7 @@ def mutate(
empty_kmer_count = (mutations["mutant_sequence"] == "").sum()
if empty_kmer_count > 0 and verbose:
- logger.warning(
- f"{empty_kmer_count} mutated sequences were empty and were not included in the output."
- )
+ logger.warning(f"{empty_kmer_count} mutated sequences were empty and were not included in the output.")
mutations = mutations[mutations["mutant_sequence"] != ""]
@@ -1307,9 +1192,7 @@ def mutate(
if update_df:
logger.info("Saving dataframe with updated mutation info...")
saved_updated_df = True
- logger.warning(
- "File size can be very large if the number of mutations is large."
- )
+ logger.warning("File size can be very large if the number of mutations is large.")
if not update_df_out:
if not mutations_path:
# logger.warning(
@@ -1323,9 +1206,7 @@ def mutate(
mutations.to_csv(update_df_out, index=False)
print(f"Updated mutation info has been saved to {update_df_out}")
- mutations["fasta_format"] = (
- ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n"
- )
+ mutations["fasta_format"] = ">" + mutations["header"] + "\n" + mutations["mutant_sequence"] + "\n"
if out:
# Save mutated sequences in new fasta file
@@ -1342,12 +1223,12 @@ def mutate(
else:
all_mut_seqs = []
all_mut_seqs.extend(mutations["mutant_sequence"].values)
-
+
# Remove empty strings from final list of mutated sequences
# (these are introduced when unknown mutations are encountered)
while "" in all_mut_seqs:
all_mut_seqs.remove("")
-
+
if len(all_mut_seqs) > 0:
return all_mut_seqs
return []
diff --git a/gget/gget_opentargets.py b/gget/gget_opentargets.py
index d09f0dc16..cdb704770 100644
--- a/gget/gget_opentargets.py
+++ b/gget/gget_opentargets.py
@@ -1,9 +1,10 @@
import json as json_
import textwrap
+
import pandas as pd
-from .constants import OPENTARGETS_GRAPHQL_API, DEFAULT_REQUESTS_TIMEOUT
-from .utils import set_up_logger, http_json, dig
+from .constants import DEFAULT_REQUESTS_TIMEOUT, OPENTARGETS_GRAPHQL_API
+from .utils import dig, http_json, set_up_logger
logger = set_up_logger() # export GGET_LOGLEVEL=DEBUG
@@ -132,7 +133,7 @@
diseaseFromSource
depmapId
geneEffect
- }
+ }
}
}
}
@@ -155,7 +156,7 @@
speciesA {
taxonId
}
- intB
+ intB
targetB {
id
approvedSymbol
@@ -170,13 +171,22 @@
}
"""
-OPENTARGETS_RESOURCES = {"diseases", "drugs", "tractability", "pharmacogenetics", "expression", "depmap", "interactions"}
+OPENTARGETS_RESOURCES = {
+ "diseases",
+ "drugs",
+ "tractability",
+ "pharmacogenetics",
+ "expression",
+ "depmap",
+ "interactions",
+}
+
def _collapse_singletons(obj):
- """
- Recursively collapse:
+ """Recursively collapse nested single-element lists and single dicts with one key.
+
- nested single-element lists
- - single dicts with one key → value
+ - single dicts with one key → value.
"""
# -------------------------
# Case 1: list
@@ -189,7 +199,7 @@ def flatten(x):
yield from flatten(el)
else:
yield el
-
+
flat = list(flatten(obj))
flat = [el for el in flat if el is not None]
@@ -209,7 +219,7 @@ def flatten(x):
if len(obj) == 0:
return None
-
+
# if single key → collapse
if len(obj) == 1:
return next(iter(obj.values()))
@@ -221,23 +231,26 @@ def flatten(x):
# -------------------------
return obj
+
def _make_hashable(x):
- if isinstance(x, dict):
- return tuple(sorted((k, _make_hashable(v)) for k, v in x.items()))
- elif isinstance(x, list):
- return tuple(_make_hashable(v) for v in x)
- elif isinstance(x, set):
- return tuple(sorted(_make_hashable(v) for v in x))
- else:
- return x
-
+ if isinstance(x, dict):
+ return tuple(sorted((k, _make_hashable(v)) for k, v in x.items()))
+ elif isinstance(x, list):
+ return tuple(_make_hashable(v) for v in x)
+ elif isinstance(x, set):
+ return tuple(sorted(_make_hashable(v) for v in x))
+ else:
+ return x
+
+
def _unhash(x):
- if isinstance(x, tuple):
- # detect dict-like tuples
- if all(isinstance(i, tuple) and len(i) == 2 for i in x):
- return {k: _unhash(v) for k, v in x}
- return [_unhash(v) for v in x]
- return x
+ if isinstance(x, tuple):
+ # detect dict-like tuples
+ if all(isinstance(i, tuple) and len(i) == 2 for i in x):
+ return {k: _unhash(v) for k, v in x}
+ return [_unhash(v) for v in x]
+ return x
+
def opentargets(
ensembl_id,
@@ -248,8 +261,7 @@ def opentargets(
filters=None,
json=False,
):
- """
- Query OpenTargets for data associated with a given Ensembl gene ID.
+ """Query OpenTargets for data associated with a given Ensembl gene ID.
Args:
@@ -272,7 +284,6 @@ def opentargets(
Returns requested information in DataFrame format.
"""
-
if resource == "diseases":
query_string = QUERY_STRING_DISEASES
rows_path = ["associatedDiseases", "rows"]
@@ -290,12 +301,17 @@ def opentargets(
rows_path = ["expressions"]
elif resource == "depmap":
query_string = QUERY_STRING_DEPMAP
- rows_path = ["depMapEssentiality", "_FLATTEN_screens"] #* _FLATTEN_ indicates that we want to flatten the nested 'screens' field into the main table
+ rows_path = [
+ "depMapEssentiality",
+ "_FLATTEN_screens",
+ ] # * _FLATTEN_ indicates that we want to flatten the nested 'screens' field into the main table
elif resource == "interactions":
query_string = QUERY_STRING_INTERACTIONS
rows_path = ["interactions", "rows"]
else:
- raise ValueError(f"'resource' argument specified as {resource}. Expected one of: {', '.join(OPENTARGETS_RESOURCES)}")
+ raise ValueError(
+ f"'resource' argument specified as {resource}. Expected one of: {', '.join(OPENTARGETS_RESOURCES)}"
+ )
variables = {"ensemblId": ensembl_id}
@@ -331,12 +347,12 @@ def opentargets(
rows = [
{
**{k: v for k, v in row.items() if k != row_key}, # keep everything except the nested field
- **subdict # unpack the nested dict
+ **subdict, # unpack the nested dict
}
for row in rows
for subdict in row[row_key]
]
-
+
if len(rows) == 0:
if verbose:
logger.info(f"No {resource} data found for {ensembl_id}.")
@@ -352,24 +368,24 @@ def opentargets(
if limit is not None:
df = df.head(limit)
-
+
df = df.map(_unhash)
df = df.map(_collapse_singletons)
if filters is not None:
for filter_key, filter_value in filters.items():
if filter_key not in df.columns:
- raise ValueError(f"Filter key '{filter_key}' not found in data columns. Available columns: {', '.join(df.columns)}")
+ raise ValueError(
+ f"Filter key '{filter_key}' not found in data columns. Available columns: {', '.join(df.columns)}"
+ )
df = df[df[filter_key] == filter_value]
if wrap_text:
for col in df.columns:
if df[col].dtype == object:
- df[col] = df[col].apply(
- lambda x: textwrap.fill(str(x), width=40) if isinstance(x, str) else x
- )
-
+ df[col] = df[col].apply(lambda x: textwrap.fill(str(x), width=40) if isinstance(x, str) else x)
+
if json:
return json_.loads(df.to_json(orient="records", force_ascii=False))
-
+
return df
diff --git a/gget/gget_pdb.py b/gget/gget_pdb.py
index 4975061be..b26ed0fa1 100644
--- a/gget/gget_pdb.py
+++ b/gget/gget_pdb.py
@@ -1,17 +1,16 @@
-from urllib.request import urlopen
-from urllib.error import HTTPError
import json
+from urllib.error import HTTPError
+from urllib.request import urlopen
from .utils import set_up_logger
logger = set_up_logger()
-from .constants import RCSB_PDB_API
+from .constants import RCSB_PDB_API # noqa: E402
def pdb(pdb_id, resource="pdb", identifier=None, save=False):
- """
- Query RCSB PDB for the protein structutre/metadata of a given PDB ID.
+ """Query RCSB PDB for the protein structutre/metadata of a given PDB ID.
Args:
- pdb_id PDB ID to be queried (str), e.g. "7S7U".
@@ -33,7 +32,6 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False):
Returns requested information in JSON format (except for resource="pdb" which returns protein structure in PDB format).
"""
-
# Check if resource argument is valid
resources = [
"pdb",
@@ -49,9 +47,7 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False):
"nonpolymer_entity_instance",
]
if resource not in resources:
- raise ValueError(
- f"'resource' argument specified as {resource}. Expected one of: {', '.join(resources)}"
- )
+ raise ValueError(f"'resource' argument specified as {resource}. Expected one of: {', '.join(resources)}")
# Check if required identifiers are present
if resource == "assembly" and identifier is None:
@@ -92,7 +88,6 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False):
# Submit URL request with fallback logic
r = None
- last_error = None
code = None
for url in urls:
try:
@@ -105,8 +100,7 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False):
if code == 200:
break
- except HTTPError as e:
- last_error = e
+ except HTTPError:
continue
if r is None or code != 200:
@@ -123,9 +117,7 @@ def pdb(pdb_id, resource="pdb", identifier=None, save=False):
f"{resource} for {pdb_id} chain {identifier} was not found. Please double-check arguments and try again."
)
else:
- logger.error(
- f"{resource} for {pdb_id} was not found. Please double-check arguments and try again."
- )
+ logger.error(f"{resource} for {pdb_id} was not found. Please double-check arguments and try again.")
return
if resource != "pdb":
diff --git a/gget/gget_ref.py b/gget/gget_ref.py
index 865bd47ec..f65b8066c 100644
--- a/gget/gget_ref.py
+++ b/gget/gget_ref.py
@@ -1,28 +1,28 @@
-from bs4 import BeautifulSoup
-import requests
import json
+import requests
+from bs4 import BeautifulSoup
+
# Custom functions
from .utils import (
- ref_species_options,
find_latest_ens_rel,
find_nv_kingdom,
+ ref_species_options,
set_up_logger,
)
logger = set_up_logger()
-from .constants import (
+from .constants import ( # noqa: E402
+ DEFAULT_REQUESTS_TIMEOUT,
ENSEMBL_FTP_URL,
- ENSEMBL_FTP_URL_NV,
ENSEMBL_FTP_URL_GRCH37,
- DEFAULT_REQUESTS_TIMEOUT,
+ ENSEMBL_FTP_URL_NV,
)
def find_FTP_link(url, link_substring):
- """
- Helper function for gget ref to find an FTP link, its release date and size.
+ """Helper function for gget ref to find an FTP link, its release date and size.
Args:
url - URL link to FTP subfolder (e.g. GTF) including species and release
@@ -34,9 +34,7 @@ def find_FTP_link(url, link_substring):
# Raise error if status code not "OK" Response
if html.status_code != 200:
- raise RuntimeError(
- f"HTTP response status code {html.status_code}. Please try again.\n"
- )
+ raise RuntimeError(f"HTTP response status code {html.status_code}. Please try again.\n")
soup = BeautifulSoup(html.text, "html.parser")
@@ -67,8 +65,7 @@ def ref(
list_iv_species=False,
verbose=True,
):
- """
- Fetch FTPs for reference genomes and annotations by species from Ensembl.
+ """Fetch FTPs for reference genomes and annotations by species from Ensembl.
Args:
- species Defines the species for which the reference should be fetched in the format "_",
@@ -138,13 +135,9 @@ def ref(
)
# Find all available species for GTFs for this Ensembl release
- species_list_gtf = ref_species_options(
- "gtf", database=ENSEMBL_FTP_URL_NV, release=release
- )
+ species_list_gtf = ref_species_options("gtf", database=ENSEMBL_FTP_URL_NV, release=release)
# Find all available species for FASTAs for this Ensembl release
- species_list_dna = ref_species_options(
- "dna", database=ENSEMBL_FTP_URL_NV, release=release
- )
+ species_list_dna = ref_species_options("dna", database=ENSEMBL_FTP_URL_NV, release=release)
# Find intersection of the two lists
# (Only species which have GTF and FASTAs available can continue)
@@ -158,7 +151,7 @@ def ref(
## Check 'which' parameter
# If single which passed as string, convert to list
- if type(which) == str:
+ if isinstance(which, str):
which = [which]
# Raise error if several values are passed and 'all' is included
@@ -170,7 +163,7 @@ def ref(
which_allowed = ["all", "gtf", "cdna", "dna", "cds", "ncrna", "pep"]
if any(x not in which_allowed for x in which):
raise ValueError(
- f"Parameter 'which' must be 'all', or any one or a combination of the following: 'gtf', 'cdna', 'dna', 'cds', 'ncrna', 'pep'.\n"
+ "Parameter 'which' must be 'all', or any one or a combination of the following: 'gtf', 'cdna', 'dna', 'cds', 'ncrna', 'pep'.\n"
)
# Species shortcuts
@@ -191,9 +184,7 @@ def ref(
database = ENSEMBL_FTP_URL_GRCH37
ENS_rel = find_latest_ens_rel(ENSEMBL_FTP_URL)
# Standard database
- elif species in ref_species_options(
- "dna", database=ENSEMBL_FTP_URL, release=release
- ):
+ elif species in ref_species_options("dna", database=ENSEMBL_FTP_URL, release=release):
database = ENSEMBL_FTP_URL
# Find latest vertebrate Ensembl release
ENS_rel = find_latest_ens_rel(database)
@@ -204,24 +195,18 @@ def ref(
ENS_rel = find_latest_ens_rel(database)
# If release != None, use user-defined Ensembl release
- if release != None:
+ if release is not None:
# Warn user when release is higher than the latest release
if release > ENS_rel:
- logger.warning(
- f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel})."
- )
+ logger.warning(f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel}).")
ENS_rel = release
if not grch37:
## Raise error if species not found (both FASTA and GTF have to be available)
# Find all available species for genome FASTAs for this Ensembl release
- species_list_dna = ref_species_options(
- "dna", database=database, release=ENS_rel
- )
+ species_list_dna = ref_species_options("dna", database=database, release=ENS_rel)
# Find all available species for GTFs for this Ensembl release
- species_list_gtf = ref_species_options(
- "gtf", database=database, release=ENS_rel
- )
+ species_list_gtf = ref_species_options("gtf", database=database, release=ENS_rel)
# Find intersection of the two lists
# (Only species which have GTF and FASTAs available can continue)
species_list = list(set(species_list_gtf) & set(species_list_dna))
@@ -251,9 +236,7 @@ def ref(
link_substring = f"{ENS_rel}.gtf.gz"
# Get link, release date and dataset size
- gtf_str, gtf_date, gtf_size = find_FTP_link(
- url=gtf_search_url, link_substring=link_substring
- )
+ gtf_str, gtf_date, gtf_size = find_FTP_link(url=gtf_search_url, link_substring=link_substring)
# Build the final download link
if not isinstance(gtf_str, type(None)):
gtf_url = gtf_search_url + gtf_str
@@ -266,17 +249,13 @@ def ref(
if "all" in which or "cdna" in which:
if database == ENSEMBL_FTP_URL_NV:
# Define location of cdna links
- cdna_search_url = (
- database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cdna/"
- )
+ cdna_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cdna/"
else:
# Define location of cdna links
cdna_search_url = database + f"release-{ENS_rel}/fasta/{species}/cdna/"
# Get link, release date and dataset size
- cdna_str, cdna_date, cdna_size = find_FTP_link(
- url=cdna_search_url, link_substring="cdna.all.fa"
- )
+ cdna_str, cdna_date, cdna_size = find_FTP_link(url=cdna_search_url, link_substring="cdna.all.fa")
# Build the final download link
if not isinstance(cdna_str, type(None)):
cdna_url = cdna_search_url + cdna_str
@@ -289,22 +268,16 @@ def ref(
if "all" in which or "dna" in which:
# Define location of dna links
if database == ENSEMBL_FTP_URL_NV:
- dna_search_url = (
- database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/dna/"
- )
+ dna_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/dna/"
else:
dna_search_url = database + f"release-{ENS_rel}/fasta/{species}/dna/"
# Get link, release date and dataset size
- dna_str, dna_date, dna_size = find_FTP_link(
- url=dna_search_url, link_substring=".dna.primary_assembly.fa"
- )
+ dna_str, dna_date, dna_size = find_FTP_link(url=dna_search_url, link_substring=".dna.primary_assembly.fa")
# Get toplevel if primary assembly not available
if dna_str is None:
# Get link, release date and dataset size
- dna_str, dna_date, dna_size = find_FTP_link(
- url=dna_search_url, link_substring=".dna.toplevel.fa"
- )
+ dna_str, dna_date, dna_size = find_FTP_link(url=dna_search_url, link_substring=".dna.toplevel.fa")
# Build the final download link
if not isinstance(dna_str, type(None)):
@@ -318,15 +291,11 @@ def ref(
if "all" in which or "cds" in which:
# Define location of cds links
if database == ENSEMBL_FTP_URL_NV:
- cds_search_url = (
- database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cds/"
- )
+ cds_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/cds/"
else:
cds_search_url = database + f"release-{ENS_rel}/fasta/{species}/cds/"
# Get link, release date and dataset size
- cds_str, cds_date, cds_size = find_FTP_link(
- url=cds_search_url, link_substring="cds.all.fa"
- )
+ cds_str, cds_date, cds_size = find_FTP_link(url=cds_search_url, link_substring="cds.all.fa")
# Build the final download link
if not isinstance(cds_str, type(None)):
cds_url = cds_search_url + cds_str
@@ -339,9 +308,7 @@ def ref(
if "all" in which or "ncrna" in which:
# Define location of ncRNA links
if database == ENSEMBL_FTP_URL_NV:
- ncrna_search_url = (
- database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/ncrna/"
- )
+ ncrna_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/ncrna/"
else:
ncrna_search_url = database + f"release-{ENS_rel}/fasta/{species}/ncrna/"
@@ -373,15 +340,11 @@ def ref(
if "all" in which or "pep" in which:
# Define location of pep links
if database == ENSEMBL_FTP_URL_NV:
- pep_search_url = (
- database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/pep/"
- )
+ pep_search_url = database + f"release-{ENS_rel}/{kingdom}/fasta/{species}/pep/"
else:
pep_search_url = database + f"release-{ENS_rel}/fasta/{species}/pep/"
# Get link, release date and dataset size
- pep_str, pep_date, pep_size = find_FTP_link(
- url=pep_search_url, link_substring=".pep.all.fa"
- )
+ pep_str, pep_date, pep_size = find_FTP_link(url=pep_search_url, link_substring=".pep.all.fa")
# Build the final download link
if not isinstance(pep_str, type(None)):
pep_url = pep_search_url + pep_str
@@ -517,17 +480,13 @@ def ref(
with open("gget_ref_results.json", "w", encoding="utf-8") as file:
json.dump(ref_dict, file, ensure_ascii=False, indent=4)
if verbose:
- logger.info(
- f"Fetching reference information for {species} from Ensembl release: {ENS_rel}."
- )
+ logger.info(f"Fetching reference information for {species} from Ensembl release: {ENS_rel}.")
return ref_dict
# If FTP==True, return only the specified URLs as a list
if ftp:
if verbose:
- logger.info(
- f"Fetching reference information for {species} from Ensembl release: {ENS_rel}."
- )
+ logger.info(f"Fetching reference information for {species} from Ensembl release: {ENS_rel}.")
results = []
for return_val in which:
if return_val == "all":
diff --git a/gget/gget_search.py b/gget/gget_search.py
index 221bb6d96..1ffbf236f 100644
--- a/gget/gget_search.py
+++ b/gget/gget_search.py
@@ -1,27 +1,29 @@
-import numpy as np
-import pandas as pd
import json as json_package
-import mysql.connector as sql
import time
import warnings
+import mysql.connector as sql
+import numpy as np
+import pandas as pd
+
warnings.simplefilter(action="ignore", category=UserWarning)
# Custom functions
-from .utils import (
- search_species_options,
+from .utils import ( # noqa: E402
find_latest_ens_rel,
- wrap_cols_func,
find_nv_kingdom,
+ search_species_options,
set_up_logger,
+ wrap_cols_func,
)
logger = set_up_logger()
-from gget.constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV
+from gget.constants import ENSEMBL_FTP_URL, ENSEMBL_FTP_URL_NV # noqa: E402
def clean_cols(x):
+ """Collapse a list to its single unique value, or return x unchanged if not a list."""
if isinstance(x, list):
unique_list = list(set(x))
if len(unique_list) == 1:
@@ -45,8 +47,8 @@ def search(
save=False,
verbose=True,
):
- """
- Function to query Ensembl for genes based on species and free form search terms.
+ """Function to query Ensembl for genes based on species and free form search terms.
+
Automatically fetches results from latest Ensembl release, unless user specifies database (see 'species' argument)
or release database (see 'release' argument).
@@ -81,9 +83,7 @@ def search(
"""
# Handle deprecated arguments
if seqtype:
- logger.error(
- "'seqtype' argument deprecated! Please use argument 'id_type' instead."
- )
+ logger.error("'seqtype' argument deprecated! Please use argument 'id_type' instead.")
return
start_time = time.time()
@@ -93,17 +93,13 @@ def search(
id_types = ["gene", "transcript"]
id_type = id_type.lower()
if id_type not in id_types:
- raise ValueError(
- f"ID type (id_type) specified is '{id_type}'. Expected one of: {', '.join(id_types)}"
- )
+ raise ValueError(f"ID type (id_type) specified is '{id_type}'. Expected one of: {', '.join(id_types)}")
# Check if 'andor' arg is valid
andors = ["and", "or"]
andor = andor.lower()
if andor not in andors:
- raise ValueError(
- f"'andor' argument specified as {andor}. Expected one of {', '.join(andors)}"
- )
+ raise ValueError(f"'andor' argument specified as {andor}. Expected one of {', '.join(andors)}")
## Get database for specified species
# Species shortcuts
@@ -122,9 +118,7 @@ def search(
if "core" in species:
db = species
if release:
- logger.warning(
- "Specified release overwritten because database name was provided."
- )
+ logger.warning("Specified release overwritten because database name was provided.")
else:
if release:
ens_rel = release
@@ -157,11 +151,7 @@ def search(
db = f"homo_sapiens_core_{ens_rel}_38"
# Check for ambiguous species matches in species other than mouse and human
- elif (
- len(db) > 1
- and "mus_musculus" not in species
- and "homo_sapiens" not in species
- ):
+ elif len(db) > 1 and "mus_musculus" not in species and "homo_sapiens" not in species:
logger.warning(
f"Species matches more than one database. Defaulting to first database: {db[0]}.\n"
"All available databases can be found here:\n"
@@ -203,7 +193,7 @@ def search(
)
connection_successful = True
break
- except Exception as e:
+ except Exception as e: # noqa: BLE001
last_exception = e
# Continue to the next port if the connection is unsuccessful
continue
@@ -214,18 +204,16 @@ def search(
raise RuntimeError(
f"""
The Ensembl server returned the following error: {str(last_exception)}.
- This might be caused by the Ensembl release number being too low.
+ This might be caused by the Ensembl release number being too low.
Please try again with a more recent release.
"""
)
else:
- raise RuntimeError(
- f"The Ensembl server returned the following error: {str(last_exception)}"
- )
+ raise RuntimeError(f"The Ensembl server returned the following error: {str(last_exception)}")
## Clean up list of searchwords
# If single searchword passed as string, convert to list
- if type(searchwords) == str:
+ if isinstance(searchwords, str):
searchwords = [searchwords]
## Find genes
@@ -233,10 +221,10 @@ def search(
if id_type == "gene":
query = f"""
SELECT gene.stable_id AS 'ensembl_id', xref.display_label AS 'gene_name', gene.description AS 'ensembl_description', xref.description AS 'ext_ref_description', gene.biotype AS 'biotype', external_synonym.synonym AS 'synonym'
- FROM gene
- LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
- LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
- LEFT JOIN gene_attrib ON gene.gene_id = gene_attrib.gene_id
+ FROM gene
+ LEFT JOIN xref ON gene.display_xref_id = xref.xref_id
+ LEFT JOIN external_synonym ON gene.display_xref_id = external_synonym.xref_id
+ LEFT JOIN gene_attrib ON gene.gene_id = gene_attrib.gene_id
WHERE (gene.description LIKE '%{searchword}%' OR xref.description LIKE '%{searchword}%' OR xref.display_label LIKE '%{searchword}%' OR external_synonym.synonym LIKE '%{searchword}%' OR gene_attrib.value LIKE '%{searchword}%')
"""
@@ -268,10 +256,10 @@ def search(
if id_type == "transcript":
query = f"""
SELECT transcript.stable_id AS 'ensembl_id', xref.display_label AS 'gene_name', transcript.description AS 'ensembl_description', xref.description AS 'ext_ref_description', transcript.biotype AS 'biotype', external_synonym.synonym AS 'synonym'
- FROM transcript
- LEFT JOIN xref ON transcript.display_xref_id = xref.xref_id
- LEFT JOIN external_synonym ON transcript.display_xref_id = external_synonym.xref_id
- LEFT JOIN transcript_attrib ON transcript.transcript_id = transcript_attrib.transcript_id
+ FROM transcript
+ LEFT JOIN xref ON transcript.display_xref_id = xref.xref_id
+ LEFT JOIN external_synonym ON transcript.display_xref_id = external_synonym.xref_id
+ LEFT JOIN transcript_attrib ON transcript.transcript_id = transcript_attrib.transcript_id
WHERE (transcript.description LIKE '%{searchword}%' OR xref.description LIKE '%{searchword}%' OR xref.display_label LIKE '%{searchword}%' OR external_synonym.synonym LIKE '%{searchword}%' OR transcript_attrib.value LIKE '%{searchword}%')
"""
@@ -317,12 +305,11 @@ def search(
# Keep synonyms always of type list for consistency
df["synonym"] = [
- np.sort(syn).tolist() if isinstance(syn, list) else np.sort([syn]).tolist()
- for syn in df["synonym"].values
+ np.sort(syn).tolist() if isinstance(syn, list) else np.sort([syn]).tolist() for syn in df["synonym"].values
]
# If limit is not None, keep only the first {limit} rows
- if limit != None:
+ if limit is not None:
# Print number of genes/transcripts found versus fetched
if verbose:
logger.info(f"Returning {limit} matches of {len(df)} total matches found.")
@@ -342,33 +329,19 @@ def search(
clean_db = "_".join(db.split("_")[:3]).replace("_core", "")
## Find kingdom for non-vertebrate species
- kingdom = find_nv_kingdom(
- clean_db, release=find_latest_ens_rel(database=ENSEMBL_FTP_URL_NV)
- )
+ kingdom = find_nv_kingdom(clean_db, release=find_latest_ens_rel(database=ENSEMBL_FTP_URL_NV))
if kingdom:
# Add URL to gene summary on Ensembl for invertebrates
- df["url"] = (
- f"https://{kingdom}.ensembl.org/"
- + clean_db
- + "/Gene/Summary?g="
- + df["ensembl_id"]
- )
+ df["url"] = f"https://{kingdom}.ensembl.org/" + clean_db + "/Gene/Summary?g=" + df["ensembl_id"]
else:
# Add URL to gene summary on Ensembl for vertebrates
- df["url"] = (
- "https://useast.ensembl.org/"
- + clean_db
- + "/Gene/Summary?g="
- + df["ensembl_id"]
- )
+ df["url"] = "https://useast.ensembl.org/" + clean_db + "/Gene/Summary?g=" + df["ensembl_id"]
if wrap_text:
df_wrapped = df.copy()
- wrap_cols_func(
- df_wrapped, ["ensembl_description", "ext_ref_description", "url"]
- )
+ wrap_cols_func(df_wrapped, ["ensembl_description", "ext_ref_description", "url"])
if json:
results_dict = json_package.loads(df.to_json(orient="records"))
diff --git a/gget/gget_seq.py b/gget/gget_seq.py
index 535f739fc..15bd3a859 100644
--- a/gget/gget_seq.py
+++ b/gget/gget_seq.py
@@ -1,13 +1,10 @@
-import numpy as np
-
# Custom functions
-from .utils import rest_query, get_uniprot_seqs, set_up_logger, post_query
+from .utils import get_uniprot_seqs, post_query, rest_query, set_up_logger
logger = set_up_logger()
-from .gget_info import info
-
# Constants
-from .constants import ENSEMBL_REST_API, UNIPROT_REST_API
+from .constants import ENSEMBL_REST_API, UNIPROT_REST_API # noqa: E402
+from .gget_info import info # noqa: E402
def seq(
@@ -19,9 +16,9 @@ def seq(
seqtype=None,
verbose=True,
):
- """
- Fetch nucleotide or amino acid sequence (FASTA) of a gene
- (and all its isoforms) or transcript by Ensembl, WormBase or FlyBase ID.
+ """Fetch nucleotide or amino acid sequence (FASTA) of a gene or transcript.
+
+ Fetches the gene (and all its isoforms) or transcript by Ensembl, WormBase or FlyBase ID.
Args:
- ens_ids One or more Ensembl IDs (passed as string or list of strings).
@@ -41,9 +38,7 @@ def seq(
"""
# Handle deprecated arguments
if seqtype:
- logger.error(
- "'seqtype' argument deprecated! Please use True/False argument 'translate' instead."
- )
+ logger.error("'seqtype' argument deprecated! Please use True/False argument 'translate' instead.")
return
if transcribe:
translate = transcribe
@@ -51,7 +46,7 @@ def seq(
## Clean up arguments
# Clean up Ensembl IDs
# If single Ensembl ID passed as string, convert to list
- if type(ens_ids) == str:
+ if isinstance(ens_ids, str):
ens_ids = [ens_ids]
# Remove Ensembl ID version if passed
ens_ids_clean = []
@@ -109,16 +104,12 @@ def seq(
actual_results_dict[ensembl_ID] = {"seq": df_temp}
if verbose:
- logger.info(
- f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl."
- )
+ logger.info(f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl.")
missing_ids = set(ens_ids_clean) - set(actual_results_dict.keys())
for missing in missing_ids:
- logger.error(
- f"ID {missing} not found. Please double-check spelling/arguments and try again."
- )
+ logger.error(f"ID {missing} not found. Please double-check spelling/arguments and try again.")
# Add results to master dict
master_dict.update(actual_results_dict)
@@ -131,9 +122,7 @@ def seq(
results_dict = {ensembl_ID: {}}
# Get ID type (gene, transcript, ...) using gget info
- info_df = info(
- ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False
- )
+ info_df = info(ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False)
# Check if Ensembl ID was found
if isinstance(info_df, type(None)):
@@ -147,9 +136,7 @@ def seq(
# If the ID is a gene, get the IDs of all its transcripts
if ens_ID_type == "Gene":
if verbose:
- logger.info(
- f"Requesting nucleotide sequences of all transcripts of {ensembl_ID} from Ensembl."
- )
+ logger.info(f"Requesting nucleotide sequences of all transcripts of {ensembl_ID} from Ensembl.")
for transcipt_id in info_df.loc[ensembl_ID]["all_transcripts"]:
# Remove version number for Ensembl IDs (not for flybase/wormbase IDs)
@@ -170,14 +157,11 @@ def seq(
df_temp.pop(key, None)
# Add results to main dict
- results_dict[ensembl_ID].update(
- {f"{transcipt_id}": df_temp}
- )
+ results_dict[ensembl_ID].update({f"{transcipt_id}": df_temp})
except RuntimeError:
logger.error(
- f"ID {transcipt_id} not found. "
- "Please double-check spelling/arguments and try again."
+ f"ID {transcipt_id} not found. Please double-check spelling/arguments and try again."
)
# If isoform true, but ID is not a gene; ignore the isoform parameter
@@ -199,15 +183,12 @@ def seq(
# Add results to main dict
results_dict[ensembl_ID].update({"seq": df_temp})
- logger.info(
- f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl."
- )
+ logger.info(f"Requesting nucleotide sequence of {ensembl_ID} from Ensembl.")
logger.warning("The isoform option only applies to gene IDs.")
except RuntimeError:
logger.error(
- f"ID {ensembl_ID} not found. "
- "Please double-check spelling/arguments and try again."
+ f"ID {ensembl_ID} not found. Please double-check spelling/arguments and try again."
)
# Add results to master dict
@@ -220,12 +201,7 @@ def seq(
fasta.append(">" + ens_ID + " " + master_dict[ens_ID][key]["desc"])
fasta.append(master_dict[ens_ID][key]["seq"])
else:
- fasta.append(
- ">"
- + master_dict[ens_ID][key]["id"]
- + " "
- + master_dict[ens_ID][key]["desc"]
- )
+ fasta.append(">" + master_dict[ens_ID][key]["id"] + " " + master_dict[ens_ID][key]["desc"])
fasta.append(master_dict[ens_ID][key]["seq"])
## Fetch amino acid sequences from UniProt
@@ -236,15 +212,11 @@ def seq(
for ensembl_ID in ens_ids_clean:
# Get ID type (gene, transcript, ...) using gget info
- info_df = info(
- ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False
- )
+ info_df = info(ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False)
# Check that Ensembl ID was found
if isinstance(info_df, type(None)):
- logger.warning(
- f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments."
- )
+ logger.warning(f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments.")
continue
ens_ID_type = info_df.loc[ensembl_ID]["object_type"]
@@ -285,9 +257,7 @@ def seq(
trans_ids.append(ensembl_ID)
if verbose:
- logger.info(
- f"Requesting amino acid sequence of {ensembl_ID} from UniProt."
- )
+ logger.info(f"Requesting amino acid sequence of {ensembl_ID} from UniProt.")
else:
logger.warning(
@@ -303,15 +273,11 @@ def seq(
for ensembl_ID in ens_ids_clean:
# Get ID type (gene, transcript, ...) using gget info
- info_df = info(
- ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False
- )
+ info_df = info(ensembl_ID, verbose=False, pdb=False, ncbi=False, uniprot=False)
# Check that Ensembl ID was found
if isinstance(info_df, type(None)):
- logger.warning(
- f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments."
- )
+ logger.warning(f"ID '{ensembl_ID}' not found. Please double-check spelling/arguments.")
continue
ens_ID_type = info_df.loc[ensembl_ID]["object_type"]
@@ -347,9 +313,7 @@ def seq(
trans_ids.append(ensembl_ID)
if verbose:
- logger.info(
- f"Requesting amino acid sequence of {ensembl_ID} from UniProt."
- )
+ logger.info(f"Requesting amino acid sequence of {ensembl_ID} from UniProt.")
logger.warning("The isoform option only applies to gene IDs.")
else:
@@ -380,6 +344,7 @@ def seq(
df_uniprot["organism"].values,
df_uniprot["sequence_length"].values,
df_uniprot["sequence"].values,
+ strict=False,
):
fasta.append(
">"
diff --git a/gget/gget_setup.py b/gget/gget_setup.py
index a2a24d0d8..650988449 100644
--- a/gget/gget_setup.py
+++ b/gget/gget_setup.py
@@ -1,22 +1,22 @@
-import os
+import importlib
import logging
+import os
+import pathlib
+import platform
import shutil
-import sys
import subprocess
-import platform
-import uuid
+import sys
import tempfile
-import pathlib
-import importlib
+import uuid
-from .utils import set_up_logger, check_file_for_error_message
+from .utils import check_file_for_error_message, set_up_logger
logger = set_up_logger()
-from .compile import PACKAGE_PATH
-from .constants import (
- ELM_INSTANCES_FASTA_DOWNLOAD,
+from .compile import PACKAGE_PATH # noqa: E402
+from .constants import ( # noqa: E402
ELM_CLASSES_TSV_DOWNLOAD,
+ ELM_INSTANCES_FASTA_DOWNLOAD,
ELM_INSTANCES_TSV_DOWNLOAD,
ELM_INTDOMAINS_TSV_DOWNLOAD,
)
@@ -37,9 +37,7 @@
# # Path to temporary mounted disk (global)
# TMP_DISK = ""
# Model parameters
-PARAMS_URL = (
- "https://storage.googleapis.com/alphafold/alphafold_params_colab_2022-12-06.tar"
-)
+PARAMS_URL = "https://storage.googleapis.com/alphafold/alphafold_params_colab_2022-12-06.tar"
PARAMS_DIR = os.path.join(PACKAGE_PATH, "bins/alphafold/")
PARAMS_PATH = os.path.join(PARAMS_DIR, "params_temp.tar")
@@ -60,11 +58,11 @@ def _install(package: str, import_name: str, verbose: bool = True):
if process.wait() != 0:
if stderr:
sys.stderr.write(stderr)
- logger.error(
- f"{package} installation with '{cmd_str}' (https://pypi.org/project/{package}) failed."
- )
+ logger.error(f"{package} installation with '{cmd_str}' (https://pypi.org/project/{package}) failed.")
if cmd == pip_cmds[-1]:
- logger.error(f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version.")
+ logger.error(
+ f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version."
+ )
return
else:
if verbose:
@@ -82,7 +80,9 @@ def _install(package: str, import_name: str, verbose: bool = True):
)
# Retry with pip if import after uv installation failed
if cmd == pip_cmds[-1]:
- logger.error(f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version.")
+ logger.error(
+ f"All installation attempts for {package} have failed. Note: Some dependencies (e.g., cellxgene-census) may not support the latest Python versions. If you encounter installation errors, try using an earlier Python version."
+ )
return
else:
if verbose:
@@ -91,8 +91,8 @@ def _install(package: str, import_name: str, verbose: bool = True):
def setup(module, verbose=True, out=None):
- """
- Function to install third-party dependencies for a specified gget module.
+ """Function to install third-party dependencies for a specified gget module.
+
Some modules require pip to be installed (https://pip.pypa.io/en/stable/installation).
Some modules require curl to be installed (https://everything.curl.dev/get).
@@ -105,9 +105,7 @@ def setup(module, verbose=True, out=None):
"""
supported_modules = ["alphafold", "cellxgene", "elm", "gpt", "cbio"]
if module not in supported_modules:
- raise ValueError(
- f"'module' argument specified as {module}. Expected one of: {', '.join(supported_modules)}"
- )
+ raise ValueError(f"'module' argument specified as {module}. Expected one of: {', '.join(supported_modules)}")
if module == "gpt":
_install("openai<=0.28.1", "openai", verbose=verbose)
@@ -120,18 +118,14 @@ def setup(module, verbose=True, out=None):
logger.info(
"ELM data can be downloaded & distributed for non-commercial use according to the following license: http://elm.eu.org/media/Elm_academic_license.pdf"
)
- logger.info(
- "Downloading ELM database files (requires curl to be installed)..."
- )
+ logger.info("Downloading ELM database files (requires curl to be installed)...")
if out is not None:
elm_files_out = os.path.abspath(out)
elm_instances_fasta = os.path.join(elm_files_out, "elm_instances.fasta")
elm_classes_tsv = os.path.join(elm_files_out, "elms_classes.tsv")
elm_instances_tsv = os.path.join(elm_files_out, "elm_instances.tsv")
- elm_intdomains_tsv = os.path.join(
- elm_files_out, "elm_interaction_domains.tsv"
- )
+ elm_intdomains_tsv = os.path.join(elm_files_out, "elm_interaction_domains.tsv")
# Create folder for ELM files (if it does not exist)
if not os.path.exists(elm_files_out):
@@ -199,16 +193,11 @@ def setup(module, verbose=True, out=None):
missing.append(label)
if missing:
- raise RuntimeError(
- "ELM database files download failed; missing files: "
- + ", ".join(missing)
- )
+ raise RuntimeError("ELM database files download failed; missing files: " + ", ".join(missing))
elif module == "alphafold":
if platform.system() == "Windows":
- logger.error(
- "gget setup alphafold and gget alphafold are not supported on Windows OS."
- )
+ logger.error("gget setup alphafold and gget alphafold are not supported on Windows OS.")
return
## Ask user to install openmm if not already installed
@@ -229,19 +218,19 @@ def setup(module, verbose=True, out=None):
except ImportError as e:
raise ImportError(
f"""
- Trying to import openmm resulted in the following error:
+ Trying to import openmm resulted in the following error:
{e}
- Please install AlphaFold third-party dependency openmm by running the following command from the command line:
- For Python version < 3.10:
- 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
- For Python version 3.10:
- 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
- For Python version 3.11:
- 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
+ Please install AlphaFold third-party dependency openmm by running the following command from the command line:
+ For Python version < 3.10:
+ 'conda install -qy conda==4.13.0 && conda install -qy -c conda-forge openmm=7.5.1'
+ For Python version 3.10:
+ 'conda install -qy conda==24.1.2 && conda install -qy -c conda-forge openmm=7.7.0'
+ For Python version 3.11:
+ 'conda install -qy conda==24.11.1 && conda install -qy -c conda-forge openmm=8.0.0'
(Recommendation: Follow with 'conda update -qy conda' to update conda to the latest version afterwards.)
"""
- )
+ ) from e
## Install py3Dmol
_install("py3Dmol", "py3Dmol", verbose=verbose)
@@ -257,9 +246,7 @@ def setup(module, verbose=True, out=None):
os.environ.setdefault("UV_HTTP_TIMEOUT", "300")
# Define AlphaFold folder name and location
- alphafold_folder = os.path.join(
- tempfile.gettempdir(), f"tmp_alphafold_{uuid.uuid4()}"
- )
+ alphafold_folder = os.path.join(tempfile.gettempdir(), f"tmp_alphafold_{uuid.uuid4()}")
pathlib.Path(alphafold_folder).mkdir(parents=True, exist_ok=True)
# Clean (unescaped) jackhmmer cache dir; we’ll patch file contents via Python
@@ -268,7 +255,7 @@ def setup(module, verbose=True, out=None):
# Core AlphaFold dependencies (Colab/CPU friendly set)
alphafold_deps = [
"absl-py>=2.1,<3",
- "dm-haiku<=0.0.12", # dont upgrade to avoid clash with jax
+ "dm-haiku<=0.0.12", # dont upgrade to avoid clash with jax
"dm-tree>=0.1.8",
"filelock>=3.12",
"jax==0.4.26",
@@ -278,7 +265,7 @@ def setup(module, verbose=True, out=None):
"jmp>=0.0.4",
"ml-collections>=0.1,<1",
"ml-dtypes>=0.3.1,<0.6",
- "numpy>=1.26,<2", # keeps TF 2.17 CPU happy
+ "numpy>=1.26,<2", # keeps TF 2.17 CPU happy
"opt-einsum>=3.4,<4",
"pillow>=10,<12",
"protobuf<4",
@@ -300,7 +287,7 @@ def setup(module, verbose=True, out=None):
# Patch jackhmmer.py
jack_py = os.path.join(alphafold_folder, "alphafold", "data", "tools", "jackhmmer.py")
- with open(jack_py, "r", encoding="utf-8") as f:
+ with open(jack_py, encoding="utf-8") as f:
txt = f.read()
txt = txt.replace("/tmp/ramdisk", jack_dir)
@@ -315,16 +302,10 @@ def setup(module, verbose=True, out=None):
f.write(txt)
# Base deps first (NumPy/TF/JAX in a known good combo)
- subprocess.run(
- [*pip_upgrade.split(), "numpy>=1.26,<2", "tensorflow-cpu>=2.17,<2.18"],
- check=True
- )
+ subprocess.run([*pip_upgrade.split(), "numpy>=1.26,<2", "tensorflow-cpu>=2.17,<2.18"], check=True)
# The rest of the deps
- subprocess.run(
- [*pip_upgrade.split(), *alphafold_deps],
- check=True
- )
+ subprocess.run([*pip_upgrade.split(), *alphafold_deps], check=True)
# Install AF itself without bringing in its pinned requirements
subprocess.run(f'{pip_nodeps} "{alphafold_folder}"', check=True, shell=True)
@@ -334,7 +315,7 @@ def setup(module, verbose=True, out=None):
# Show any captured stderr from our last step, if available
try:
sys.stderr.write(str(e) + "\n")
- except Exception:
+ except Exception: # noqa: BLE001
pass
shutil.rmtree(alphafold_folder, ignore_errors=True)
return
@@ -344,6 +325,7 @@ def setup(module, verbose=True, out=None):
try:
import alphafold as AlphaFold
+
if verbose:
logger.info("AlphaFold installed succesfully.")
except ImportError as e:
@@ -359,9 +341,7 @@ def setup(module, verbose=True, out=None):
if verbose:
logger.info("Installing pdbfixer from source (requires pip and git).")
- pdbfixer_folder = os.path.join(
- tempfile.gettempdir(), f"tmp_pdbfixer_{uuid.uuid4()}"
- )
+ pdbfixer_folder = os.path.join(tempfile.gettempdir(), f"tmp_pdbfixer_{uuid.uuid4()}")
try:
if openmm.__version__ == "7.5.1":
@@ -369,7 +349,7 @@ def setup(module, verbose=True, out=None):
PDBFIXER_VERSION = "v1.7"
else:
PDBFIXER_VERSION = "v1.8.1"
- except:
+ except Exception: # noqa: BLE001
PDBFIXER_VERSION = "v1.8.1"
pip_cmd = "uv pip install" if shutil.which("uv") else "pip install -q"
@@ -398,7 +378,7 @@ def setup(module, verbose=True, out=None):
pdb_out, err = process.communicate()
if pdb_out.decode() != "":
- logger.info(f"pdbfixer installed succesfully.")
+ logger.info("pdbfixer installed succesfully.")
else:
logger.error("pdbfixer installation failed.")
return
@@ -418,19 +398,17 @@ def setup(module, verbose=True, out=None):
# The double-quotation marks allow white spaces in the path, but this does not work for Windows
command = f"""
curl -# -o {PARAMS_PATH} {PARAMS_URL} \\
- && tar --extract --file={PARAMS_PATH} --directory={PARAMS_DIR+'params/'} --preserve-permissions \\
+ && tar --extract --file={PARAMS_PATH} --directory={PARAMS_DIR + "params/"} --preserve-permissions \\
&& rm {PARAMS_PATH}
"""
else:
command = f"""
curl -# -o '{PARAMS_PATH}' '{PARAMS_URL}' \\
- && tar --extract --file='{PARAMS_PATH}' --directory='{PARAMS_DIR+'params/'}' --preserve-permissions \\
+ && tar --extract --file='{PARAMS_PATH}' --directory='{PARAMS_DIR + "params/"}' --preserve-permissions \\
&& rm '{PARAMS_PATH}'
"""
- with subprocess.Popen(
- command, shell=True, stderr=subprocess.PIPE
- ) as process:
+ with subprocess.Popen(command, shell=True, stderr=subprocess.PIPE) as process:
stderr = process.stderr.read().decode("utf-8")
# Log the standard error if it is not empty
if stderr:
diff --git a/gget/gget_virus.py b/gget/gget_virus.py
index 5970878ed..b6bf49383 100644
--- a/gget/gget_virus.py
+++ b/gget/gget_virus.py
@@ -1,37 +1,40 @@
+import calendar
+import gc # For garbage collection to manage memory
+import http.client
+import json
+import logging # For logging level checks
import os
+import platform # For OS detection
import re
-import json
-import sys # For accessing command line arguments
-import time # For adding delays between requests
-import logging # For logging level checks
-import shutil # For directory operations
-import subprocess # For executing external commands
-import traceback # For error traceback logging
-import platform # For OS detection
-import stat # For file permission constants
-import gc # For garbage collection to manage memory
-import pandas as pd # For data manipulation and CSV output
-import requests # For HTTP requests to NCBI API
-import zipfile # For extracting downloaded ZIP files
-from tqdm import tqdm # For progress bar display
+import shutil # For directory operations
+import stat # For file permission constants
+import subprocess # For executing external commands
+import sys # For accessing command line arguments
+import time # For adding delays between requests
+import traceback # For error traceback logging
+import xml.etree.ElementTree as ET # For XML parsing
+import zipfile # For extracting downloaded ZIP files
from datetime import datetime # For date handling
-from dateutil import parser # For flexible date parsing
-import xml.etree.ElementTree as ET # For XML parsing
-import http.client
+from urllib.parse import quote
+
+import pandas as pd # For data manipulation and CSV output
+import requests # For HTTP requests to NCBI API
import urllib3
-from urllib3.util.retry import Retry
+from dateutil import parser # For flexible date parsing
from requests.adapters import HTTPAdapter
-from urllib.parse import quote
-import calendar
+from tqdm import tqdm # For progress bar display
+from urllib3.util.retry import Retry
-# Internal imports for logging, unique ID generation, and FASTA parsing
-from .utils import set_up_logger, FastaIO
-from .constants import NCBI_API_BASE, NCBI_EUTILS_BASE_EFETCH, NCBI_EUTILS_BASE_ESEARCH
from .compile import PACKAGE_PATH
+from .constants import NCBI_API_BASE, NCBI_EUTILS_BASE_EFETCH, NCBI_EUTILS_BASE_ESEARCH
+
+# Internal imports for logging, unique ID generation, and FASTA parsing
+from .utils import FastaIO, set_up_logger
# Optional psutil import for memory monitoring
try:
import psutil
+
PSUTIL_AVAILABLE = True
except ImportError:
PSUTIL_AVAILABLE = False
@@ -67,7 +70,7 @@
GENBANK_MAX_BATCH_SIZE_WARNING = 500 # Warn user if batch size exceeds this
GENBANK_RETRY_ATTEMPTS = 5 # Number of retry attempts for GenBank requests
GENBANK_XML_CHUNK_SIZE = 10000 # Rows to process before writing to CSV
-GENBANK_COMPLEXITY = 1 # Complexity level with only the accessions requested. All levels explained here: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
+GENBANK_COMPLEXITY = 1 # Complexity level with only the accessions requested. All levels explained here: https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch
# Resolve API key from environment variable NCBI_API_KEY.
# Users can also pass an api_key argument directly to the virus() function / CLI --api_key.
@@ -94,33 +97,62 @@
# Virus Detection Identifiers
SARS_COV2_IDENTIFIERS = {
- 'sarscov2', 'sars2', '2697049', 'sarscov',
- 'severeacuterespiratorysyndromecoronavirus2',
- 'covid19', 'covid', 'coronavirusdisease', 'ncov', 'hcov19'
+ "sarscov2",
+ "sars2",
+ "2697049",
+ "sarscov",
+ "severeacuterespiratorysyndromecoronavirus2",
+ "covid19",
+ "covid",
+ "coronavirusdisease",
+ "ncov",
+ "hcov19",
}
ALPHAINFLUENZA_IDENTIFIERS = {
- 'alphainfluenza', 'alphainfluenzavirus', 'alphainfluenzavirusinfluenzae',
- 'influenzaavirus', 'influenzaa', 'flua',
- '197911', # Alphainfluenza genus
- '2955291', # Alphainfluenzavirus influenzae species
- '11320' # Influenza A virus
+ "alphainfluenza",
+ "alphainfluenzavirus",
+ "alphainfluenzavirusinfluenzae",
+ "influenzaavirus",
+ "influenzaa",
+ "flua",
+ "197911", # Alphainfluenza genus
+ "2955291", # Alphainfluenzavirus influenzae species
+ "11320", # Influenza A virus
}
# Default taxon for Alphainfluenza downloads (most comprehensive cached data)
ALPHAINFLUENZA_DEFAULT_TAXON = "Alphainfluenzavirus influenzae"
# Progress Indicator Keywords (for subprocess monitoring)
-PROGRESS_INDICATORS = ['%', '=', 'downloading', 'fetching', 'MB', 'GB', 'bytes']
+PROGRESS_INDICATORS = ["%", "=", "downloading", "fetching", "MB", "GB", "bytes"]
# Protein/Gene Keywords for Header Parsing
PROTEIN_KEYWORDS = [
- 'hemagglutinin', 'neuraminidase', 'polymerase', 'nucleoprotein',
- 'matrix protein', 'nonstructural protein', 'ns1', 'ns2',
- 'spike', 'envelope', 'membrane', 'nucleocapsid',
- 'orf', 'nsp', 'pp1a', 'pp1ab',
- 'segment 1', 'segment 2', 'segment 3', 'segment 4',
- 'segment 5', 'segment 6', 'segment 7', 'segment 8',
+ "hemagglutinin",
+ "neuraminidase",
+ "polymerase",
+ "nucleoprotein",
+ "matrix protein",
+ "nonstructural protein",
+ "ns1",
+ "ns2",
+ "spike",
+ "envelope",
+ "membrane",
+ "nucleocapsid",
+ "orf",
+ "nsp",
+ "pp1a",
+ "pp1ab",
+ "segment 1",
+ "segment 2",
+ "segment 3",
+ "segment 4",
+ "segment 5",
+ "segment 6",
+ "segment 7",
+ "segment 8",
]
# Date Parsing Configuration
@@ -156,120 +188,122 @@
# MEMORY MONITORING HELPERS
# =============================================================================
+
def _get_memory_usage():
- """
- Get current memory usage information for debugging.
-
- Returns:
+ """Get current memory usage information for debugging.
+
+ Returns
+ -------
dict: Dictionary with memory stats including:
- rss_mb: Resident Set Size in MB (actual RAM used)
- vms_mb: Virtual Memory Size in MB
- percent: Percent of total system memory used
- available_mb: Available system memory in MB
-
+
Note:
Falls back to /proc/self/status on Linux if psutil is not available.
+
"""
if PSUTIL_AVAILABLE:
try:
process = psutil.Process()
mem_info = process.memory_info()
sys_mem = psutil.virtual_memory()
-
+
return {
- 'rss_mb': mem_info.rss / (1024 * 1024),
- 'vms_mb': mem_info.vms / (1024 * 1024),
- 'percent': process.memory_percent(),
- 'available_mb': sys_mem.available / (1024 * 1024),
- 'total_mb': sys_mem.total / (1024 * 1024),
- 'system_percent': sys_mem.percent,
- 'psutil_available': True
+ "rss_mb": mem_info.rss / (1024 * 1024),
+ "vms_mb": mem_info.vms / (1024 * 1024),
+ "percent": process.memory_percent(),
+ "available_mb": sys_mem.available / (1024 * 1024),
+ "total_mb": sys_mem.total / (1024 * 1024),
+ "system_percent": sys_mem.percent,
+ "psutil_available": True,
}
- except Exception as e:
+ except Exception: # noqa: BLE001
pass # Fall through to /proc fallback
-
+
# Fallback for Linux: read from /proc/self/status
- result = {
- 'rss_mb': None,
- 'vms_mb': None,
- 'percent': None,
- 'available_mb': None,
- 'psutil_available': False
- }
-
+ result = {"rss_mb": None, "vms_mb": None, "percent": None, "available_mb": None, "psutil_available": False}
+
try:
- with open('/proc/self/status', 'r') as f:
+ with open("/proc/self/status") as f:
for line in f:
- if line.startswith('VmRSS:'):
+ if line.startswith("VmRSS:"):
# VmRSS is in kB
rss_kb = int(line.split()[1])
- result['rss_mb'] = rss_kb / 1024
- elif line.startswith('VmSize:'):
+ result["rss_mb"] = rss_kb / 1024
+ elif line.startswith("VmSize:"):
vms_kb = int(line.split()[1])
- result['vms_mb'] = vms_kb / 1024
+ result["vms_mb"] = vms_kb / 1024
except (FileNotFoundError, PermissionError, ValueError):
pass # Not on Linux or can't read /proc
-
+
# Try to get system memory from /proc/meminfo
try:
- with open('/proc/meminfo', 'r') as f:
+ with open("/proc/meminfo") as f:
for line in f:
- if line.startswith('MemAvailable:'):
+ if line.startswith("MemAvailable:"):
avail_kb = int(line.split()[1])
- result['available_mb'] = avail_kb / 1024
- elif line.startswith('MemTotal:'):
+ result["available_mb"] = avail_kb / 1024
+ elif line.startswith("MemTotal:"):
total_kb = int(line.split()[1])
- result['total_mb'] = total_kb / 1024
+ result["total_mb"] = total_kb / 1024
except (FileNotFoundError, PermissionError, ValueError):
pass
-
+
# Calculate percent if we have both values
- if result.get('rss_mb') and result.get('total_mb'):
- result['percent'] = (result['rss_mb'] / result['total_mb']) * 100
-
+ if result.get("rss_mb") and result.get("total_mb"):
+ result["percent"] = (result["rss_mb"] / result["total_mb"]) * 100
+
return result
def _log_memory_usage(context=""):
- """
- Log current memory usage with context information.
-
+ """Log current memory usage with context information.
+
Args:
context (str): Description of where in the code this is being called.
"""
mem = _get_memory_usage()
-
- if not mem.get('psutil_available'):
+
+ if not mem.get("psutil_available"):
logger.debug("Memory monitoring: psutil not available (install with 'pip install psutil' for memory debugging)")
return
-
- if mem.get('rss_mb') is not None:
- logger.info("📊 MEMORY [%s]: Process RSS=%.1f MB (%.1f%%), System: %.1f%% used, %.1f MB available of %.1f MB total",
- context,
- mem['rss_mb'],
- mem.get('percent', 0),
- mem.get('system_percent', 0),
- mem.get('available_mb', 0),
- mem.get('total_mb', 0))
+
+ if mem.get("rss_mb") is not None:
+ logger.info(
+ "📊 MEMORY [%s]: Process RSS=%.1f MB (%.1f%%), System: %.1f%% used, %.1f MB available of %.1f MB total",
+ context,
+ mem["rss_mb"],
+ mem.get("percent", 0),
+ mem.get("system_percent", 0),
+ mem.get("available_mb", 0),
+ mem.get("total_mb", 0),
+ )
else:
- logger.debug("Memory monitoring: Unable to get memory info - %s", mem.get('error', 'unknown error'))
+ logger.debug("Memory monitoring: Unable to get memory info - %s", mem.get("error", "unknown error"))
def _force_garbage_collection(context=""):
- """
- Force garbage collection and log the results.
-
+ """Force garbage collection and log the results.
+
Args:
context (str): Description of where in the code this is being called.
"""
before = _get_memory_usage()
collected = gc.collect()
after = _get_memory_usage()
-
- if before.get('rss_mb') is not None and after.get('rss_mb') is not None:
- freed = before['rss_mb'] - after['rss_mb']
- logger.info("🗑️ GC [%s]: Collected %d objects, freed %.1f MB (%.1f MB -> %.1f MB)",
- context, collected, freed, before['rss_mb'], after['rss_mb'])
+
+ if before.get("rss_mb") is not None and after.get("rss_mb") is not None:
+ freed = before["rss_mb"] - after["rss_mb"]
+ logger.info(
+ "🗑️ GC [%s]: Collected %d objects, freed %.1f MB (%.1f MB -> %.1f MB)",
+ context,
+ collected,
+ freed,
+ before["rss_mb"],
+ after["rss_mb"],
+ )
else:
logger.debug("GC [%s]: Collected %d objects", context, collected)
@@ -281,17 +315,13 @@ def _force_garbage_collection(context=""):
# Set up logger for this module
logger = set_up_logger()
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
-random_suffix = os.urandom(3).hex() # random suffix for naming uniqueness
+random_suffix = os.urandom(3).hex() # random suffix for naming uniqueness
# Path to precompiled datasets binary
if platform.system() == "Windows":
- PRECOMPILED_DATASETS_PATH = os.path.join(
- PACKAGE_PATH, "bins", "Windows", "datasets.exe"
- )
+ PRECOMPILED_DATASETS_PATH = os.path.join(PACKAGE_PATH, "bins", "Windows", "datasets.exe")
else:
- PRECOMPILED_DATASETS_PATH = os.path.join(
- PACKAGE_PATH, "bins", platform.system(), "datasets"
- )
+ PRECOMPILED_DATASETS_PATH = os.path.join(PACKAGE_PATH, "bins", platform.system(), "datasets")
# Cache for the datasets path to avoid repeated checks
_datasets_path_cache = None
@@ -300,6 +330,7 @@ def _force_garbage_collection(context=""):
# HELPER FUNCTIONS FOR RETRIES AND ERROR TRACKING
# =============================================================================
+
def _retry_with_exponential_backoff(
operation_name,
operation_func,
@@ -310,13 +341,12 @@ def _retry_with_exponential_backoff(
retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError),
failed_commands=None,
):
- """
- Execute an operation with exponential backoff retry logic.
-
+ """Execute an operation with exponential backoff retry logic.
+
This is a reusable helper that consolidates the exponential backoff retry
pattern used throughout the module. It handles retryable exceptions with
configurable delays and logging.
-
+
Args:
operation_name (str): Name of the operation for logging (e.g., "batch_10").
operation_func (callable): Function to execute, should raise an exception on failure.
@@ -325,34 +355,40 @@ def _retry_with_exponential_backoff(
backoff_multiplier (float): Multiplier for exponential backoff.
retryable_exceptions (tuple): Exception types to retry on.
failed_commands (dict, optional): Dictionary to track failed operations.
-
- Returns:
+
+ Returns
+ -------
tuple: (success, result, error_info)
- success (bool): True if operation succeeded.
- result: Return value of operation_func (or None if failed).
- error_info (dict): Details about the failure (if any).
+
"""
retry_delay = initial_delay
last_exception = None
-
+
for attempt in range(max_retries):
try:
result = operation_func()
return True, result, None
-
+
except retryable_exceptions as e:
last_exception = e
is_retryable = True
-
+
# For HTTPError, check if it's a server error (5xx)
- if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, 'response') and e.response:
+ if isinstance(e, requests.exceptions.HTTPError) and hasattr(e, "response") and e.response is not None:
is_retryable = 500 <= e.response.status_code < 600
-
+
if attempt < max_retries - 1 and is_retryable:
capped_delay = min(retry_delay, max_delay)
logger.warning(
"⚠️ %s failed (attempt %d/%d): %s. Retrying in %.1f seconds...",
- operation_name, attempt + 1, max_retries, e, capped_delay
+ operation_name,
+ attempt + 1,
+ max_retries,
+ e,
+ capped_delay,
)
time.sleep(capped_delay)
retry_delay *= backoff_multiplier
@@ -360,28 +396,27 @@ def _retry_with_exponential_backoff(
else:
# Out of retries or non-retryable error
break
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
# Non-retryable exception types
last_exception = e
break
-
+
# Operation failed after retries
error_info = {
- 'error': str(last_exception),
- 'exception_type': type(last_exception).__name__,
+ "error": str(last_exception),
+ "exception_type": type(last_exception).__name__,
}
-
+
return False, None, error_info
def _track_failed_operation(failed_commands, operation_type, batch_info, error_info):
- """
- Track a failed operation in the failed_commands dictionary.
-
+ """Track a failed operation in the failed_commands dictionary.
+
This ensures consistent error tracking across all operation types for
later reporting in the command summary.
-
+
Args:
failed_commands (dict): Dictionary to track failures.
operation_type (str): Type of operation ('metadata_batch', 'sequence_batch', 'pagination', etc.).
@@ -390,32 +425,33 @@ def _track_failed_operation(failed_commands, operation_type, batch_info, error_i
"""
if failed_commands is None:
return
-
+
if operation_type not in failed_commands:
failed_commands[operation_type] = []
-
+
failure_record = {**batch_info, **error_info}
failed_commands[operation_type].append(failure_record)
logger.debug("Tracked failed %s: %s", operation_type, failure_record)
def _validate_datasets_binary(path):
- """
- Validate that a datasets binary exists and is functional.
-
+ """Validate that a datasets binary exists and is functional.
+
Args:
path (str): Path to the datasets binary to validate.
-
- Returns:
+
+ Returns
+ -------
bool: True if the binary exists and runs successfully, False otherwise.
+
"""
if not path:
return False
-
+
# Check if the file exists (for bundled binary) or is in PATH (for system binary)
if not os.path.isfile(path) and not shutil.which(path):
return False
-
+
# Verify the binary actually works
try:
result = subprocess.run(
@@ -430,9 +466,8 @@ def _validate_datasets_binary(path):
def _clear_datasets_cache():
- """
- Clear the cached datasets path, forcing re-detection on next call.
-
+ """Clear the cached datasets path, forcing re-detection on next call.
+
This is useful when the environment changes (e.g., user installs/uninstalls
the datasets CLI) or when the cached binary becomes unavailable.
"""
@@ -442,8 +477,7 @@ def _clear_datasets_cache():
def _get_datasets_path():
- """
- Get the path to the NCBI datasets CLI binary.
+ """Get the path to the NCBI datasets CLI binary.
This helper first checks if datasets is available in the system PATH.
If found, it uses the system-installed version. Otherwise, it falls back
@@ -453,14 +487,17 @@ def _get_datasets_path():
invalidated if the cached binary becomes unavailable (e.g., deleted or environment
changed), triggering re-detection.
- Returns:
+ Returns
+ -------
str: Path to the datasets binary ("datasets" for system PATH, or full path for bundled).
- Raises:
+ Raises
+ ------
RuntimeError: If no working datasets binary is available.
+
"""
global _datasets_path_cache
-
+
# If we have a cached path, validate it's still working
if _datasets_path_cache is not None:
if _validate_datasets_binary(_datasets_path_cache):
@@ -468,11 +505,11 @@ def _get_datasets_path():
else:
# Cached binary is no longer valid, clear cache and re-detect
logger.warning(
- "⚠️ Previously cached datasets binary at '%s' is no longer available. "
- "Re-detecting...", _datasets_path_cache
+ "⚠️ Previously cached datasets binary at '%s' is no longer available. Re-detecting...",
+ _datasets_path_cache,
)
_clear_datasets_cache()
-
+
# First, check if datasets is available in the system PATH
datasets_path = shutil.which("datasets")
if datasets_path:
@@ -484,17 +521,15 @@ def _get_datasets_path():
timeout=SUBPROCESS_VERSION_TIMEOUT,
)
if result.returncode == 0:
- logger.info(
- "✅ Using system-installed NCBI datasets CLI: %s", result.stdout.strip()
- )
+ logger.info("✅ Using system-installed NCBI datasets CLI: %s", result.stdout.strip())
_datasets_path_cache = datasets_path
return datasets_path
except (subprocess.TimeoutExpired, OSError):
pass # System binary didn't work, try bundled
-
+
# Fall back to the bundled binary
datasets_path = PRECOMPILED_DATASETS_PATH
-
+
# Check if the precompiled binary exists
if not os.path.isfile(datasets_path):
raise RuntimeError(
@@ -503,16 +538,14 @@ def _get_datasets_path():
"or install the NCBI datasets CLI manually: "
"https://www.ncbi.nlm.nih.gov/datasets/docs/v2/download-and-install/"
)
-
+
# On non-Windows systems, ensure the binary is executable
if platform.system() != "Windows":
try:
os.chmod(datasets_path, os.stat(datasets_path).st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
except OSError as e:
- raise RuntimeError(
- f"Failed to make NCBI datasets binary executable: {e}"
- )
-
+ raise RuntimeError(f"Failed to make NCBI datasets binary executable: {e}") from e
+
# Verify the bundled binary works
try:
result = subprocess.run(
@@ -522,30 +555,25 @@ def _get_datasets_path():
timeout=SUBPROCESS_VERSION_TIMEOUT,
)
if result.returncode == 0:
- logger.info(
- "✅ Using bundled NCBI datasets CLI: %s", result.stdout.strip()
- )
+ logger.info("✅ Using bundled NCBI datasets CLI: %s", result.stdout.strip())
_datasets_path_cache = datasets_path
return datasets_path
except (subprocess.TimeoutExpired, OSError) as e:
- raise RuntimeError(
- f"Failed to verify bundled NCBI datasets binary at {datasets_path}: {e}"
- )
-
- raise RuntimeError(
- f"NCBI datasets binary at {datasets_path} failed verification."
- )
+ raise RuntimeError(f"Failed to verify bundled NCBI datasets binary at {datasets_path}: {e}") from e
+
+ raise RuntimeError(f"NCBI datasets binary at {datasets_path} failed verification.")
def _get_datasets_version():
- """
- Get the version of the NCBI datasets CLI if available.
-
+ """Get the version of the NCBI datasets CLI if available.
+
Attempts to retrieve the version string from the datasets binary.
Returns None if datasets is not available or version check fails.
-
- Returns:
+
+ Returns
+ -------
str or None: Version string from datasets (e.g., "16.11.0") or None if unavailable.
+
"""
try:
datasets_path = _get_datasets_path()
@@ -562,28 +590,29 @@ def _get_datasets_version():
return version_output
except (RuntimeError, subprocess.TimeoutExpired, OSError) as e:
logger.debug("Could not retrieve datasets version: %s", e)
-
+
return None
def _get_gget_version():
- """
- Get the version of gget.
-
- Returns:
+ """Get the version of gget.
+
+ Returns
+ -------
str: Version string (e.g., "1.2.0") or "unknown" if not available.
+
"""
try:
from . import __version__
+
return __version__
except (ImportError, AttributeError):
return "unknown"
def _get_modified_virus_name(virus_name, attempt=1):
- """
- Modify the virus name for retry attempts when the NCBI server is unreachable.
-
+ """Modify the virus name for retry attempts when the NCBI server is unreachable.
+
This function generates alternative virus names to try when the initial
query fails due to server unreachability. The modification strategies are:
1. (attempt=1) If the name contains parentheses, remove them and their contents.
@@ -592,14 +621,15 @@ def _get_modified_virus_name(virus_name, attempt=1):
(e.g., "Dengue" -> "Dengue virus")
3. (attempt=2) If the name ends with "virus" without a space, add a space.
(e.g., "Denguevirus" -> "Dengue virus")
-
+
Args:
virus_name (str): Original virus name that failed.
attempt (int): Which modification attempt this is (1 or 2).
-
- Returns:
+
+ Returns
+ -------
str or None: Modified virus name to retry, or None if no modification is possible.
-
+
Example:
>>> _get_modified_virus_name("Lassa virus (LASV)", attempt=1)
'Lassa virus'
@@ -611,160 +641,148 @@ def _get_modified_virus_name(virus_name, attempt=1):
'Dengue virus'
>>> _get_modified_virus_name("Dengue virus", attempt=2)
None # Already contains "virus" properly
+
"""
if not virus_name:
return None
-
+
virus_lower = virus_name.lower().strip()
-
+
# Attempt 1: Try removing parenthetical content
if attempt == 1:
# Check if there are parentheses to remove
- if '(' in virus_name and ')' in virus_name:
+ if "(" in virus_name and ")" in virus_name:
# Remove parenthetical content (e.g., "(LASV)" or "(strain XYZ)")
- modified = re.sub(r'\s*\([^)]*\)\s*', ' ', virus_name).strip()
+ modified = re.sub(r"\s*\([^)]*\)\s*", " ", virus_name).strip()
# Clean up any double spaces
- modified = re.sub(r'\s+', ' ', modified)
+ modified = re.sub(r"\s+", " ", modified)
if modified and modified.lower() != virus_lower:
- logger.debug("Modified virus name by removing parentheses: '%s' -> '%s'",
- virus_name, modified)
+ logger.debug("Modified virus name by removing parentheses: '%s' -> '%s'", virus_name, modified)
return modified
return None
-
+
# Attempt 2: Try adding "virus" suffix or spacing
if attempt == 2:
# Check if the name already contains "virus" anywhere (case-insensitive)
if "virus" in virus_lower:
# Add a space before "virus" only if there isn't one already
idx = virus_name.lower().rfind("virus")
- if idx > 0 and virus_name[idx - 1] != ' ':
+ if idx > 0 and virus_name[idx - 1] != " ":
modified = virus_name[:idx] + " " + virus_name[idx:]
- logger.debug("Modified virus name by adding space before 'virus': '%s' -> '%s'",
- virus_name, modified)
+ logger.debug("Modified virus name by adding space before 'virus': '%s' -> '%s'", virus_name, modified)
return modified
# Already has "virus" correctly spaced in the name, no modification needed
return None
-
+
# Name doesn't contain "virus" anywhere, so append " virus"
modified = virus_name + " virus"
- logger.debug("Modified virus name by appending ' virus': '%s' -> '%s'",
- virus_name, modified)
+ logger.debug("Modified virus name by appending ' virus': '%s' -> '%s'", virus_name, modified)
return modified
-
+
return None
def _parse_accession_input(accession_input):
- """
- Parse accession input which can be:
+ """Parse accession input which can be:
+
1. Single accession: 'NC_045512.2'
2. Space-separated accessions: 'NC_045512.2 MN908947.3 MT020781.1'
- 3. Path to text file: '/path/to/accessions.txt' (one accession per line)
-
+ 3. Path to text file: '/path/to/accessions.txt' (one accession per line).
+
Args:
accession_input (str): The accession input string.
-
- Returns:
+
+ Returns
+ -------
dict: A dictionary with keys:
- 'type': 'single', 'list', or 'file'
- 'accessions': list of accession strings (for 'list' type) or single accession (for 'single')
- 'file_path': file path (for 'file' type only)
- 'is_file': True if input is a file path
-
- Raises:
+
+ Raises
+ ------
ValueError: If file path doesn't exist or file is empty.
-
+
Example:
- >>> _parse_accession_input('NC_045512.2')
+ >>> _parse_accession_input("NC_045512.2")
{'type': 'single', 'accessions': 'NC_045512.2', 'file_path': None, 'is_file': False}
-
- >>> _parse_accession_input('NC_045512.2 MN908947.3')
+
+ >>> _parse_accession_input("NC_045512.2 MN908947.3")
{'type': 'list', 'accessions': ['NC_045512.2', 'MN908947.3'], 'file_path': None, 'is_file': False}
-
- >>> _parse_accession_input('/path/to/accessions.txt')
+
+ >>> _parse_accession_input("/path/to/accessions.txt")
{'type': 'file', 'accessions': ['NC_045512.2', 'MN908947.3', ...], 'file_path': '/path/to/accessions.txt', 'is_file': True}
+
"""
accession_input = accession_input.strip()
-
+
# Check if input is a file path
if os.path.isfile(accession_input):
logger.info("Parsing accession numbers from file: %s", accession_input)
try:
- with open(accession_input, 'r') as f:
+ with open(accession_input) as f:
accessions = [line.strip() for line in f if line.strip()]
-
+
if not accessions:
raise ValueError(f"Accession file {accession_input} is empty.")
-
+
logger.info("Loaded %d accession(s) from file", len(accessions))
- return {
- 'type': 'file',
- 'accessions': accessions,
- 'file_path': accession_input,
- 'is_file': True
- }
- except IOError as e:
- raise ValueError(f"Error reading accession file {accession_input}: {e}")
-
+ return {"type": "file", "accessions": accessions, "file_path": accession_input, "is_file": True}
+ except OSError as e:
+ raise ValueError(f"Error reading accession file {accession_input}: {e}") from e
+
# Check if input is space-separated accessions
- if ' ' in accession_input:
+ if " " in accession_input:
accessions = accession_input.split()
logger.info("Parsed %d accession(s) from space-separated input", len(accessions))
- return {
- 'type': 'list',
- 'accessions': accessions,
- 'file_path': None,
- 'is_file': False
- }
-
+ return {"type": "list", "accessions": accessions, "file_path": None, "is_file": False}
+
# Single accession
logger.debug("Single accession input: %s", accession_input)
- return {
- 'type': 'single',
- 'accessions': accession_input,
- 'file_path': None,
- 'is_file': False
- }
+ return {"type": "single", "accessions": accession_input, "file_path": None, "is_file": False}
def _parse_baseline_file(baseline_path):
- """
- Parse a baseline metadata file to extract accession numbers for deduplication.
-
+ """Parse a baseline metadata file to extract accession numbers for deduplication.
+
Supports multiple file formats:
- CSV: Looks for 'accession' column (case-insensitive)
- JSONL: Looks for 'accession' key in each JSON object
- JSON: Looks for 'accession' key in a list of objects
- Text: Treats each non-empty line as an accession number
-
+
Accession numbers are normalized (stripped, lowercased) for consistent comparison.
-
+
Args:
baseline_path (str): Path to the baseline metadata file.
-
- Returns:
+
+ Returns
+ -------
set: Set of normalized accession numbers from the baseline file.
-
- Raises:
+
+ Raises
+ ------
FileNotFoundError: If the baseline file does not exist.
ValueError: If no accessions could be extracted.
+
"""
if not baseline_path or not os.path.exists(baseline_path):
raise FileNotFoundError(f"Baseline file not found: {baseline_path}")
-
+
baseline_accessions = set()
file_ext = os.path.splitext(baseline_path)[1].lower()
logger.info("Parsing baseline file: %s (format: %s)", baseline_path, file_ext or "auto-detect")
-
+
try:
- if file_ext == '.csv':
+ if file_ext == ".csv":
# CSV format: look for 'accession' column
df = pd.read_csv(baseline_path, low_memory=False)
# Case-insensitive column name search
acc_col = None
for col in df.columns:
- if col.strip().lower() == 'accession':
+ if col.strip().lower() == "accession":
acc_col = col
break
if acc_col is None:
@@ -772,33 +790,31 @@ def _parse_baseline_file(baseline_path):
f"Baseline CSV file '{baseline_path}' has no 'accession' column. "
f"Available columns: {list(df.columns)}"
)
- baseline_accessions = set(
- str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip()
- )
-
- elif file_ext == '.jsonl':
+ baseline_accessions = {str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip()}
+
+ elif file_ext == ".jsonl":
# JSONL format: one JSON object per line
- with open(baseline_path, 'r', encoding='utf-8') as f:
+ with open(baseline_path, encoding="utf-8") as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
try:
record = json.loads(line)
- acc = record.get('accession', '')
+ acc = record.get("accession", "")
if acc and str(acc).strip():
baseline_accessions.add(str(acc).strip().lower())
except json.JSONDecodeError:
logger.debug("Skipping invalid JSON on line %d of baseline file", line_num)
-
- elif file_ext == '.json':
+
+ elif file_ext == ".json":
# JSON format: list of objects
- with open(baseline_path, 'r', encoding='utf-8') as f:
+ with open(baseline_path, encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, list):
for record in data:
if isinstance(record, dict):
- acc = record.get('accession', '')
+ acc = record.get("accession", "")
if acc and str(acc).strip():
baseline_accessions.add(str(acc).strip().lower())
elif isinstance(data, dict):
@@ -812,52 +828,50 @@ def _parse_baseline_file(baseline_path):
df = pd.read_csv(baseline_path, low_memory=False)
acc_col = None
for col in df.columns:
- if col.strip().lower() == 'accession':
+ if col.strip().lower() == "accession":
acc_col = col
break
if acc_col is not None:
- baseline_accessions = set(
- str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip()
- )
+ baseline_accessions = {str(acc).strip().lower() for acc in df[acc_col].dropna() if str(acc).strip()}
logger.debug("Auto-detected CSV format with 'accession' column")
else:
raise ValueError("No accession column found, trying text format")
except (ValueError, pd.errors.ParserError):
# Fall back to text format: one accession per line
- with open(baseline_path, 'r', encoding='utf-8') as f:
+ with open(baseline_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
- if line and not line.startswith('#'):
+ if line and not line.startswith("#"):
# Take first whitespace-delimited token as accession
acc = line.split()[0]
baseline_accessions.add(acc.lower())
logger.debug("Parsed as text format (one accession per line)")
-
+
except (FileNotFoundError, ValueError):
raise
except Exception as e:
raise ValueError(f"Failed to parse baseline file '{baseline_path}': {e}") from e
-
+
if not baseline_accessions:
raise ValueError(
- f"No accessions found in baseline file '{baseline_path}'. "
- f"Ensure the file contains accession numbers."
+ f"No accessions found in baseline file '{baseline_path}'. Ensure the file contains accession numbers."
)
-
+
logger.info("✅ Loaded %d accessions from baseline file", len(baseline_accessions))
return baseline_accessions
def _deduplicate_metadata_against_baseline(metadata_dict, baseline_accessions):
- """
- Remove metadata records whose accessions are already in the baseline set.
-
+ """Remove metadata records whose accessions are already in the baseline set.
+
Args:
metadata_dict (dict): Dictionary mapping accession -> metadata.
baseline_accessions (set): Set of normalized accession numbers from baseline.
-
- Returns:
+
+ Returns
+ -------
tuple: (new_metadata_dict, skipped_count)
+
"""
new_metadata = {}
skipped_count = 0
@@ -868,14 +882,12 @@ def _deduplicate_metadata_against_baseline(metadata_dict, baseline_accessions):
else:
new_metadata[acc] = meta
- logger.info("Deduplication results: %d new, %d skipped (already in baseline)",
- len(new_metadata), skipped_count)
+ logger.info("Deduplication results: %d new, %d skipped (already in baseline)", len(new_metadata), skipped_count)
return new_metadata, skipped_count
def _save_partial_metadata(metadata_dict, outfolder, virus_clean, reason="api_failure"):
- """
- Save partial metadata to CSV for recovery via --baseline.
+ """Save partial metadata to CSV for recovery via --baseline.
Args:
metadata_dict (dict): Dictionary mapping accession -> metadata.
@@ -883,8 +895,10 @@ def _save_partial_metadata(metadata_dict, outfolder, virus_clean, reason="api_fa
virus_clean (str): Sanitized virus name for the filename.
reason (str): Reason for saving (for the filename).
- Returns:
+ Returns
+ -------
str or None: Path to the saved partial metadata file.
+
"""
if not metadata_dict:
return None
@@ -895,48 +909,56 @@ def _save_partial_metadata(metadata_dict, outfolder, virus_clean, reason="api_fa
try:
rows = []
for acc, meta in metadata_dict.items():
- row = {'accession': acc}
- for key in ['virus_name', 'length', 'completeness', 'releaseDate',
- 'location', 'sourceDatabase', 'isolateName']:
+ row = {"accession": acc}
+ for key in [
+ "virus_name",
+ "length",
+ "completeness",
+ "releaseDate",
+ "location",
+ "sourceDatabase",
+ "isolateName",
+ ]:
if key in meta:
row[key] = meta[key]
- host_info = meta.get('host', {})
+ host_info = meta.get("host", {})
if isinstance(host_info, dict):
- row['host'] = host_info.get('organism_name', '')
+ row["host"] = host_info.get("organism_name", "")
elif host_info:
- row['host'] = str(host_info)
+ row["host"] = str(host_info)
rows.append(row)
df = pd.DataFrame(rows)
df.to_csv(partial_file, index=False)
logger.info("Partial metadata saved: %s (%d records)", partial_file, len(df))
return partial_file
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("Failed to save partial metadata: %s", e)
return None
def _merge_baseline_with_new(baseline_path, new_metadata_list, output_path):
- """
- Merge baseline metadata with newly fetched metadata into a single CSV.
+ """Merge baseline metadata with newly fetched metadata into a single CSV.
Args:
baseline_path (str): Path to the baseline metadata file.
new_metadata_list (list): List of new metadata dictionaries.
output_path (str): Path for the merged CSV output.
- Returns:
+ Returns
+ -------
bool: True if merge was successful, False otherwise.
+
"""
try:
# Load baseline data
file_ext = os.path.splitext(baseline_path)[1].lower()
-
- if file_ext == '.csv':
+
+ if file_ext == ".csv":
baseline_df = pd.read_csv(baseline_path, low_memory=False)
- elif file_ext == '.jsonl':
+ elif file_ext == ".jsonl":
records = []
- with open(baseline_path, 'r', encoding='utf-8') as f:
+ with open(baseline_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
@@ -945,106 +967,113 @@ def _merge_baseline_with_new(baseline_path, new_metadata_list, output_path):
except json.JSONDecodeError:
continue
baseline_df = pd.DataFrame(records)
- elif file_ext == '.json':
- with open(baseline_path, 'r', encoding='utf-8') as f:
+ elif file_ext == ".json":
+ with open(baseline_path, encoding="utf-8") as f:
data = json.load(f)
if isinstance(data, list):
baseline_df = pd.DataFrame(data)
elif isinstance(data, dict):
- baseline_df = pd.DataFrame(list(data.values()) if all(isinstance(v, dict) for v in data.values()) else [data])
+ baseline_df = pd.DataFrame(
+ list(data.values()) if all(isinstance(v, dict) for v in data.values()) else [data]
+ )
else:
baseline_df = pd.DataFrame()
else:
accessions = []
- with open(baseline_path, 'r', encoding='utf-8') as f:
+ with open(baseline_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
- if line and not line.startswith('#'):
+ if line and not line.startswith("#"):
accessions.append(line.split()[0])
- baseline_df = pd.DataFrame({'accession': accessions})
-
+ baseline_df = pd.DataFrame({"accession": accessions})
+
# Create new DataFrame from new metadata
if new_metadata_list:
new_df = pd.DataFrame(new_metadata_list)
else:
new_df = pd.DataFrame()
-
+
# Merge: concatenate baseline + new
merged_df = pd.concat([baseline_df, new_df], ignore_index=True, sort=False)
acc_col = None
for col in merged_df.columns:
- if col.strip().lower() == 'accession':
+ if col.strip().lower() == "accession":
acc_col = col
break
if acc_col:
- merged_df = merged_df.drop_duplicates(subset=[acc_col], keep='last')
-
+ merged_df = merged_df.drop_duplicates(subset=[acc_col], keep="last")
+
merged_df.to_csv(output_path, index=False)
logger.info("Merged output saved: %s (%d total records)", output_path, len(merged_df))
return True
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.error("❌ Failed to merge baseline with new metadata: %s", e)
return False
def _calculate_max_accessions_per_batch(base_url_length):
- """
- Calculate the maximum number of accessions that can fit in a single API URL.
-
+ """Calculate the maximum number of accessions that can fit in a single API URL.
+
The NCBI API URL format for multiple accessions is:
https://api.ncbi.nlm.nih.gov/datasets/v2/virus/accession/ACC1%2CACC2%2CACC3/dataset_report
-
+
Args:
base_url_length (int): Length of the base URL without accessions.
-
- Returns:
+
+ Returns
+ -------
int: Maximum number of accessions per batch.
-
+
Example:
>>> _calculate_max_accessions_per_batch(80)
100 # Approximate, depends on accession lengths
+
"""
# Calculate available space for accessions
- available_length = MAX_URL_LENGTH - base_url_length - BUFFER_SIZE
-
+ available_length = MAX_URL_LENGTH - base_url_length - BUFFER_SIZE
+
# Each accession takes: average accession length + URL-encoded comma (%2C = 3 chars)
chars_per_accession = ACCESSION_AVG_LENGTH + len(ACCESSION_URL_ENCODING)
-
+
max_accessions = max(1, available_length // chars_per_accession)
- logger.debug("Calculated max accessions per batch: %d (URL limit: %d, base URL: %d)",
- max_accessions, MAX_URL_LENGTH, base_url_length)
-
+ logger.debug(
+ "Calculated max accessions per batch: %d (URL limit: %d, base URL: %d)",
+ max_accessions,
+ MAX_URL_LENGTH,
+ base_url_length,
+ )
+
return max_accessions
def _batch_accessions_for_url(accessions, base_url_length):
- """
- Split a list of accessions into batches that fit within URL length limits.
-
+ """Split a list of accessions into batches that fit within URL length limits.
+
Args:
accessions (list): List of accession numbers.
base_url_length (int): Length of the base URL without accessions.
-
- Returns:
+
+ Returns
+ -------
list: List of accession batches (each batch is a list of accessions).
-
+
Example:
- >>> batches = _batch_accessions_for_url(['NC_045512.2', 'MN908947.3', ...], 80)
+ >>> batches = _batch_accessions_for_url(["NC_045512.2", "MN908947.3", ...], 80)
>>> len(batches) # Number of batches needed
3
+
"""
max_per_batch = _calculate_max_accessions_per_batch(base_url_length)
-
+
batches = []
for i in range(0, len(accessions), max_per_batch):
- batch = accessions[i:i + max_per_batch]
+ batch = accessions[i : i + max_per_batch]
batches.append(batch)
-
- logger.info("Split %d accessions into %d batches (max %d per batch)",
- len(accessions), len(batches), max_per_batch)
-
+
+ logger.info("Split %d accessions into %d batches (max %d per batch)", len(accessions), len(batches), max_per_batch)
+
return batches
@@ -1059,18 +1088,17 @@ def _fetch_metadata_for_accession_list(
failed_commands=None,
temp_output_dir=None,
):
- """
- Fetch metadata for a list of accessions, handling URL length limits with retries.
-
+ """Fetch metadata for a list of accessions, handling URL length limits with retries.
+
This function fetches metadata for multiple accessions by:
1. Splitting the accession list into batches that fit within URL limits
2. Making separate API calls for each batch with exponential backoff retries
3. Combining all results into a single list
4. Continuing processing even if some batches fail (graceful degradation)
-
+
The NCBI API URL format for multiple accessions is:
https://api.ncbi.nlm.nih.gov/datasets/v2/virus/accession/ACC1%2CACC2%2CACC3/dataset_report
-
+
Args:
accessions (list): List of accession numbers to fetch.
host (str, optional): Host organism filter.
@@ -1081,48 +1109,59 @@ def _fetch_metadata_for_accession_list(
refseq_only (bool, optional): RefSeq only filter.
failed_commands (dict, optional): Dictionary to track failed operations.
temp_output_dir (str, optional): Directory for temporary files.
-
- Returns:
- list: Combined list of metadata records from all batches.
+
+ Returns
+ -------
+ list: Combined list of metadata records from all batches.
Returns partial results even if some batches fail.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If all batches fail to fetch.
+
"""
if not accessions:
logger.warning("No accessions provided to fetch metadata for")
return []
-
+
# Initialize failed_commands tracking if not already done
- if failed_commands is not None and 'api_batches' not in failed_commands:
- failed_commands['api_batches'] = []
-
+ if failed_commands is not None and "api_batches" not in failed_commands:
+ failed_commands["api_batches"] = []
+
# Calculate base URL length for batch sizing
# BUFFER_SIZE accounts for query parameters (filters) added by fetch_virus_metadata
base_url_length = len(f"{NCBI_API_BASE}/virus/accession//dataset_report")
-
+
# Split accessions into URL-safe batches
batches = _batch_accessions_for_url(accessions, base_url_length)
-
+
all_reports = []
failed_batches = []
aggregated_deferred_filters = None # Track deferred filters from batches
-
- logger.info("Fetching metadata for %d accessions in %d batch(es) with exponential backoff retries",
- len(accessions), len(batches))
-
- for batch_num, batch in tqdm(enumerate(batches, 1), total=len(batches), desc="Fetching accession batches", unit="batch", disable=len(batches)==1):
- logger.info("Processing accession batch %d/%d (%d accessions)",
- batch_num, len(batches), len(batch))
-
+
+ logger.info(
+ "Fetching metadata for %d accessions in %d batch(es) with exponential backoff retries",
+ len(accessions),
+ len(batches),
+ )
+
+ for batch_num, batch in tqdm(
+ enumerate(batches, 1),
+ total=len(batches),
+ desc="Fetching accession batches",
+ unit="batch",
+ disable=len(batches) == 1,
+ ):
+ logger.info("Processing accession batch %d/%d (%d accessions)", batch_num, len(batches), len(batch))
+
# Join accessions with URL-encoded comma for the API URL
accession_string = ACCESSION_URL_ENCODING.join(batch)
-
+
# Define the fetch operation for retries
def fetch_batch_metadata():
- """Callable for retry helper"""
+ """Callable for retry helper."""
return fetch_virus_metadata(
- virus=accession_string,
+ virus=accession_string, # noqa: B023
accession=True, # This is an accession-based query
host=host,
geographic_location=geographic_location,
@@ -1133,7 +1172,7 @@ def fetch_batch_metadata():
failed_commands=failed_commands,
temp_output_dir=temp_output_dir,
)
-
+
# Use exponential backoff helper for batch retries
success, batch_result, error_info = _retry_with_exponential_backoff(
operation_name=f"Accession batch {batch_num}/{len(batches)} ({len(batch)} accessions)",
@@ -1141,10 +1180,14 @@ def fetch_batch_metadata():
max_retries=API_MAX_RETRIES,
initial_delay=API_INITIAL_RETRY_DELAY,
backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER,
- retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout),
+ retryable_exceptions=(
+ requests.exceptions.ConnectionError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.Timeout,
+ ),
failed_commands=failed_commands,
)
-
+
# Unpack the tuple result from fetch_virus_metadata
batch_reports = None
batch_deferred_filters = None
@@ -1154,13 +1197,13 @@ def fetch_batch_metadata():
else:
# Backward compatibility if result is just a list
batch_reports = batch_result
-
+
# If batch_reports is a file path (string), read reports from it
# This happens when fetch_virus_metadata streams to disk
if isinstance(batch_reports, str) and os.path.isfile(batch_reports):
file_reports = []
try:
- with open(batch_reports, 'r', encoding='utf-8') as bf:
+ with open(batch_reports, encoding="utf-8") as bf:
for line in bf:
line = line.strip()
if line:
@@ -1168,10 +1211,10 @@ def fetch_batch_metadata():
file_reports.append(json.loads(line))
except json.JSONDecodeError:
continue
- except IOError:
+ except OSError:
file_reports = []
batch_reports = file_reports
-
+
if success and batch_reports:
all_reports.extend(batch_reports)
# Track deferred filters (should be the same across all batches if any)
@@ -1181,9 +1224,9 @@ def fetch_batch_metadata():
tqdm.write(f"✅ Batch {batch_num}: Retrieved {len(batch_reports)} records")
else:
# Batch failed or returned empty
- error_msg = error_info['error'] if error_info else "No data returned"
+ error_msg = error_info["error"] if error_info else "No data returned"
tqdm.write(f"❌ Batch {batch_num} failed after {API_MAX_RETRIES} retries: {error_msg}")
-
+
# Build URL with applied filters for manual retry
base_url = f"{NCBI_API_BASE}/virus/accession/{accession_string}/dataset_report"
query_params = []
@@ -1199,55 +1242,61 @@ def fetch_batch_metadata():
query_params.append(f"filter.geo_location={geographic_location.replace('_', ' ')}")
if min_release_date:
query_params.append(f"filter.released_since={min_release_date}T00:00:00.000Z")
-
+
api_url = base_url + ("?" + "&".join(query_params) if query_params else "")
-
+
failed_batch_info = {
- 'batch_num': batch_num,
- 'accession_count': len(batch),
- 'accessions': batch,
- 'api_url': api_url,
+ "batch_num": batch_num,
+ "accession_count": len(batch),
+ "accessions": batch,
+ "api_url": api_url,
}
failed_batches.append(failed_batch_info)
-
+
# Track in failed_commands for later reporting
_track_failed_operation(
failed_commands,
- 'api_batches',
+ "api_batches",
failed_batch_info,
- error_info if error_info else {'error': 'No data returned', 'exception_type': 'EmptyResponse'}
+ error_info if error_info else {"error": "No data returned", "exception_type": "EmptyResponse"},
)
-
+
# Add a small delay between batches to respect rate limits
if batch_num < len(batches):
time.sleep(EUTILS_INTER_BATCH_DELAY)
-
+
# Log summary
if failed_batches:
- logger.warning("⚠️ %d out of %d accession batches failed to fetch metadata",
- len(failed_batches), len(batches))
+ logger.warning("⚠️ %d out of %d accession batches failed to fetch metadata", len(failed_batches), len(batches))
for fb in failed_batches:
- logger.debug("Failed batch %d (%d accessions): %s",
- fb['batch_num'], fb['accession_count'], fb['accessions'][:3])
-
+ logger.debug(
+ "Failed batch %d (%d accessions): %s", fb["batch_num"], fb["accession_count"], fb["accessions"][:3]
+ )
+
# Continue with partial results if at least some batches succeeded
if all_reports:
- logger.info("Successfully retrieved %d total metadata records from %d batches",
- len(all_reports), len(batches) - len(failed_batches))
+ logger.info(
+ "Successfully retrieved %d total metadata records from %d batches",
+ len(all_reports),
+ len(batches) - len(failed_batches),
+ )
if failed_batches:
- logger.warning("⚠️ Continuing pipeline with partial results (%d/%d batches succeeded)",
- len(batches) - len(failed_batches), len(batches))
+ logger.warning(
+ "⚠️ Continuing pipeline with partial results (%d/%d batches succeeded)",
+ len(batches) - len(failed_batches),
+ len(batches),
+ )
if aggregated_deferred_filters:
logger.info("Deferred filters will be applied in metadata filtering stage: %s", aggregated_deferred_filters)
return all_reports, aggregated_deferred_filters
-
+
# Only raise if ALL batches failed
if failed_batches:
raise RuntimeError(
f"All {len(batches)} accession batches failed to fetch metadata. "
f"Last error: {failed_batches[-1]['accessions']}"
)
-
+
# Fallback (shouldn't reach here)
logger.warning("No accession batches were processed")
return [], None
@@ -1265,14 +1314,13 @@ def _try_modified_virus_names(
failed_commands,
_retry_attempt,
error_type="Error",
- temp_output_dir=None
+ temp_output_dir=None,
):
- """
- Try fetching virus metadata with modified virus names.
-
+ """Try fetching virus metadata with modified virus names.
+
This helper function iterates through available retry strategies (modification
of virus name) and attempts to fetch metadata with each modified name.
-
+
Args:
virus: Original virus name.
accession: Accession filter.
@@ -1285,20 +1333,26 @@ def _try_modified_virus_names(
failed_commands: List to track failed commands.
_retry_attempt: Current retry attempt number.
error_type: String describing the error type for logging.
-
- Returns:
+
+ Returns
+ -------
list or None: The fetched metadata if successful, None if all retries failed.
+
"""
if _retry_attempt >= 2 or accession or virus.isdigit():
return None
-
+
# Try modification strategies in order
for attempt_num in range(_retry_attempt + 1, 3): # Try remaining attempts (1 and/or 2)
modified_virus = _get_modified_virus_name(virus, attempt=attempt_num)
if modified_virus:
- logger.warning("%s with virus name '%s'. "
- "Retrying with modified name: '%s' (strategy %d)",
- error_type, virus, modified_virus, attempt_num)
+ logger.warning(
+ "%s with virus name '%s'. Retrying with modified name: '%s' (strategy %d)",
+ error_type,
+ virus,
+ modified_virus,
+ attempt_num,
+ )
try:
return fetch_virus_metadata(
virus=modified_virus,
@@ -1317,7 +1371,7 @@ def _try_modified_virus_names(
logger.warning("Retry with modified virus name '%s' failed", modified_virus)
# Continue to try next strategy
continue
-
+
# All retry strategies exhausted
logger.warning("All retry strategies failed")
return None
@@ -1336,18 +1390,17 @@ def fetch_virus_metadata(
_retry_attempt=0,
temp_output_dir=None,
):
- """
- Fetch virus metadata using NCBI Datasets API.
-
+ """Fetch virus metadata using NCBI Datasets API.
+
This function retrieves metadata for virus sequences from the NCBI Datasets API
using either taxon-based or accession-based queries. It handles pagination
automatically to retrieve all available results.
-
+
When the server is unreachable, this function will automatically retry with
modified virus names:
1. First retry: Remove parenthetical content (e.g., "(LASV)")
2. Second retry: Add " virus" suffix or fix spacing
-
+
Args:
virus (str): Virus taxon name/ID or accession number.
accession (bool): Whether virus parameter is an accession number.
@@ -1359,26 +1412,28 @@ def fetch_virus_metadata(
refseq_only (bool, optional): Limit to RefSeq genomes only.
failed_commands (dict, optional): Dictionary to track failed operations.
_retry_attempt (int): Internal counter for retry attempts (0=original, 1=first retry, 2=second retry).
-
- Returns:
+
+ Returns
+ -------
list: List of virus metadata records from the API response.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If the API request fails.
-
+
Note:
Metadata is streamed to a temporary JSONL file during fetching to reduce RAM usage
for large datasets. If temp_output_dir is provided, the file is saved there;
otherwise it's saved in the system temp directory.
+
"""
-
metadata_file = None
temp_metadata_file = None
-
+
# Save original filter values before URL encoding (for deferred local filtering)
original_geographic_location = geographic_location
original_host = host
-
+
# Choose the appropriate API endpoint based on whether we're querying by accession or taxon
if accession:
# For accession numbers (e.g., NC_045512.2), use the accession-specific endpoint
@@ -1388,7 +1443,7 @@ def fetch_virus_metadata(
logger.debug("Using accession endpoint for virus: %s", virus)
params = {}
else:
- # For taxon names/IDs (e.g., 'Zika Virus', 'influenza'), use the taxon endpoint
+ # For taxon names/IDs (e.g., 'Zika Virus', 'influenza'), use the taxon endpoint
url = f"{NCBI_API_BASE}/virus/taxon/{virus}/dataset_report"
logger.debug("Using taxon endpoint for virus: %s", virus)
params = {}
@@ -1397,50 +1452,50 @@ def fetch_virus_metadata(
# These filters are applied server-side before results are returned
if refseq_only:
# Limit results to RefSeq database entries only
- params['filter.refseq_only'] = 'true'
+ params["filter.refseq_only"] = "true"
logger.debug("Applied RefSeq-only filter")
-
+
if annotated is True:
# Only return sequences that have been annotated with gene/protein information
- params['filter.annotated_only'] = 'true'
+ params["filter.annotated_only"] = "true"
logger.debug("Applied annotated-only filter")
-
+
if complete_only:
# Only return complete genome sequences (not partial sequences)
- params['filter.complete_only'] = 'true'
+ params["filter.complete_only"] = "true"
logger.debug("Applied complete-only filter")
-
+
if host:
# Filter by host organism name, replacing underscores with spaces for API compatibility
- host = host.strip('"\'-_<|>`\'')
- host = host.replace('-', '+').replace('_', '+').replace(' ', '+')
- params['filter.host'] = host
+ host = host.strip("\"'-_<|>`'") # noqa: B005
+ host = host.replace("-", "+").replace("_", "+").replace(" ", "+")
+ params["filter.host"] = host
logger.debug("Applied host filter: %s", host)
-
+
if geographic_location:
# Filter by geographic location, replacing underscores with spaces for API compatibility
geographic_location = geographic_location.strip('"-_<|>`')
geographic_location = geographic_location.replace("'", "%27").replace("`", "%27")
- geographic_location = geographic_location.replace('-', '+').replace('_', '+').replace(' ', '+')
- params['filter.geo_location'] = geographic_location
+ geographic_location = geographic_location.replace("-", "+").replace("_", "+").replace(" ", "+")
+ params["filter.geo_location"] = geographic_location
logger.debug("Applied geographic location filter: %s", geographic_location)
if min_release_date:
# Convert date to ISO format expected by the API (YYYY-MM-DDTHH:MM:SS.sssZ)
- params['filter.released_since'] = f"{min_release_date}T00:00:00.000Z"
+ params["filter.released_since"] = f"{min_release_date}T00:00:00.000Z"
logger.debug("Applied minimum release date filter: %s", min_release_date)
# Set page size to maximum allowed to minimize the number of API calls needed
# The NCBI API supports pagination for large result sets
- params['page_size'] = API_PAGE_SIZE
+ params["page_size"] = API_PAGE_SIZE
logger.debug("Set page size to maximum: %d records per request", API_PAGE_SIZE)
-
+
# Initialize variables for handling paginated results
total_records_streamed = 0 # Counter for records written to temp file (NOT held in RAM)
- page_token = None # Token for accessing subsequent pages
- page_count = 0 # Track number of pages processed for logging
- pages_pbar = None # Progress bar for pagination (created when we know total pages)
-
+ page_token = None # Token for accessing subsequent pages
+ page_count = 0 # Track number of pages processed for logging
+ pages_pbar = None # Progress bar for pagination (created when we know total pages)
+
# Create a temporary file to stream metadata as it arrives from the API
# This prevents large datasets from consuming all system RAM
# Save in output temp directory
@@ -1448,23 +1503,23 @@ def fetch_virus_metadata(
temp_metadata_file = os.path.join(temp_output_dir, f"gget_metadata_{timestamp}_{random_suffix}.jsonl")
metadata_file = None
try:
- metadata_file = open(temp_metadata_file, 'w', encoding='utf-8')
+ metadata_file = open(temp_metadata_file, "w", encoding="utf-8")
logger.info("Streaming API metadata to temporary file: %s", temp_metadata_file)
- except IOError as e:
+ except OSError as e:
logger.warning("Could not open temporary metadata file for streaming: %s. Metadata will be held in RAM.", e)
temp_metadata_file = None
-
+
# Main pagination loop - continue until all pages are retrieved
loop = True
while loop:
page_count += 1
-
+
# Add pagination token if we're not on the first page
if page_token:
- params['page_token'] = page_token
-
+ params["page_token"] = page_token
+
def fetch_single_page():
- """Callable that fetches a single page of results"""
+ """Callable that fetches a single page of results."""
# Build the query string manually to preserve '+' characters in filter values
# The requests library would URL-encode '+' to '%2B', but NCBI API expects literal '+'
query_parts = []
@@ -1473,13 +1528,13 @@ def fetch_single_page():
encoded_value = quote(str(value), safe="+:")
query_parts.append(f"{key}={encoded_value}")
full_url = url + ("?" + "&".join(query_parts) if query_parts else "")
-
- # Make the HTTP GET request to the NCBI API
+
+ # Make the HTTP GET request to the NCBI API
logger.debug("Making API request to: %s", url)
logger.debug("Request parameters: %s", params)
response = requests.get(full_url, timeout=API_REQUEST_TIMEOUT)
logger.debug("Explicit URL request sent: %s", response.url)
-
+
# Raise an exception if the HTTP request failed (4xx or 5xx status codes)
response.raise_for_status()
@@ -1491,9 +1546,9 @@ def fetch_single_page():
f"NCBI API returned non-JSON response (HTTP {response.status_code}): {response.text[:200]}"
) from e
logger.debug("Received response with %d bytes", len(response.content))
-
+
return data
-
+
# Use exponential backoff helper for single page fetch
success, page_data, error_info = _retry_with_exponential_backoff(
operation_name=f"API page {page_count}",
@@ -1501,32 +1556,41 @@ def fetch_single_page():
max_retries=API_MAX_RETRIES,
initial_delay=API_INITIAL_RETRY_DELAY,
backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER,
- retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout),
+ retryable_exceptions=(
+ requests.exceptions.ConnectionError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.Timeout,
+ ),
failed_commands=failed_commands,
)
-
+
# If the initial page fetch failed, try filter removal strategies FIRST, then page size reduction
if not success and _retry_attempt == 0:
- logger.debug("⚠️ Page fetch failed with page_size=%d. Trying filter removal strategies...", params['page_size'])
-
+ logger.debug(
+ "⚠️ Page fetch failed with page_size=%d. Trying filter removal strategies...", params["page_size"]
+ )
+
# Helper to close temp files before retry
def close_temp_files():
nonlocal pages_pbar, metadata_file
if pages_pbar:
pages_pbar.close()
pages_pbar = None
- if metadata_file:
+ if metadata_file: # noqa: B023
try:
- metadata_file.close()
+ metadata_file.close() # noqa: B023
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
# STRATEGY 1: If geo_location filter exists, try without it (keeping host)
if not success and geographic_location:
logger.warning("🔄 FETCH FAILED - ATTEMPTING WITHOUT GEOGRAPHIC FILTER")
- logger.warning("Retrying without the geographic_location filter '%s' (will be applied later)...", original_geographic_location)
+ logger.warning(
+ "Retrying without the geographic_location filter '%s' (will be applied later)...",
+ original_geographic_location,
+ )
close_temp_files()
-
+
try:
retry_result = fetch_virus_metadata(
virus=virus,
@@ -1541,26 +1605,31 @@ def close_temp_files():
_retry_attempt=1, # Mark as retry to prevent infinite loops
temp_output_dir=temp_output_dir,
)
-
+
# Handle None return (signals chunking needed) - propagate it
if retry_result is None:
- logger.warning("Retry without geographic filter returned None (dataset too large for single request)")
+ logger.warning(
+ "Retry without geographic filter returned None (dataset too large for single request)"
+ )
else:
retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result
if retry_reports is not None:
logger.info("✅ Successfully retrieved records without geographic filter")
- logger.info("Geographic location filter '%s' will be applied during metadata filtering", original_geographic_location)
- return retry_reports, {'geographic_location': original_geographic_location}
- except Exception as retry_error:
+ logger.info(
+ "Geographic location filter '%s' will be applied during metadata filtering",
+ original_geographic_location,
+ )
+ return retry_reports, {"geographic_location": original_geographic_location}
+ except Exception as retry_error: # noqa: BLE001
logger.warning("Retry without geographic filter failed: %s", retry_error)
-
+
# STRATEGY 2: If BOTH geo_location and host filters exist, try without both
# Skip this strategy for "all viruses" (taxon 10239) since downloading ~15M unfiltered records is not viable as a retry strategy - chunked download handles it
if not success and geographic_location and host and virus != NCBI_ALL_VIRUSES_TAXID:
logger.warning("🔄 ATTEMPTING WITHOUT BOTH GEOGRAPHIC AND HOST FILTERS")
logger.warning("Retrying without both filters (will be applied later)...")
close_temp_files()
-
+
try:
retry_result = fetch_virus_metadata(
virus=virus,
@@ -1575,29 +1644,39 @@ def close_temp_files():
_retry_attempt=1, # Mark as retry to prevent infinite loops
temp_output_dir=temp_output_dir,
)
-
+
# Handle None return (signals chunking needed) - propagate it
if retry_result is None:
- logger.warning("Retry without both filters returned None (dataset too large for single request)")
+ logger.warning(
+ "Retry without both filters returned None (dataset too large for single request)"
+ )
else:
retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result
if retry_reports is not None:
logger.info("✅ Successfully retrieved records without geographic and host filters")
- logger.info("Geographic location filter '%s' will be applied during metadata filtering", original_geographic_location)
+ logger.info(
+ "Geographic location filter '%s' will be applied during metadata filtering",
+ original_geographic_location,
+ )
logger.info("Host filter '%s' will be applied during metadata filtering", original_host)
- return retry_reports, {'geographic_location': original_geographic_location, 'host': original_host}
- except Exception as retry_error:
+ return retry_reports, {
+ "geographic_location": original_geographic_location,
+ "host": original_host,
+ }
+ except Exception as retry_error: # noqa: BLE001
logger.warning("Retry without both filters failed: %s", retry_error)
elif not success and geographic_location and host and virus == NCBI_ALL_VIRUSES_TAXID:
- logger.info("Skipping unfiltered retry for 'all viruses' taxon (dataset too large) - will use chunked download")
-
+ logger.info(
+ "Skipping unfiltered retry for 'all viruses' taxon (dataset too large) - will use chunked download"
+ )
+
# STRATEGY 3: If host filter exists (whether or not geo_location was tried), try without host only
# Skip for "all viruses" taxon - the API also fails with just geo filter for this taxon
if not success and host and virus != NCBI_ALL_VIRUSES_TAXID:
logger.warning("🔄 ATTEMPTING WITHOUT HOST FILTER ONLY")
logger.warning("Retrying without the host filter '%s' (will be applied later)...", original_host)
close_temp_files()
-
+
try:
retry_result = fetch_virus_metadata(
virus=virus,
@@ -1612,7 +1691,7 @@ def close_temp_files():
_retry_attempt=1, # Mark as retry to prevent infinite loops
temp_output_dir=temp_output_dir,
)
-
+
# Handle None return (signals chunking needed) - propagate it
if retry_result is None:
logger.warning("Retry without host filter returned None (dataset too large for single request)")
@@ -1621,35 +1700,39 @@ def close_temp_files():
if retry_reports is not None:
logger.info("✅ Successfully retrieved records without host filter")
logger.info("Host filter '%s' will be applied during metadata filtering", original_host)
- return retry_reports, {'host': original_host}
- except Exception as retry_error:
+ return retry_reports, {"host": original_host}
+ except Exception as retry_error: # noqa: BLE001
logger.warning("❌ Retry without host filter failed: %s", retry_error)
-
+
# STRATEGY 4: If all filter removal strategies failed, try reducing page size
# Skip for all-viruses taxon since the issue is query scope, not page size
- if not success and params['page_size'] > MIN_PAGE_SIZE_FALLBACK and virus != NCBI_ALL_VIRUSES_TAXID:
+ if not success and params["page_size"] > MIN_PAGE_SIZE_FALLBACK and virus != NCBI_ALL_VIRUSES_TAXID:
logger.info("All filter removal strategies failed. Trying smaller page sizes...")
-
+
# Re-open temp file for continued attempts
try:
- metadata_file = open(temp_metadata_file, 'a', encoding='utf-8')
- except IOError:
+ metadata_file = open(temp_metadata_file, "a", encoding="utf-8")
+ except OSError:
metadata_file = None
-
- current_page_size = params['page_size']
+
+ current_page_size = params["page_size"]
page_size_retry_count = 0
-
+
while not success and current_page_size > MIN_PAGE_SIZE_FALLBACK:
# Decrease page size for next retry
current_page_size = max(MIN_PAGE_SIZE_FALLBACK, current_page_size - PAGE_SIZE_FALLBACK_DECREMENT)
page_size_retry_count += 1
-
- logger.debug("📉 Attempting retry #%d with page_size=%d (page %d)",
- page_size_retry_count, current_page_size, page_count)
-
+
+ logger.debug(
+ "📉 Attempting retry #%d with page_size=%d (page %d)",
+ page_size_retry_count,
+ current_page_size,
+ page_count,
+ )
+
# Update params with new page size
- params['page_size'] = current_page_size
-
+ params["page_size"] = current_page_size
+
# Retry the fetch with the smaller page size
success, page_data, error_info = _retry_with_exponential_backoff(
operation_name=f"API page {page_count} (page_size={current_page_size})",
@@ -1657,71 +1740,81 @@ def close_temp_files():
max_retries=API_MAX_RETRIES,
initial_delay=API_INITIAL_RETRY_DELAY,
backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER,
- retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout),
+ retryable_exceptions=(
+ requests.exceptions.ConnectionError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.Timeout,
+ ),
failed_commands=failed_commands,
)
-
+
if success:
- logger.debug("✅ Successfully fetched page with page_size=%d after %d retry attempt(s)",
- current_page_size, page_size_retry_count)
+ logger.debug(
+ "✅ Successfully fetched page with page_size=%d after %d retry attempt(s)",
+ current_page_size,
+ page_size_retry_count,
+ )
# Update the global page_size for remaining pages if successful
- params['page_size'] = current_page_size
-
+ params["page_size"] = current_page_size
+
# If progress bar exists, recalculate total pages based on new page size
if pages_pbar is not None and page_data:
- total_count = page_data.get('total_count', 0)
+ total_count = page_data.get("total_count", 0)
if total_count > 0:
new_total_pages = (total_count + current_page_size - 1) // current_page_size
- logger.debug("📊 Recalculating progress bar: page_size changed to %d, total pages now: %d",
- current_page_size, new_total_pages)
+ logger.debug(
+ "📊 Recalculating progress bar: page_size changed to %d, total pages now: %d",
+ current_page_size,
+ new_total_pages,
+ )
pages_pbar.total = new_total_pages
break
-
+
# If still failed after trying all page sizes down to minimum
if not success:
logger.warning("⚠️ All page size fallback attempts failed (page %d)", page_count)
-
+
# Handle page fetch result
if success and page_data:
# Extract the virus reports from the response
- reports = page_data.get('reports', [])
+ reports = page_data.get("reports", [])
# Create progress bar on first page when we know total pages
# Use current page_size (which may have been reduced) for accurate total page calculation
if pages_pbar is None and page_count == 1:
- total_pages = page_data.get('total_count', 0)
- current_page_size = params['page_size']
+ total_pages = page_data.get("total_count", 0)
+ current_page_size = params["page_size"]
if total_pages > 0:
total_pages = (total_pages + current_page_size - 1) // current_page_size
pages_pbar = tqdm(total=max(total_pages, 1), desc="Fetching pages", unit="page", leave=False)
if pages_pbar:
pages_pbar.update(1)
pages_pbar.set_postfix({"records": total_records_streamed})
-
+
# Stream reports to temporary file if available
if metadata_file and reports:
try:
for report in reports:
- metadata_file.write(json.dumps(report) + '\n')
+ metadata_file.write(json.dumps(report) + "\n")
metadata_file.flush() # Ensure data is written to disk
logger.debug("Streamed %d records to temporary metadata file", len(reports))
- except IOError as e:
+ except OSError as e:
logger.warning("Error writing to temporary metadata file: %s", e)
-
+
# Track count only - records are on disk, NOT held in RAM
total_records_streamed += len(reports)
-
+
# Check if there are more pages to retrieve
- next_page_token = page_data.get('next_page_token')
+ next_page_token = page_data.get("next_page_token")
if not next_page_token:
if pages_pbar:
pages_pbar.close()
loop = False
break
-
+
# Set up for the next page
page_token = next_page_token
logger.debug("Next page token received, continuing pagination...")
-
+
else:
# Page fetch failed after retries or returned empty data
# Handle case where success=True but page_data is empty (error_info will be None)
@@ -1746,12 +1839,15 @@ def close_temp_files():
metadata_file.close()
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
# FALLBACK 1: If geo_location filter exists, try without it
if geographic_location:
logger.warning("🔄 EMPTY RESPONSE - ATTEMPTING WITHOUT GEOGRAPHIC FILTER")
- logger.warning("Retrying without the geographic_location filter '%s' (will be applied locally)...", original_geographic_location)
-
+ logger.warning(
+ "Retrying without the geographic_location filter '%s' (will be applied locally)...",
+ original_geographic_location,
+ )
+
try:
retry_result = fetch_virus_metadata(
virus=virus,
@@ -1766,30 +1862,39 @@ def close_temp_files():
_retry_attempt=1, # Mark as retry to prevent infinite loops
temp_output_dir=temp_output_dir,
)
-
+
# Handle None return (signals chunking needed) - propagate it
if retry_result is None:
- logger.warning("Retry without geographic filter returned None (dataset too large for single request)")
+ logger.warning(
+ "Retry without geographic filter returned None (dataset too large for single request)"
+ )
else:
retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result
if retry_reports is not None:
logger.info("✅ Successfully retrieved records without geographic filter")
- logger.info("Geographic location filter '%s' will be applied during metadata filtering", original_geographic_location)
- return retry_reports, {'geographic_location': original_geographic_location}
- except Exception as retry_error:
+ logger.info(
+ "Geographic location filter '%s' will be applied during metadata filtering",
+ original_geographic_location,
+ )
+ return retry_reports, {"geographic_location": original_geographic_location}
+ except Exception as retry_error: # noqa: BLE001
logger.warning("Retry without geographic filter failed: %s", retry_error)
-
+
# FALLBACK 2: If host filter exists and geo retry failed or wasn't tried
if host:
logger.warning("🔄 ATTEMPTING WITHOUT HOST FILTER")
- logger.warning("Retrying without the host filter '%s' (will be applied locally)...", original_host)
-
+ logger.warning(
+ "Retrying without the host filter '%s' (will be applied locally)...", original_host
+ )
+
try:
retry_result = fetch_virus_metadata(
virus=virus,
accession=accession,
host=None, # Remove host filter
- geographic_location=None if geographic_location else None, # Also remove geo if present
+ geographic_location=None
+ if geographic_location
+ else None, # Also remove geo if present
annotated=annotated,
complete_only=complete_only,
min_release_date=min_release_date,
@@ -1798,71 +1903,80 @@ def close_temp_files():
_retry_attempt=1,
temp_output_dir=temp_output_dir,
)
-
+
# Handle None return (signals chunking needed) - propagate it
if retry_result is None:
- logger.warning("Retry without host filter returned None (dataset too large for single request)")
+ logger.warning(
+ "Retry without host filter returned None (dataset too large for single request)"
+ )
else:
retry_reports = retry_result[0] if isinstance(retry_result, tuple) else retry_result
if retry_reports is not None:
- deferred = {'host': original_host}
+ deferred = {"host": original_host}
if geographic_location:
- deferred['geographic_location'] = original_geographic_location
+ deferred["geographic_location"] = original_geographic_location
logger.info("✅ Successfully retrieved records without filters")
- logger.info("Deferred filters will be applied during metadata filtering: %s", list(deferred.keys()))
+ logger.info(
+ "Deferred filters will be applied during metadata filtering: %s",
+ list(deferred.keys()),
+ )
return retry_reports, deferred
- except Exception as retry_error:
+ except Exception as retry_error: # noqa: BLE001
logger.warning("Retry without host filter failed: %s", retry_error)
-
+
# All fallback strategies exhausted or we're already in retry mode
error_msg = f"API request returned no data for {virus}. The dataset may be empty or unavailable. Please verify the virus name and filters, or try again later."
if failed_commands is not None:
- failed_commands['empty_response'] = {'error': error_msg}
+ failed_commands["empty_response"] = {"error": error_msg}
logger.error(error_msg)
raise RuntimeError(error_msg) from None
-
+
last_exception = error_info
-
- if isinstance(last_exception.get('exception_type'), str) and last_exception['exception_type'] == 'Timeout':
+
+ if isinstance(last_exception.get("exception_type"), str) and last_exception["exception_type"] == "Timeout":
# For pagination timeouts, we can continue with partial results
if page_count > 1 and total_records_streamed > 0:
# We have collected some pages already
logger.warning("⚠️ Request timed out while fetching additional pages (page %d)", page_count)
logger.info("Continuing with %d records collected so far...", total_records_streamed)
-
+
# Track timeout in failed_commands for user reference
if failed_commands is not None:
- if 'pagination_timeouts' not in failed_commands:
- failed_commands['pagination_timeouts'] = []
- failed_commands['pagination_timeouts'].append({
- 'page': page_count,
- 'error': 'API request timeout',
- 'url': url,
- 'records_retrieved': total_records_streamed,
- })
-
+ if "pagination_timeouts" not in failed_commands:
+ failed_commands["pagination_timeouts"] = []
+ failed_commands["pagination_timeouts"].append(
+ {
+ "page": page_count,
+ "error": "API request timeout",
+ "url": url,
+ "records_retrieved": total_records_streamed,
+ }
+ )
+
# Break pagination loop and return partial results
loop = False
break
else:
# Handle timeout error with specific guidance for known problematic filters
- error_msg = f"Request timed out while fetching virus metadata: {last_exception.get('error', 'Unknown')}"
-
+ error_msg = (
+ f"Request timed out while fetching virus metadata: {last_exception.get('error', 'Unknown')}"
+ )
+
# Track API timeout information for summary
if failed_commands is not None:
- failed_commands['api_timeout'] = {
- 'error': 'API request timeout',
- 'url': url,
- 'alternative_command': None
+ failed_commands["api_timeout"] = {
+ "error": "API request timeout",
+ "url": url,
+ "alternative_command": None,
}
-
+
# Log the timeout error before raising
logger.error("=" * 80)
logger.error("REQUEST TIMEOUT")
logger.error("=" * 80)
logger.error(error_msg)
logger.error("=" * 80)
-
+
# Close temporary file and progress bar before raising exception
if pages_pbar:
pages_pbar.close()
@@ -1871,26 +1985,28 @@ def close_temp_files():
metadata_file.close()
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
raise RuntimeError(error_msg) from None
-
- elif last_exception.get('exception_type') == 'ConnectionError':
+
+ elif last_exception.get("exception_type") == "ConnectionError":
# For pagination connection errors, continue with partial results if available
if page_count > 1 and total_records_streamed > 0:
logger.warning("⚠️ Connection error while fetching additional pages (page %d)", page_count)
logger.info("Continuing with %d records collected so far...", total_records_streamed)
-
+
# Track error in failed_commands
if failed_commands is not None:
- if 'pagination_errors' not in failed_commands:
- failed_commands['pagination_errors'] = []
- failed_commands['pagination_errors'].append({
- 'page': page_count,
- 'error_type': 'ConnectionError',
- 'error': last_exception.get('error', 'Unknown'),
- 'records_retrieved': total_records_streamed,
- })
-
+ if "pagination_errors" not in failed_commands:
+ failed_commands["pagination_errors"] = []
+ failed_commands["pagination_errors"].append(
+ {
+ "page": page_count,
+ "error_type": "ConnectionError",
+ "error": last_exception.get("error", "Unknown"),
+ "records_retrieved": total_records_streamed,
+ }
+ )
+
loop = False
break
else:
@@ -1908,75 +2024,93 @@ def close_temp_files():
failed_commands=failed_commands,
_retry_attempt=_retry_attempt,
error_type="Connection error",
- temp_output_dir=temp_output_dir
+ temp_output_dir=temp_output_dir,
)
if retry_result is not None:
return retry_result
-
+
# Log the connection error before raising
- error_msg = f"Connection error while fetching virus metadata: {last_exception.get('error', 'Unknown')}"
+ error_msg = (
+ f"Connection error while fetching virus metadata: {last_exception.get('error', 'Unknown')}"
+ )
logger.error("=" * 80)
logger.error("CONNECTION ERROR")
logger.error("=" * 80)
logger.error(error_msg)
logger.error("Please check your internet connection and try again.")
logger.error("=" * 80)
-
+
# Close temporary file before raising exception
if metadata_file:
try:
metadata_file.close()
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
raise RuntimeError(error_msg) from None
-
- elif last_exception.get('exception_type') == 'HTTPError':
+
+ elif last_exception.get("exception_type") == "HTTPError":
# For pagination HTTP errors, continue with partial results if available
if page_count > 1 and total_records_streamed > 0:
- logger.warning("⚠️ HTTP error while fetching additional pages (page %d): %s", page_count, last_exception.get('error'))
+ logger.warning(
+ "⚠️ HTTP error while fetching additional pages (page %d): %s",
+ page_count,
+ last_exception.get("error"),
+ )
logger.info("Continuing with %d records collected so far...", total_records_streamed)
-
+
# Track error in failed_commands
if failed_commands is not None:
- if 'pagination_errors' not in failed_commands:
- failed_commands['pagination_errors'] = []
- failed_commands['pagination_errors'].append({
- 'page': page_count,
- 'error_type': 'HTTPError',
- 'error': last_exception.get('error', 'Unknown'),
- 'records_retrieved': total_records_streamed,
- })
-
+ if "pagination_errors" not in failed_commands:
+ failed_commands["pagination_errors"] = []
+ failed_commands["pagination_errors"].append(
+ {
+ "page": page_count,
+ "error_type": "HTTPError",
+ "error": last_exception.get("error", "Unknown"),
+ "records_retrieved": total_records_streamed,
+ }
+ )
+
loop = False
break
else:
# Handle HTTP errors with specific guidance for known issues
error_msg = f"HTTP error while fetching virus metadata: {last_exception.get('error', 'Unknown')}"
-
+
# Check for specific server error patterns (5xx errors indicate server unreachability)
- is_server_error = '500' in last_exception.get('error', '') or '502' in last_exception.get('error', '') or '503' in last_exception.get('error', '') or '504' in last_exception.get('error', '')
-
+ is_server_error = (
+ "500" in last_exception.get("error", "")
+ or "502" in last_exception.get("error", "")
+ or "503" in last_exception.get("error", "")
+ or "504" in last_exception.get("error", "")
+ )
+
if is_server_error:
# Special handling for "all viruses" query
# If this is the first page and we're querying all viruses without date filters,
# the dataset is too large for NCBI to handle - need to chunk by date
- if virus == NCBI_ALL_VIRUSES_TAXID and not accession and page_count == 1 and not min_release_date:
+ if (
+ virus == NCBI_ALL_VIRUSES_TAXID
+ and not accession
+ and page_count == 1
+ and not min_release_date
+ ):
logger.warning("⚠️ NCBI API cannot handle 'all viruses' query in a single request")
logger.info("🔄 Automatically switching to date-chunked download strategy...")
logger.info("This will split the download into yearly chunks to avoid server overload")
-
+
# Close temporary file before returning None
if metadata_file:
try:
metadata_file.close()
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
# Return None to signal that chunking is needed
# The calling function will handle the chunking strategy
return None
-
+
# Special handling for numeric taxon IDs that fail with 500 errors
# These are often transient issues with NCBI's server
if virus.isdigit():
@@ -1986,7 +2120,7 @@ def close_temp_files():
logger.info(" 1. Wait a few minutes and try again")
logger.info(" 2. Try using the virus name instead of the taxon ID")
logger.info(" 3. Consider using more specific filters to reduce the dataset size")
-
+
# Try retrying with modified virus names (skip for numeric IDs since they won't have modified versions)
if not virus.isdigit():
retry_result = _try_modified_virus_names(
@@ -2001,55 +2135,59 @@ def close_temp_files():
failed_commands=failed_commands,
_retry_attempt=_retry_attempt,
error_type="Server error (5xx)",
- temp_output_dir=temp_output_dir
+ temp_output_dir=temp_output_dir,
)
if retry_result is not None:
return retry_result
-
+
# Note: Retry without geographic_location is now handled in the page size
# reduction loop above, after the first smaller page size fails
-
+
error_msg += (
- f"\n\n🔧 SERVER ERROR DETECTED: "
- f"NCBI's API is experiencing temporary server-side issues. "
- f"This could be due to the specific virus/taxon ID or a genuine server problem. "
- f"All page size and filter removal retries have been exhausted."
+ "\n\n🔧 SERVER ERROR DETECTED: "
+ "NCBI's API is experiencing temporary server-side issues. "
+ "This could be due to the specific virus/taxon ID or a genuine server problem. "
+ "All page size and filter removal retries have been exhausted."
)
-
+
# Log the error details before raising
logger.error("=" * 80)
logger.error("API REQUEST FAILED")
logger.error("=" * 80)
logger.error(error_msg)
logger.error("=" * 80)
-
+
# Close temporary file before raising exception
if metadata_file:
try:
metadata_file.close()
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
raise RuntimeError(error_msg) from None
-
+
else:
# Handle any other request-related errors
# For pagination errors with partial results, continue
if page_count > 1 and total_records_streamed > 0:
- logger.warning("⚠️ Error while fetching additional pages (page %d): %s", page_count, last_exception.get('error'))
+ logger.warning(
+ "⚠️ Error while fetching additional pages (page %d): %s", page_count, last_exception.get("error")
+ )
logger.info("Continuing with %d records collected so far...", total_records_streamed)
-
+
# Track error in failed_commands
if failed_commands is not None:
- if 'pagination_errors' not in failed_commands:
- failed_commands['pagination_errors'] = []
- failed_commands['pagination_errors'].append({
- 'page': page_count,
- 'error_type': last_exception.get('exception_type', 'Unknown'),
- 'error': last_exception.get('error', 'Unknown'),
- 'records_retrieved': total_records_streamed,
- })
-
+ if "pagination_errors" not in failed_commands:
+ failed_commands["pagination_errors"] = []
+ failed_commands["pagination_errors"].append(
+ {
+ "page": page_count,
+ "error_type": last_exception.get("exception_type", "Unknown"),
+ "error": last_exception.get("error", "Unknown"),
+ "records_retrieved": total_records_streamed,
+ }
+ )
+
if pages_pbar:
pages_pbar.close()
loop = False
@@ -2061,16 +2199,16 @@ def close_temp_files():
logger.error("=" * 80)
logger.error(error_msg)
logger.error("=" * 80)
-
+
# Close temporary file before raising exception
if metadata_file:
try:
metadata_file.close()
except OSError as e:
logger.debug("Failed to clean up metadata_file: %s", e)
-
+
raise RuntimeError(error_msg) from None
-
+
# Close the temporary metadata file if it was created
if metadata_file:
try:
@@ -2078,51 +2216,53 @@ def close_temp_files():
logger.debug("✅ Closed temporary metadata file: %s", temp_metadata_file)
logger.debug(" This file is being used to reduce RAM usage during API metadata fetching")
logger.debug(" It will be kept in: %s", temp_output_dir)
- except IOError as e:
+ except OSError as e:
logger.warning("Error closing temporary metadata file: %s", e)
-
+
# Log the final results summary
- logger.info("Successfully retrieved %d virus records from NCBI API across %d pages",
- total_records_streamed, page_count)
-
+ logger.info(
+ "Successfully retrieved %d virus records from NCBI API across %d pages", total_records_streamed, page_count
+ )
+
if temp_metadata_file and os.path.exists(temp_metadata_file):
file_size_mb = os.path.getsize(temp_metadata_file) / (1024 * 1024)
logger.info("Temporary metadata file size: %.2f MB", file_size_mb)
-
+
# Return the temp file path instead of holding all records in RAM. The caller will stream from this file to build metadata_dict.
return temp_metadata_file, None # (temp_file_path, deferred_filters) - None means no deferred filters
def fetch_virus_metadata_chunked(
- virus,
- accession=False,
- host=None,
- geographic_location=None,
+ virus,
+ accession=False,
+ host=None,
+ geographic_location=None,
annotated=None,
complete_only=False,
min_release_date=None,
max_release_date=None,
refseq_only=False,
failed_commands=None,
- temp_output_dir=None
+ temp_output_dir=None,
):
- """
- Fetch virus metadata using a chunked date-range strategy for very large datasets.
-
+ """Fetch virus metadata using a chunked date-range strategy for very large datasets.
+
This function is used as a fallback when the standard fetch_virus_metadata fails due to dataset size limitations. It breaks down the request into yearly chunks starting from a reasonable start date or user's min_release_date to the present.
-
+
Because the NCBI API currently cannot handle broad taxon queries (e.g., taxon 10239 for all viruses) with server-side filters like host or geographic_location, this function makes UNFILTERED requests and tracks those filters as deferred, to be applied later during metadata filtering.
-
+
Args:
Same as fetch_virus_metadata.
-
- Returns:
+
+ Returns
+ -------
tuple: (list of virus metadata records, dict of deferred_filters or None)
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If any chunk fails to download.
+
"""
-
logger.info("=" * 80)
logger.info("📦 CHUNKED DOWNLOAD MODE ACTIVATED")
logger.info("=" * 80)
@@ -2130,63 +2270,70 @@ def fetch_virus_metadata_chunked(
logger.info("Splitting download into yearly chunks to ensure successful completion.")
logger.info("This may take a while, but ensures all data is retrieved.")
logger.info("=" * 80)
-
+
# Since the API cannot handle broad taxon queries with filters (returns 500), we make unfiltered requests and track filters as deferred for post-hoc application.
deferred_filters = {}
if host:
- deferred_filters['host'] = host
+ deferred_filters["host"] = host
logger.info("Host filter '%s' will be deferred and applied during metadata filtering", host)
if geographic_location:
- deferred_filters['geographic_location'] = geographic_location
- logger.info("Geographic location filter '%s' will be deferred and applied during metadata filtering", geographic_location)
-
+ deferred_filters["geographic_location"] = geographic_location
+ logger.info(
+ "Geographic location filter '%s' will be deferred and applied during metadata filtering",
+ geographic_location,
+ )
+
if deferred_filters:
logger.info("Filters deferred to post-download filtering: %s", list(deferred_filters.keys()))
-
+
# Define date range for chunking
# If user specified min_release_date, use it; otherwise start from default year
if min_release_date:
# Extract year from user's min_release_date
- start_year = int(min_release_date.split('-')[0])
+ start_year = int(min_release_date.split("-")[0])
logger.info(f"Starting from user-specified year: {start_year}")
else:
# Start from default year as most valuable viral sequence data is from then onwards
start_year = CHUNKED_DOWNLOAD_START_YEAR
logger.info("Starting from year %d (default for 'all viruses' downloads)", CHUNKED_DOWNLOAD_START_YEAR)
-
+
current_date = datetime.now()
current_year = current_date.year
-
+
# If max_release_date is specified, limit the end year to avoid downloading unnecessary data
end_year = current_year
if max_release_date:
try:
- end_year = int(max_release_date.split('-')[0])
- logger.info("Limiting chunked download to year %d based on max_release_date '%s'", end_year, max_release_date)
+ end_year = int(max_release_date.split("-")[0])
+ logger.info(
+ "Limiting chunked download to year %d based on max_release_date '%s'", end_year, max_release_date
+ )
# max_release_date will be applied by the caller's metadata filtering step (already in the filters dict), so no need to add it to deferred_filters here
except (ValueError, IndexError):
- logger.warning("Could not parse max_release_date '%s' for year limit, downloading to current year", max_release_date)
+ logger.warning(
+ "Could not parse max_release_date '%s' for year limit, downloading to current year", max_release_date
+ )
end_year = current_year
-
+
all_reports = []
chunk_temp_files = [] # Track temp file paths from each chunk
total_records_count = 0 # Track total records without holding in RAM
total_chunks = end_year - start_year + 1
-
+
logger.info(f"Will process {total_chunks} year(s) from {start_year} to {end_year}")
logger.info("=" * 80)
-
+
for year in tqdm(range(start_year, end_year + 1), total=total_chunks, desc="Fetching yearly chunks", unit="year"):
chunk_start = f"{year}-01-01"
chunk_end = f"{year}-12-31"
-
+
# For the current year, use today's date as the end
if year == current_year:
chunk_end = current_date.strftime("%Y-%m-%d")
-
+
chunk_num = year - start_year + 1
tqdm.write(f"📥 Chunk {chunk_num}/{total_chunks}: Fetching data for year {year} ({chunk_start} to {chunk_end})")
-
+
try:
# Fetch metadata for this date chunk WITHOUT host/geo filters
# (the API currently cannot handle them for broad taxon queries)
@@ -2200,9 +2347,9 @@ def fetch_virus_metadata_chunked(
min_release_date=chunk_start,
refseq_only=refseq_only,
failed_commands=failed_commands,
- temp_output_dir=temp_output_dir
+ temp_output_dir=temp_output_dir,
)
-
+
# Handle tuple return (reports, deferred_filters)
chunk_reports = None
chunk_deferred_filters = None
@@ -2211,30 +2358,30 @@ def fetch_virus_metadata_chunked(
chunk_reports, chunk_deferred_filters = chunk_result
else:
chunk_reports = chunk_result
-
+
# If we got None, it means even this chunk is too large
if chunk_reports is None:
logger.error(f"❌ Chunk for year {year} returned None (dataset too large even for yearly chunk)")
logger.error("This is unexpected and may indicate an API issue")
raise RuntimeError(f"Year {year} chunk failed - dataset too large even when split by year")
-
+
# Merge any deferred filters from the chunk itself (e.g., if annotated was deferred)
if chunk_deferred_filters:
for k, v in chunk_deferred_filters.items():
if k not in deferred_filters:
deferred_filters[k] = v
logger.debug("Chunk %d added deferred filter: %s=%s", chunk_num, k, v)
-
+
# Handle chunk_reports which can be a file path (string) or a list
if isinstance(chunk_reports, str) and os.path.isfile(chunk_reports):
# Count records in the chunk file without loading into RAM
chunk_count = 0
try:
- with open(chunk_reports, 'r', encoding='utf-8') as cf:
+ with open(chunk_reports, encoding="utf-8") as cf:
for line in cf:
if line.strip():
chunk_count += 1
- except IOError:
+ except OSError:
chunk_count = 0
chunk_temp_files.append(chunk_reports)
total_records_count += chunk_count
@@ -2243,162 +2390,168 @@ def fetch_virus_metadata_chunked(
chunk_count = len(chunk_reports)
all_reports.extend(chunk_reports)
total_records_count += chunk_count
-
- tqdm.write(f"✅ Chunk {chunk_num}/{total_chunks}: Retrieved {chunk_count:,} records (total: {total_records_count:,})")
-
+
+ tqdm.write(
+ f"✅ Chunk {chunk_num}/{total_chunks}: Retrieved {chunk_count:,} records (total: {total_records_count:,})"
+ )
+
# Add a small delay between chunks to be respectful to NCBI servers
if year < end_year:
time.sleep(CHUNKED_DOWNLOAD_INTER_CHUNK_DELAY)
-
+
except Exception as e:
logger.error(f"❌ Failed to fetch chunk for year {year}: {e}")
raise RuntimeError(f"Chunked download failed at year {year}") from e
-
+
logger.info("")
logger.info("=" * 80)
- logger.info(f"✅ CHUNKED DOWNLOAD COMPLETE")
+ logger.info("✅ CHUNKED DOWNLOAD COMPLETE")
logger.info(f" Total records retrieved: {total_records_count:,}")
logger.info(f" Total chunks processed: {total_chunks}")
if deferred_filters:
logger.info(" Deferred filters to apply: %s", deferred_filters)
logger.info("=" * 80)
-
+
# If we have chunk temp files, merge them into a single JSONL and return the path
if chunk_temp_files:
# Create a merged temp file path
merged_temp_file = os.path.join(temp_output_dir, f"gget_metadata_chunked_{timestamp}_{random_suffix}.jsonl")
try:
- with open(merged_temp_file, 'w', encoding='utf-8') as outf:
+ with open(merged_temp_file, "w", encoding="utf-8") as outf:
# First, write any in-memory reports (from small chunks)
for report in all_reports:
- outf.write(json.dumps(report) + '\n')
+ outf.write(json.dumps(report) + "\n")
# Then append contents of chunk temp files
for chunk_file in chunk_temp_files:
- with open(chunk_file, 'r', encoding='utf-8') as inf:
+ with open(chunk_file, encoding="utf-8") as inf:
for line in inf:
if line.strip():
- outf.write(line if line.endswith('\n') else line + '\n')
+ outf.write(line if line.endswith("\n") else line + "\n")
logger.info("Merged %d chunk files into: %s", len(chunk_temp_files), merged_temp_file)
return merged_temp_file, deferred_filters if deferred_filters else None
- except IOError as e:
+ except OSError as e:
logger.warning("Failed to merge chunk files: %s. Falling back to in-memory.", e)
# Fall through to return all_reports if merge fails
-
+
return all_reports, deferred_filters if deferred_filters else None
def is_sars_cov2_query(virus, accession=False):
- """
- Check if the query is for SARS-CoV-2 to enable optimized cached downloads.
-
+ """Check if the query is for SARS-CoV-2 to enable optimized cached downloads.
+
Args:
virus (str): Virus taxon name/ID or accession number.
accession (bool): Whether virus parameter is an accession number.
-
- Returns:
+
+ Returns
+ -------
bool: True if this is a SARS-CoV-2 query.
+
"""
if accession:
# When in accession mode, let the user explicitly set is_sars_cov2=True
# rather than trying to detect it
return False
-
+
# Check for common SARS-CoV-2 identifiers in taxon names
- virus_lower = virus.lower().replace('-', '').replace('_', '').replace(' ', '')
-
+ virus_lower = virus.lower().replace("-", "").replace("_", "").replace(" ", "")
+
# Check if the query matches any SARS-CoV-2 identifier
for identifier in SARS_COV2_IDENTIFIERS:
if identifier in virus_lower:
logger.info("Detected SARS-CoV-2 query: %s matches %s", virus, identifier)
return True
-
+
# logger.info("=== Not a SARS-CoV-2 query: %s", virus)
return False
def is_alphainfluenza_query(virus, accession=False):
- """
- Check if the query is for Alphainfluenza to enable optimized cached downloads.
-
+ """Check if the query is for Alphainfluenza to enable optimized cached downloads.
+
Cached packages are available for:
- Alphainfluenza (genus, taxid: 197911)
- Alphainfluenzavirus influenzae (species, taxid: 2955291)
- Influenza A virus (no-rank, taxid: 11320)
-
+
Args:
virus (str): Virus taxon name/ID or accession number.
accession (bool): Whether virus parameter is an accession number.
-
- Returns:
+
+ Returns
+ -------
bool: True if this is an Alphainfluenza query.
+
"""
if accession:
# When in accession mode, let the user explicitly set is_alphainfluenza=True
# rather than trying to detect it
return False
-
+
# Check for common Alphainfluenza identifiers in taxon names
- virus_lower = virus.lower().replace('-', '').replace('_', '').replace(' ', '')
-
+ virus_lower = virus.lower().replace("-", "").replace("_", "").replace(" ", "")
+
# Check if the query matches any Alphainfluenza identifier
for identifier in ALPHAINFLUENZA_IDENTIFIERS:
if identifier in virus_lower:
logger.info("Detected Alphainfluenza query: %s matches %s", virus, identifier)
return True
-
+
# logger.info("=== Not an Alphainfluenza query: %s", virus)
return False
def process_cached_download(zip_file, virus_type="virus"):
- """
- Process a cached download ZIP file and extract sequences with metadata.
-
+ """Process a cached download ZIP file and extract sequences with metadata.
+
This helper function extracts sequences from a cached ZIP download and loads the rich metadata from data_report.jsonl (if available). The metadata is essential for post-download filtering operations.
-
+
NCBI cached downloads typically include:
- genomic.fna: FASTA sequences
- data_report.jsonl: Rich metadata with virus genome information
- dataset_catalog.json: List of files in the package
-
+
Args:
zip_file (str): Path to the downloaded ZIP file.
virus_type (str): Type of virus for logging messages.
-
- Returns:
+
+ Returns
+ -------
tuple: (sequences, metadata_dict, success)
- sequences: List of all sequence records from the cached download.
- metadata_dict: Dictionary mapping accessions to metadata (rich metadata
from data_report.jsonl if available, or basic metadata from FASTA headers).
- success: Boolean indicating if processing was successful.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If no valid sequences are found in the cached data.
+
"""
if not zip_file or not os.path.exists(zip_file):
return None, None, False
-
+
# Extract directory path from zip file name
extract_dir = os.path.splitext(zip_file)[0]
_unzip_file(zip_file, extract_dir)
-
+
if not os.path.exists(extract_dir):
logger.warning("Extraction directory not found: %s", extract_dir)
return None, None, False
-
+
logger.info("🔬 PROCESSING CACHED DATA...")
logger.info("Extracted cached data to: %s", extract_dir)
-
+
# Find and load metadata from data_report.jsonl (rich metadata from NCBI)
metadata_files = []
fasta_files = []
- for root, dirs, files in os.walk(extract_dir):
+ for root, _dirs, files in os.walk(extract_dir):
for file in files:
- if file == 'data_report.jsonl':
+ if file == "data_report.jsonl":
metadata_files.append(os.path.join(root, file))
- elif file.endswith(('.fasta', '.fa', '.fna')):
+ elif file.endswith((".fasta", ".fa", ".fna")):
fasta_files.append(os.path.join(root, file))
-
+
# Write rich metadata from data_report.jsonl to a temp JSONL file (memory-efficient)
# Instead of building a dict of millions of records in RAM, we stream to disk and let the caller load/filter from the file with _load_metadata_dict_from_temp_jsonl
cached_metadata_jsonl_path = None
@@ -2407,214 +2560,223 @@ def process_cached_download(zip_file, virus_type="virus"):
logger.info("Found %d metadata file(s) in cached download", len(metadata_files))
# Create temp JSONL path next to the zip file
cached_metadata_jsonl_path = os.path.join(extract_dir, "_cached_metadata_internal.jsonl")
-
+
try:
- with open(cached_metadata_jsonl_path, 'w', encoding='utf-8') as out_jsonl:
+ with open(cached_metadata_jsonl_path, "w", encoding="utf-8") as out_jsonl:
for metadata_file in metadata_files:
try:
# Get file size for progress bar estimation
file_size = os.path.getsize(metadata_file)
file_size_mb = file_size / BYTES_PER_MB
logger.debug("Streaming metadata file to temp JSONL: %s (%.1f MB)", metadata_file, file_size_mb)
-
- with open(metadata_file, 'r', encoding='utf-8') as f:
+
+ with open(metadata_file, encoding="utf-8") as f:
# Use tqdm to show progress while reading the file
pbar = tqdm(
total=file_size,
- unit='B',
+ unit="B",
unit_scale=True,
unit_divisor=1024,
desc="Processing metadata",
ncols=80,
- leave=True
+ leave=True,
)
-
+
for line in f:
if line.strip():
# Update progress based on bytes read
- pbar.update(len(line.encode('utf-8')))
-
+ pbar.update(len(line.encode("utf-8")))
+
report = json.loads(line)
# Extract accession from the report
- accession = report.get('accession', '')
+ accession = report.get("accession", "")
if not accession:
continue
-
+
cached_metadata_record_count += 1
-
+
# Update progress bar description with record count
if cached_metadata_record_count % 10000 == 0:
- pbar.set_description(f"Processing metadata ({cached_metadata_record_count:,} records)")
-
+ pbar.set_description(
+ f"Processing metadata ({cached_metadata_record_count:,} records)"
+ )
+
# Transform the NCBI report format to our internal metadata format
# This mirrors the logic in load_metadata_from_api_reports
metadata = {
- 'accession': accession,
- 'length': report.get('length'),
- 'geneCount': report.get('geneCount'),
- 'completeness': report.get('completeness', '').lower(),
+ "accession": accession,
+ "length": report.get("length"),
+ "geneCount": report.get("geneCount"),
+ "completeness": report.get("completeness", "").lower(),
}
-
+
# Extract virus info
- virus_info = report.get('virus', {})
- metadata['virusName'] = virus_info.get('organismName')
- metadata['virusTaxId'] = virus_info.get('taxId')
- metadata['virusPangolinClassification'] = virus_info.get('pangolinClassification')
-
+ virus_info = report.get("virus", {})
+ metadata["virusName"] = virus_info.get("organismName")
+ metadata["virusTaxId"] = virus_info.get("taxId")
+ metadata["virusPangolinClassification"] = virus_info.get("pangolinClassification")
+
# Extract host info
- host_info = report.get('host', {})
- metadata['hostName'] = host_info.get('organismName')
- metadata['hostTaxId'] = host_info.get('taxId')
-
- # Extract isolate info
- isolate_info = report.get('isolate', {})
- metadata['isolateName'] = isolate_info.get('name')
+ host_info = report.get("host", {})
+ metadata["hostName"] = host_info.get("organismName")
+ metadata["hostTaxId"] = host_info.get("taxId")
+
+ # Extract isolate info
+ isolate_info = report.get("isolate", {})
+ metadata["isolateName"] = isolate_info.get("name")
# Store isolate as nested dict to match filter_metadata_only expectations
- metadata['isolate'] = {
- 'collectionDate': isolate_info.get('collectionDate'),
- 'source': isolate_info.get('source'),
+ metadata["isolate"] = {
+ "collectionDate": isolate_info.get("collectionDate"),
+ "source": isolate_info.get("source"),
}
-
+
# Extract location info
- location_info = report.get('location', {})
- metadata['location'] = location_info.get('geographicLocation')
- metadata['region'] = location_info.get('geographicRegion')
-
+ location_info = report.get("location", {})
+ metadata["location"] = location_info.get("geographicLocation")
+ metadata["region"] = location_info.get("geographicRegion")
+
# Extract other fields
- metadata['releaseDate'] = report.get('releaseDate')
- metadata['isAnnotated'] = report.get('isAnnotated', False)
- metadata['sourceDatabase'] = report.get('sourceDatabase')
- metadata['isLabHost'] = report.get('isLabHost', False)
-
+ metadata["releaseDate"] = report.get("releaseDate")
+ metadata["isAnnotated"] = report.get("isAnnotated", False)
+ metadata["sourceDatabase"] = report.get("sourceDatabase")
+ metadata["isLabHost"] = report.get("isLabHost", False)
+
# Gene and protein counts
- metadata['proteinCount'] = report.get('proteinCount')
- metadata['maturePeptideCount'] = report.get('maturePeptideCount')
-
+ metadata["proteinCount"] = report.get("proteinCount")
+ metadata["maturePeptideCount"] = report.get("maturePeptideCount")
+
# Extract segment
- metadata['segment'] = report.get('segment')
-
+ metadata["segment"] = report.get("segment")
+
# Extract vaccine strain flag
- metadata['isVaccineStrain'] = report.get('isVaccineStrain', False)
+ metadata["isVaccineStrain"] = report.get("isVaccineStrain", False)
+
+ submitter_info = report.get("submitter", {})
+ metadata["submitterName"] = submitter_info.get("names")
+ metadata["submitterCountry"] = submitter_info.get("country")
+ metadata["submitterInstitution"] = submitter_info.get("affiliation")
- submitter_info = report.get('submitter', {})
- metadata['submitterName'] = submitter_info.get('names')
- metadata['submitterCountry'] = submitter_info.get('country')
- metadata['submitterInstitution'] = submitter_info.get('affiliation')
-
# Write transformed record to temp JSONL (one line per record)
out_jsonl.write(json.dumps(metadata) + "\\n")
-
+
pbar.close()
-
- logger.info("✅ Streamed %d metadata records from %s to temp JSONL",
- cached_metadata_record_count, metadata_file)
- except Exception as e:
+
+ logger.info(
+ "✅ Streamed %d metadata records from %s to temp JSONL",
+ cached_metadata_record_count,
+ metadata_file,
+ )
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to process metadata file %s: %s", metadata_file, e)
continue
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to create cached metadata JSONL: %s", e)
cached_metadata_jsonl_path = None
else:
logger.warning("No data_report.jsonl found in cached download. Post-download filters may be limited.")
-
+
if not fasta_files:
logger.error("❌ No FASTA files found in cached data.")
raise RuntimeError("No FASTA files found in cached data")
-
+
for fasta_file in fasta_files:
file_size = os.path.getsize(fasta_file)
file_size_mb = file_size / BYTES_PER_MB
if file_size_mb < MIN_VALID_FASTA_SIZE_MB:
- logger.warning("⚠️ FASTA file %s is smaller than expected (%.1f MB). It may not contain valid sequences.", fasta_file, file_size_mb)
+ logger.warning(
+ "⚠️ FASTA file %s is smaller than expected (%.1f MB). It may not contain valid sequences.",
+ fasta_file,
+ file_size_mb,
+ )
else:
logger.info("✅ Cached FASTA file available for streaming: %s (%.1f MB)", fasta_file, file_size_mb)
-
+
# If no rich metadata was loaded, create minimal metadata from FASTA headers
if not cached_metadata_jsonl_path or cached_metadata_record_count == 0:
logger.info("Creating basic metadata from FASTA headers (no data_report.jsonl available)")
logger.info("Streaming FASTA files to extract minimal metadata...")
-
+
cached_metadata_jsonl_path = os.path.join(extract_dir, "_cached_metadata_internal.jsonl")
cached_metadata_record_count = 0
seen_fasta_accessions = set()
-
+
try:
- with open(cached_metadata_jsonl_path, 'w', encoding='utf-8') as out_jsonl:
+ with open(cached_metadata_jsonl_path, "w", encoding="utf-8") as out_jsonl:
for fasta_file in fasta_files:
try:
file_size = os.path.getsize(fasta_file)
-
- with open(fasta_file, 'r', encoding='utf-8') as f:
+
+ with open(fasta_file, encoding="utf-8") as f:
pbar = tqdm(
total=file_size,
- unit='B',
+ unit="B",
unit_scale=True,
unit_divisor=1024,
desc="Extracting FASTA metadata",
ncols=80,
- leave=True
+ leave=True,
)
-
+
current_accession = None
sequence_length = 0
description = ""
-
+
for line in f:
- pbar.update(len(line.encode('utf-8')))
-
- if line.startswith('>'):
+ pbar.update(len(line.encode("utf-8")))
+
+ if line.startswith(">"):
# Save previous sequence if exists
if current_accession and current_accession not in seen_fasta_accessions:
seen_fasta_accessions.add(current_accession)
metadata = {
- 'accession': current_accession,
- 'description': description,
- 'length': sequence_length,
- 'source': 'cached_fasta_header'
+ "accession": current_accession,
+ "description": description,
+ "length": sequence_length,
+ "source": "cached_fasta_header",
}
out_jsonl.write(json.dumps(metadata) + "\\n")
cached_metadata_record_count += 1
-
+
# Parse new header
header = line[1:].strip()
current_accession = header.split()[0]
description = header
sequence_length = 0
-
+
else:
# Count bases in sequence (not including whitespace)
sequence_length += len(line.strip())
-
+
# Save last sequence
if current_accession and current_accession not in seen_fasta_accessions:
seen_fasta_accessions.add(current_accession)
metadata = {
- 'accession': current_accession,
- 'description': description,
- 'length': sequence_length,
- 'source': 'cached_fasta_header'
+ "accession": current_accession,
+ "description": description,
+ "length": sequence_length,
+ "source": "cached_fasta_header",
}
out_jsonl.write(json.dumps(metadata) + "\\n")
cached_metadata_record_count += 1
-
+
pbar.close()
-
+
logger.info("✅ Extracted metadata for sequences from %s", fasta_file)
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to extract metadata from FASTA %s: %s", fasta_file, e)
continue
-
+
logger.info("Created basic metadata for %d sequences", cached_metadata_record_count)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to create FASTA metadata JSONL: %s", e)
cached_metadata_jsonl_path = None
-
+
logger.info("🎉 CACHED DATA LOADING SUCCESSFUL!")
logger.debug("Cached %s sequences will be streamed on-demand (not loaded to RAM)", virus_type)
if metadata_files:
logger.info("Rich metadata available from data_report.jsonl for post-download filtering")
-
+
# Return the cached FASTA file path and the path to the metadata JSONL (not loaded to RAM)
# Sequences and metadata will be streamed on-demand when needed
cached_fasta_file = fasta_files[0] if fasta_files else None
@@ -2622,38 +2784,40 @@ def process_cached_download(zip_file, virus_type="virus"):
def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeout=None):
- """
- Monitor a subprocess with progress tracking and timeout handling.
-
+ """Monitor a subprocess with progress tracking and timeout handling.
+
This helper function monitors a running subprocess. When stdout/stderr are piped, it checks for progress indicators. When they're not piped (output goes to console), it simply polls for completion.
-
+
Args:
process: subprocess.Popen instance to monitor.
cmd (list): Command that was executed (for error reporting).
timeout (int): Maximum total execution time in seconds. Defaults to DOWNLOAD_OVERALL_TIMEOUT.
progress_timeout (int): Maximum time without progress in seconds. Defaults to DOWNLOAD_PROGRESS_TIMEOUT.
-
- Returns:
+
+ Returns
+ -------
subprocess.CompletedProcess: Result of the completed process.
-
- Raises:
+
+ Raises
+ ------
subprocess.TimeoutExpired: If timeout conditions are met.
+
"""
# Apply default timeouts if not specified
if timeout is None:
timeout = DOWNLOAD_OVERALL_TIMEOUT
if progress_timeout is None:
progress_timeout = DOWNLOAD_PROGRESS_TIMEOUT
-
+
start_time = time.time()
last_progress = start_time
-
+
while True:
# Check if process has finished
retcode = process.poll()
if retcode is not None:
break
-
+
# Only check for progress if stderr was captured (is not None)
if process.stderr is not None:
# Read stderr without blocking
@@ -2661,12 +2825,12 @@ def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeo
if stderr:
# Log the stderr for debugging
# logger.debug("Progress output: %s", stderr.strip())
-
+
# If we see any progress indicator, update the last_progress time
if any(indicator.lower() in stderr.lower() for indicator in PROGRESS_INDICATORS):
last_progress = time.time()
# logger.debug("Progress detected, updating last_progress time")
-
+
# Check timeout conditions:
# 1. Less than total timeout, continue
# 2. If more than total timeout but progress in last progress_timeout, continue
@@ -2674,13 +2838,13 @@ def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeo
current_time = time.time()
total_time = current_time - start_time
time_since_progress = current_time - last_progress
-
+
if total_time > timeout and time_since_progress > progress_timeout:
process.kill()
raise subprocess.TimeoutExpired(cmd, timeout)
-
+
time.sleep(DOWNLOAD_PROGRESS_CHECK_INTERVAL) # Prevent CPU spin
-
+
# Only call communicate if process was created with pipes, otherwise just wait
if process.stdout is not None or process.stderr is not None:
stdout, stderr = process.communicate()
@@ -2688,31 +2852,19 @@ def _monitor_subprocess_with_progress(process, cmd, timeout=None, progress_timeo
stdout = None
stderr = None
process.wait()
-
- return subprocess.CompletedProcess(
- args=cmd,
- returncode=retcode,
- stdout=stdout,
- stderr=stderr
- )
+
+ return subprocess.CompletedProcess(args=cmd, returncode=retcode, stdout=stdout, stderr=stderr)
def _download_optimized_cached(
- virus_type,
- strategies,
- zip_path,
- outdir,
- use_accession=False,
- accession=None,
- requested_filters=None
+ virus_type, strategies, zip_path, outdir, use_accession=False, accession=None, requested_filters=None
):
- """
- Execute optimized cached download strategies with fallback.
-
+ """Execute optimized cached download strategies with fallback.
+
This is a generic implementation of the hierarchical fallback download pattern
used for both SARS-CoV-2 and Alphainfluenza. It tries each strategy in order
until one succeeds, with comprehensive error handling and logging.
-
+
Args:
virus_type (str): Type of virus for error messages ('SARS-CoV-2', 'Alphainfluenza', etc.).
strategies (list): List of tuples (strategy_name, cmd, applied_filters).
@@ -2721,45 +2873,47 @@ def _download_optimized_cached(
use_accession (bool): Whether using accession-based download.
accession (str, optional): Accession number if using accession-based download.
requested_filters (dict, optional): Dictionary of originally requested filters.
-
- Returns:
+
+ Returns
+ -------
tuple: (zip_path, applied_filters, missing_filters)
- zip_path (str): Path to the successfully downloaded ZIP file.
- applied_filters (list): List of filter names applied in successful strategy.
- missing_filters (list): List of filter names not applied (need post-processing).
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If all strategies fail or datasets CLI is not available.
-
+
Example:
>>> strategies = [
... ("Strategy 1 (specific)", ["datasets", "download", ...], ["complete-only"]),
- ... ("Strategy 2 (general)", ["datasets", "download", ...], [])
+ ... ("Strategy 2 (general)", ["datasets", "download", ...], []),
... ]
>>> zip_file, applied, missing = _download_optimized_cached(
... "SARS-CoV-2", strategies, "/path/to/output.zip", "/output/dir"
... )
+
"""
-
# Get the path to the datasets CLI binary (uses precompiled binary bundled with gget)
datasets_path = _get_datasets_path()
-
+
last_error = None
-
+
for strategy_name, cmd, applied_filters in strategies:
# Replace "datasets" with the actual path to the binary
if cmd and cmd[0] == "datasets":
cmd = [datasets_path] + cmd[1:]
-
+
logger.info("🔄 Trying optimised strategy download with %s...", strategy_name)
-
+
if applied_filters:
logger.info("Applied filters: %s", ", ".join(applied_filters))
else:
logger.info("No specific filters applied")
-
+
logger.debug("Command: %s", " ".join(cmd))
-
+
try:
# Log the exact command being executed
cmd_str = " ".join(cmd)
@@ -2768,12 +2922,7 @@ def _download_optimized_cached(
# Start subprocess for progress monitoring
# Note: We don't use cwd=outdir because the command already includes full paths
try:
- process = subprocess.Popen(
- cmd,
- stdout=None,
- stderr=None,
- text=True
- )
+ process = subprocess.Popen(cmd, stdout=None, stderr=None, text=True)
except FileNotFoundError as fnf_error:
# Datasets binary not found - this shouldn't happen if bundled correctly
error_msg = (
@@ -2784,32 +2933,36 @@ def _download_optimized_cached(
)
logger.error(error_msg)
raise RuntimeError(error_msg) from fnf_error
-
+
# Monitor progress with timeout handling using helper function
result = _monitor_subprocess_with_progress(process, cmd)
-
+
# Check if the command was successful
if result.returncode == 0 and os.path.exists(zip_path):
file_size = os.path.getsize(zip_path)
-
+
# Check if file is too small (likely empty result) - if so, try next strategy. It's not zero since the folder always comes with a generic (readme) files.
if file_size < MIN_VALID_ZIP_SIZE:
- logger.warning("⚠️ %s resulted in file that's too small (%.2f MB, < 100 KB minimum). Trying next strategy...",
- strategy_name, file_size / 1024 / 1024)
+ logger.warning(
+ "⚠️ %s resulted in file that's too small (%.2f MB, < 100 KB minimum). Trying next strategy...",
+ strategy_name,
+ file_size / 1024 / 1024,
+ )
# Clean up invalid file
try:
os.remove(zip_path)
except OSError:
pass
continue
-
- logger.info("✅ %s successful: %s (%.2f MB)",
- strategy_name, os.path.basename(zip_path), file_size / 1024 / 1024)
-
+
+ logger.info(
+ "✅ %s successful: %s (%.2f MB)", strategy_name, os.path.basename(zip_path), file_size / 1024 / 1024
+ )
+
# Log any important output from the datasets CLI
# if result.stdout:
# logger.debug("datasets CLI output: %s", result.stdout.strip())
-
+
# Check which filters from the original request weren't applied in this strategy
if requested_filters:
requested_filter_list = []
@@ -2823,16 +2976,18 @@ def _download_optimized_cached(
else:
logger.debug("Non-boolean filter detected, adding key=value: %s=%s", key, value)
requested_filter_list.append(f"{key}={value}")
-
+
missing_filters = [f for f in requested_filter_list if f not in applied_filters]
if missing_filters:
logger.warning("⚠️ Some requested filters were not applied in successful strategy:")
- logger.warning(" Filters applied: %s", ", ".join(applied_filters) if applied_filters else "none")
+ logger.warning(
+ " Filters applied: %s", ", ".join(applied_filters) if applied_filters else "none"
+ )
logger.warning(" Filters missing: %s", ", ".join(missing_filters))
logger.warning(" These filters will need to be applied through post-processing")
else:
missing_filters = []
-
+
return zip_path, applied_filters, missing_filters
else:
# Strategy failed, prepare error message
@@ -2841,7 +2996,7 @@ def _download_optimized_cached(
error_msg += f": {result.stderr.strip()}"
logger.warning("%s", error_msg)
last_error = error_msg
-
+
# If this was an accession download that failed, provide specific guidance
if use_accession:
error_msg = (
@@ -2850,50 +3005,50 @@ def _download_optimized_cached(
f"If you're not sure, try without the is_{virus_type.lower().replace('-', '_').replace(' ', '_')} flag."
)
raise RuntimeError(error_msg)
-
+
# Clean up failed download file if it exists
if os.path.exists(zip_path):
try:
os.remove(zip_path)
except OSError:
pass
- continue # Try next strategy
-
+ continue # Try next strategy
+
except subprocess.TimeoutExpired:
error_msg = f"{strategy_name} timed out after 30 minutes"
logger.warning("%s", error_msg)
last_error = error_msg
continue
-
+
except subprocess.CalledProcessError as e:
error_msg = f"{strategy_name} execution failed: {e}"
logger.warning("%s", error_msg)
last_error = error_msg
continue
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
error_msg = f"{strategy_name} unexpected error: {e}"
logger.warning("%s", error_msg)
last_error = error_msg
continue
-
+
# All strategies failed
logger.warning("🚨 All cached download strategies failed. Last error: %s", last_error)
-
+
# Provide helpful guidance based on virus type
example_taxon = "SARS-CoV-2" if "sars" in virus_type.lower() else virus_type
guidance_messages = [
"🔧 TROUBLESHOOTING SUGGESTIONS:",
"1. Check your internet connection",
"2. Try running the command manually to see detailed error messages:",
- f" {datasets_path} download virus genome taxon \"{example_taxon}\" --filename test.zip",
+ f' {datasets_path} download virus genome taxon "{example_taxon}" --filename test.zip',
"3. NCBI servers may be temporarily unavailable - try again later",
- f"4. Consider using the general API method by removing {virus_type} specific terms from your query"
+ f"4. Consider using the general API method by removing {virus_type} specific terms from your query",
]
-
+
for msg in guidance_messages:
logger.info(msg)
-
+
# Raise error with the last failure details
raise RuntimeError(
f"All {virus_type} cached download strategies failed. "
@@ -2911,21 +3066,20 @@ def download_sars_cov2_optimized(
accession=None,
use_accession=False,
):
- """
- Download SARS-CoV-2 sequences using NCBI's optimized cached data packages.
-
+ """Download SARS-CoV-2 sequences using NCBI's optimized cached data packages.
+
NCBI provides pre-computed, highly compressed cached packages for SARS-CoV-2
that offer faster and more reliable downloads than the general API endpoints.
This function uses the datasets CLI to download these optimized packages with
hierarchical fallback from specific to general cached files.
-
+
Download strategies (in order of precedence):
1. If use_accession=True: Direct accession download using accession endpoint.
2. If use_accession=False:
a. Specific lineage + complete + host filters using taxon endpoint.
b. Complete genomes only using taxon endpoint.
c. All SARS-CoV-2 genomes using taxon endpoint (default fallback).
-
+
Args:
host (str, optional): Host organism filter (optimized for 'human').
complete_only (bool, optional): Whether to download only complete genomes.
@@ -2934,34 +3088,36 @@ def download_sars_cov2_optimized(
lineage (str, optional): SARS-CoV-2 lineage filter (e.g., 'B.1.1.7', 'P.1').
accession (str, optional): Specific SARS-CoV-2 accession or taxon ID.
use_accession (bool): Whether to use accession endpoint. Defaults to False.
-
- Returns:
+
+ Returns
+ -------
str: Path to the downloaded ZIP file containing sequences and metadata.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If the datasets CLI is not available or download fails.
+
"""
-
# Determine filter specificity for logging
filter_count = sum(1 for param in [host, complete_only, annotated, lineage] if param is not None)
if filter_count > 0:
logger.info("Attempting SARS-CoV-2 cached download with %d specific filters", filter_count)
else:
logger.info("Attempting general SARS-CoV-2 cached download (no specific filters)")
-
+
# Determine output directory
if not outdir:
outdir = os.getcwd()
logger.debug("No output directory specified, using current directory: %s", outdir)
-
+
# Ensure output directory exists
os.makedirs(outdir, exist_ok=True)
logger.debug("Output directory ready: %s", outdir)
-
+
# Create descriptive filename with timestamp and random suffix
zip_filename = f"sars_cov_2_{timestamp}_{random_suffix}.zip"
zip_path = os.path.join(outdir, zip_filename)
-
+
# Define which filters are available for this download
logger.debug("Available filters for SARS-CoV-2 download:")
if complete_only:
@@ -2972,42 +3128,43 @@ def download_sars_cov2_optimized(
logger.debug("- host filter: %s", host)
if annotated:
logger.debug("- annotated filter")
-
+
# Define fallback strategies in order of preference
strategies = []
-
+
if use_accession:
# Parse the accession input to handle single, space-separated, or file-based accessions
parsed = _parse_accession_input(accession)
-
- if parsed['is_file']:
+
+ if parsed["is_file"]:
# File-based input: use --inputfile flag
- cmd1 = ["datasets", "download", "virus", "genome", "accession",
- "--inputfile", parsed['file_path']]
+ cmd1 = ["datasets", "download", "virus", "genome", "accession", "--inputfile", parsed["file_path"]]
cmd1.extend(["--filename", zip_path])
strategies.append(("Strategy 1 (accessions from file)", cmd1, [f"inputfile={parsed['file_path']}"]))
- logger.debug("Using accession input file: %s", parsed['file_path'])
- elif parsed['type'] == 'list':
+ logger.debug("Using accession input file: %s", parsed["file_path"])
+ elif parsed["type"] == "list":
# Space-separated accessions: pass as arguments
- cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed['accessions']
+ cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed["accessions"]
cmd1.extend(["--filename", zip_path])
- strategies.append(("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."]))
- logger.debug("Using multiple accessions: %s", ", ".join(parsed['accessions']))
+ strategies.append(
+ ("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."])
+ )
+ logger.debug("Using multiple accessions: %s", ", ".join(parsed["accessions"]))
else:
# Single accession
- cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed['accessions']]
+ cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed["accessions"]]
cmd1.extend(["--filename", zip_path])
strategies.append(("Strategy 1 (direct accession)", cmd1, [f"accession={parsed['accessions']}"]))
- logger.debug("Using single accession: %s", parsed['accessions'])
+ logger.debug("Using single accession: %s", parsed["accessions"])
elif lineage or complete_only or host or annotated:
# Strategy 1: Try with specific filters using taxon endpoint
cmd1 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2"]
filters1 = []
-
+
if complete_only:
cmd1.append("--complete-only")
filters1.append("complete-only")
-
+
if lineage:
cmd1.extend(["--lineage", lineage])
filters1.append(f"lineage={lineage}")
@@ -3015,32 +3172,76 @@ def download_sars_cov2_optimized(
if host:
cmd1.extend(["--host", host])
filters1.append(f"host={host}")
-
+
if annotated:
cmd1.append("--annotated")
filters1.append("annotated")
cmd1.extend(["--filename", zip_path])
strategies.append(("Strategy 1 (specific filters)", cmd1, filters1))
-
+
# Strategy 2: Try complete-only and host if it was requested (without lineage)
if complete_only and host and lineage: # Only add this if we had lineage in strategy 1
- cmd2 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--complete-only", "--host", host, "--filename", zip_path]
+ cmd2 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ "SARS-CoV-2",
+ "--complete-only",
+ "--host",
+ host,
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 2 (complete-only and host)", cmd2, ["complete-only", f"host={host}"]))
# Strategy 3: Try complete-only if it was requested
- if complete_only and (host or lineage):
- cmd3 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--complete-only", "--filename", zip_path]
+ if complete_only and (host or lineage):
+ cmd3 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ "SARS-CoV-2",
+ "--complete-only",
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 3 (complete-only)", cmd3, ["complete-only"]))
- # Strategy 4: Try host if it was requested
- if host and (complete_only or lineage):
- cmd4 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--host", host, "--filename", zip_path]
+ # Strategy 4: Try host if it was requested
+ if host and (complete_only or lineage):
+ cmd4 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ "SARS-CoV-2",
+ "--host",
+ host,
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 4 (host)", cmd4, [f"host={host}"]))
- # Strategy 5: Try lineage if it was requested
- if lineage and (complete_only or host):
- cmd5 = ["datasets", "download", "virus", "genome", "taxon", "SARS-CoV-2", "--lineage", lineage, "--filename", zip_path]
+ # Strategy 5: Try lineage if it was requested
+ if lineage and (complete_only or host):
+ cmd5 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ "SARS-CoV-2",
+ "--lineage",
+ lineage,
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 5 (lineage)", cmd5, [f"lineage={lineage}"]))
# Strategy 6: General SARS-CoV-2 package (no filters)
@@ -3048,13 +3249,8 @@ def download_sars_cov2_optimized(
strategies.append(("Strategy 6 (general package)", cmd6, []))
# Use the common download function with all strategies
- requested_filters_dict = {
- 'complete-only': complete_only,
- 'lineage': lineage,
- 'host': host,
- 'annotated': annotated
- }
-
+ requested_filters_dict = {"complete-only": complete_only, "lineage": lineage, "host": host, "annotated": annotated}
+
return _download_optimized_cached(
virus_type="SARS-CoV-2",
strategies=strategies,
@@ -3062,7 +3258,7 @@ def download_sars_cov2_optimized(
outdir=outdir,
use_accession=use_accession,
accession=accession,
- requested_filters=requested_filters_dict
+ requested_filters=requested_filters_dict,
)
@@ -3074,25 +3270,24 @@ def download_alphainfluenza_optimized(
accession=None,
use_accession=False,
):
- """
- Download Alphainfluenza sequences using NCBI's optimized cached data packages.
-
+ """Download Alphainfluenza sequences using NCBI's optimized cached data packages.
+
NCBI provides pre-computed, highly compressed cached packages for Alphainfluenza
that offer faster and more reliable downloads than the general API endpoints.
This function uses the datasets CLI to download these optimized packages with
hierarchical fallback from specific to general cached files.
-
+
Cached packages are available for the following Alphainfluenza taxonomic nodes:
1. Alphainfluenza (genus, taxid: 197911)
2. Alphainfluenzavirus influenzae (species, taxid: 2955291)
3. Influenza A virus (no-rank, taxid: 11320)
-
+
For each taxon, filtered sets are available:
1. All genomes
2. Human host only
3. Human host only & complete
4. Complete only
-
+
Args:
host (str, optional): Host organism filter (optimized for 'human').
complete_only (bool, optional): Whether to download only complete genomes.
@@ -3100,37 +3295,39 @@ def download_alphainfluenza_optimized(
outdir (str, optional): Output directory for downloaded files.
accession (str, optional): Specific Alphainfluenza accession or taxon ID.
use_accession (bool): Whether to use accession endpoint. Defaults to False.
-
- Returns:
+
+ Returns
+ -------
str: Path to the downloaded ZIP file containing sequences and metadata.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If the datasets CLI is not available or download fails.
+
"""
-
# Determine filter specificity for logging
filter_count = sum(1 for param in [host, complete_only, annotated] if param is not None)
if filter_count > 0:
logger.info("Attempting Alphainfluenza cached download with %d specific filters", filter_count)
else:
logger.info("Attempting general Alphainfluenza cached download (no specific filters)")
-
+
# Determine output directory
if not outdir:
outdir = os.getcwd()
logger.debug("No output directory specified, using current directory: %s", outdir)
-
+
# Ensure output directory exists before passing path to datasets CLI
os.makedirs(outdir, exist_ok=True)
logger.debug("Output directory ready: %s", outdir)
-
+
# Create descriptive filename with timestamp and random suffix
zip_filename = f"alphainfluenza_{timestamp}_{random_suffix}.zip"
zip_path = os.path.join(outdir, zip_filename)
-
+
# Ensure the parent directory exists (in case outdir has subdirectories)
os.makedirs(os.path.dirname(zip_path), exist_ok=True)
-
+
# Define which filters are available for this download
logger.debug("Available filters for Alphainfluenza download:")
if complete_only:
@@ -3139,42 +3336,43 @@ def download_alphainfluenza_optimized(
logger.debug("- host filter: %s", host)
if annotated:
logger.debug("- annotated filter")
-
+
# Define fallback strategies in order of preference
strategies = []
-
+
# Default taxon to use (most specific: Alphainfluenzavirus influenzae species)
# This taxon ID has the most comprehensive cached data
default_taxon = ALPHAINFLUENZA_DEFAULT_TAXON
-
+
if use_accession:
# Parse the accession input to handle single, space-separated, or file-based accessions
parsed = _parse_accession_input(accession)
-
- if parsed['is_file']:
+
+ if parsed["is_file"]:
# File-based input: use --inputfile flag
- cmd1 = ["datasets", "download", "virus", "genome", "accession",
- "--inputfile", parsed['file_path']]
+ cmd1 = ["datasets", "download", "virus", "genome", "accession", "--inputfile", parsed["file_path"]]
cmd1.extend(["--filename", zip_path])
strategies.append(("Strategy 1 (accessions from file)", cmd1, [f"inputfile={parsed['file_path']}"]))
- logger.debug("Using accession input file: %s", parsed['file_path'])
- elif parsed['type'] == 'list':
+ logger.debug("Using accession input file: %s", parsed["file_path"])
+ elif parsed["type"] == "list":
# Space-separated accessions: pass as arguments
- cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed['accessions']
+ cmd1 = ["datasets", "download", "virus", "genome", "accession"] + parsed["accessions"]
cmd1.extend(["--filename", zip_path])
- strategies.append(("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."]))
- logger.debug("Using multiple accessions: %s", ", ".join(parsed['accessions']))
+ strategies.append(
+ ("Strategy 1 (multiple accessions)", cmd1, [f"accessions={', '.join(parsed['accessions'][:3])}..."])
+ )
+ logger.debug("Using multiple accessions: %s", ", ".join(parsed["accessions"]))
else:
# Single accession
- cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed['accessions']]
+ cmd1 = ["datasets", "download", "virus", "genome", "accession", parsed["accessions"]]
cmd1.extend(["--filename", zip_path])
strategies.append(("Strategy 1 (direct accession)", cmd1, [f"accession={parsed['accessions']}"]))
- logger.debug("Using single accession: %s", parsed['accessions'])
+ logger.debug("Using single accession: %s", parsed["accessions"])
elif complete_only or host or annotated:
# Strategy 1: Try with specific filters using taxon endpoint
cmd1 = ["datasets", "download", "virus", "genome", "taxon", default_taxon]
filters1 = []
-
+
if complete_only:
cmd1.append("--complete-only")
filters1.append("complete-only")
@@ -3182,27 +3380,60 @@ def download_alphainfluenza_optimized(
if host:
cmd1.extend(["--host", host])
filters1.append(f"host={host}")
-
+
if annotated:
cmd1.append("--annotated")
filters1.append("annotated")
cmd1.extend(["--filename", zip_path])
strategies.append(("Strategy 1 (specific filters)", cmd1, filters1))
-
+
# Strategy 2: Try complete-only and host if both were requested
if complete_only and host:
- cmd2 = ["datasets", "download", "virus", "genome", "taxon", default_taxon, "--complete-only", "--host", host, "--filename", zip_path]
+ cmd2 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ default_taxon,
+ "--complete-only",
+ "--host",
+ host,
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 2 (complete-only and host)", cmd2, ["complete-only", f"host={host}"]))
# Strategy 3: Try complete-only if it was requested
- if complete_only and (host or annotated):
- cmd3 = ["datasets", "download", "virus", "genome", "taxon", default_taxon, "--complete-only", "--filename", zip_path]
+ if complete_only and (host or annotated):
+ cmd3 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ default_taxon,
+ "--complete-only",
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 3 (complete-only)", cmd3, ["complete-only"]))
- # Strategy 4: Try host if it was requested
- if host and (complete_only or annotated):
- cmd4 = ["datasets", "download", "virus", "genome", "taxon", default_taxon, "--host", host, "--filename", zip_path]
+ # Strategy 4: Try host if it was requested
+ if host and (complete_only or annotated):
+ cmd4 = [
+ "datasets",
+ "download",
+ "virus",
+ "genome",
+ "taxon",
+ default_taxon,
+ "--host",
+ host,
+ "--filename",
+ zip_path,
+ ]
strategies.append(("Strategy 4 (host)", cmd4, [f"host={host}"]))
# Strategy 5: General Alphainfluenza package (no filters)
@@ -3210,12 +3441,8 @@ def download_alphainfluenza_optimized(
strategies.append(("Strategy 5 (general package)", cmd5, []))
# Use the common download function with all strategies
- requested_filters_dict = {
- 'complete-only': complete_only,
- 'host': host,
- 'annotated': annotated
- }
-
+ requested_filters_dict = {"complete-only": complete_only, "host": host, "annotated": annotated}
+
return _download_optimized_cached(
virus_type="Alphainfluenza",
strategies=strategies,
@@ -3223,140 +3450,159 @@ def download_alphainfluenza_optimized(
outdir=outdir,
use_accession=use_accession,
accession=accession,
- requested_filters=requested_filters_dict
+ requested_filters=requested_filters_dict,
)
def download_sequences_by_accessions(accessions, outdir=None, batch_size=200, failed_commands=None, api_key=None):
- """
- Download virus genome sequences for a specific list of accession numbers.
-
+ """Download virus genome sequences for a specific list of accession numbers.
+
This function downloads sequences for a pre-filtered list of accessions,
using NCBI E-utilities API with batching to avoid URL length limitations.
Large requests are automatically split into smaller batches.
-
+
Args:
accessions (list): List of accession numbers to download.
outdir (str, optional): Output directory for downloaded files.
batch_size (int): Maximum number of accessions per batch. Defaults to 200.
failed_commands (dict, optional): Dictionary to track failed operations.
api_key (str, optional): NCBI API key for higher rate limits (10 req/sec vs 3).
-
- Returns:
+
+ Returns
+ -------
str: Path to the downloaded FASTA file containing sequences.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If the download request fails.
ValueError: If no accessions are provided.
+
"""
-
if not accessions:
raise ValueError("No accessions provided for download")
-
+
logger.info("Downloading sequences for %d accessions using E-utilities API", len(accessions))
- logger.debug("Accession list: %s", accessions[:5] + ['...'] if len(accessions) > 5 else accessions)
-
+ logger.debug("Accession list: %s", accessions[:5] + ["..."] if len(accessions) > 5 else accessions)
+
# Determine output directory - use current working directory if not specified
if not outdir:
outdir = os.getcwd()
logger.debug("No output directory specified, using current directory: %s", outdir)
-
+
# Ensure output directory exists
os.makedirs(outdir, exist_ok=True)
logger.debug("Ensured output directory exists: %s", outdir)
-
+
# Create output FASTA file path
fasta_path = os.path.join(outdir, f"virus_sequences_{timestamp}_{random_suffix}.fasta")
logger.debug("Saving sequences to: %s", fasta_path)
-
+
# For large datasets, prefer the EPost + EFetch History Server pipeline
# This is NCBI's recommended approach and is significantly faster
if len(accessions) > batch_size:
- logger.info("Large request detected (%d accessions). Trying EPost+EFetch History Server pipeline...",
- len(accessions))
+ logger.info(
+ "Large request detected (%d accessions). Trying EPost+EFetch History Server pipeline...", len(accessions)
+ )
try:
_download_sequences_epost_efetch(accessions, fasta_path, failed_commands)
- except Exception as epost_error:
+ except Exception as epost_error: # noqa: BLE001
logger.warning("EPost+EFetch pipeline failed: %s", epost_error)
logger.info("Falling back to direct batched E-utilities requests...")
# Reset the file in case partial data was written
if os.path.exists(fasta_path):
os.remove(fasta_path)
- return _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands, api_key=api_key)
-
+ return _download_sequences_batched(
+ accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands, api_key=api_key
+ )
+
# Check for missing sequences and retry them via direct batched download
downloaded_accs = set()
try:
- with open(fasta_path, 'r', encoding='utf-8') as f:
+ with open(fasta_path, encoding="utf-8") as f:
for line in f:
- if line.startswith('>'):
+ if line.startswith(">"):
acc = line[1:].split()[0].strip()
downloaded_accs.add(acc)
- except IOError:
+ except OSError:
pass
-
+
requested_set = set(accessions)
missing_accs = requested_set - downloaded_accs
-
+
if missing_accs:
- logger.warning("⚠️ EPost+EFetch pipeline missed %d/%d sequences. "
- "Retrying missing accessions via direct batched download...",
- len(missing_accs), len(accessions))
-
+ logger.warning(
+ "⚠️ EPost+EFetch pipeline missed %d/%d sequences. "
+ "Retrying missing accessions via direct batched download...",
+ len(missing_accs),
+ len(accessions),
+ )
+
# Retry the missing accessions by appending to the existing FASTA file
temp_retry_path = fasta_path + ".retry_tmp"
try:
_download_sequences_batched(
- list(missing_accs), NCBI_EUTILS_BASE_EFETCH,
- temp_retry_path, batch_size, failed_commands, api_key=api_key
+ list(missing_accs),
+ NCBI_EUTILS_BASE_EFETCH,
+ temp_retry_path,
+ batch_size,
+ failed_commands,
+ api_key=api_key,
)
# Append recovered sequences to the main FASTA file
if os.path.exists(temp_retry_path) and os.path.getsize(temp_retry_path) > 0:
- with open(fasta_path, 'a', encoding='utf-8') as main_f:
- with open(temp_retry_path, 'r', encoding='utf-8') as retry_f:
+ with open(fasta_path, "a", encoding="utf-8") as main_f:
+ with open(temp_retry_path, encoding="utf-8") as retry_f:
main_f.write(retry_f.read())
# Count recovered sequences
recovered = 0
- with open(temp_retry_path, 'r', encoding='utf-8') as retry_f:
+ with open(temp_retry_path, encoding="utf-8") as retry_f:
for line in retry_f:
- if line.startswith('>'):
+ if line.startswith(">"):
recovered += 1
- logger.info("✅ Recovered %d/%d missing sequences via direct download",
- recovered, len(missing_accs))
- except Exception as retry_error:
- logger.warning("⚠️ Retry of missing sequences failed: %s. "
- "Proceeding with %d/%d sequences.",
- retry_error, len(downloaded_accs), len(accessions))
+ logger.info(
+ "✅ Recovered %d/%d missing sequences via direct download", recovered, len(missing_accs)
+ )
+ except Exception as retry_error: # noqa: BLE001
+ logger.warning(
+ "⚠️ Retry of missing sequences failed: %s. Proceeding with %d/%d sequences.",
+ retry_error,
+ len(downloaded_accs),
+ len(accessions),
+ )
finally:
if os.path.exists(temp_retry_path):
os.remove(temp_retry_path)
-
+
return fasta_path
-
+
# For smaller requests, use single request
- return _download_sequences_single_batch(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands, api_key=api_key)
+ return _download_sequences_single_batch(
+ accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands, api_key=api_key
+ )
def _download_sequences_epost_efetch(accessions, fasta_path, failed_commands=None, api_key=None):
- """
- Download FASTA sequences using NCBI EPost + EFetch History Server pipeline.
-
+ """Download FASTA sequences using NCBI EPost + EFetch History Server pipeline.
+
This is NCBI's recommended approach for large datasets. It uploads accession
IDs to the History Server via EPost, then retrieves FASTA sequences in batches
using the WebEnv/query_key reference. This avoids URL length limitations and
is significantly faster than individual batched requests.
-
+
Args:
accessions (list): List of accession numbers to download.
fasta_path (str): Path where FASTA file should be saved.
failed_commands (dict, optional): Dictionary to track failed operations.
api_key (str, optional): NCBI API key for higher rate limits (10 req/sec vs 3).
-
- Returns:
+
+ Returns
+ -------
str: Path to the saved FASTA file.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If EPost fails or no sequences are retrieved.
+
"""
# Resolve API key: argument > module-level env var
if api_key is None:
@@ -3366,64 +3612,64 @@ def _download_sequences_epost_efetch(accessions, fasta_path, failed_commands=Non
# Step 1: Upload accessions to NCBI History Server via EPost
web_env, query_key = _epost_accessions(accessions, api_key=api_key)
-
+
if not web_env or not query_key:
raise RuntimeError(
"EPost failed: could not upload accessions to NCBI History Server. "
"The server may be temporarily unavailable."
)
-
+
# Step 2: Fetch FASTA sequences in batches using the History Server reference
total = len(accessions)
retmax = EFETCH_FASTA_RETMAX
total_downloaded = 0
batch_failures = 0
-
+
# Determine inter-batch delay based on API key availability
delay = EUTILS_INTER_BATCH_DELAY_WITH_KEY if api_key else EUTILS_INTER_BATCH_DELAY
-
+
logger.info("Fetching FASTA sequences in batches of %d (total: %d)", retmax, total)
-
+
try:
- with open(fasta_path, 'w', encoding='utf-8') as fasta_handle:
+ with open(fasta_path, "w", encoding="utf-8") as fasta_handle:
for retstart in range(0, total, retmax):
batch_num = (retstart // retmax) + 1
total_batches = (total + retmax - 1) // retmax
-
- logger.debug("EFetch FASTA batch %d/%d (retstart=%d, retmax=%d)",
- batch_num, total_batches, retstart, retmax)
-
+
+ logger.debug(
+ "EFetch FASTA batch %d/%d (retstart=%d, retmax=%d)", batch_num, total_batches, retstart, retmax
+ )
+
# Define the fetch operation for retry helper
def _fetch_fasta_batch(rs=retstart):
params = {
- 'db': 'nucleotide',
- 'WebEnv': web_env,
- 'query_key': query_key,
- 'retstart': rs,
- 'retmax': retmax,
- 'rettype': 'fasta',
- 'retmode': 'text',
+ "db": "nucleotide",
+ "WebEnv": web_env,
+ "query_key": query_key,
+ "retstart": rs,
+ "retmax": retmax,
+ "rettype": "fasta",
+ "retmode": "text",
}
if api_key:
- params['api_key'] = api_key
-
+ params["api_key"] = api_key
+
response = requests.get(
NCBI_EUTILS_BASE_EFETCH,
params=params,
timeout=EUTILS_TIMEOUT,
- headers={'User-Agent': 'gget/1.0'}
+ headers={"User-Agent": "gget/1.0"},
)
response.raise_for_status()
-
+
# Validate FASTA content
text = response.text.strip()
- if not text or not text.startswith('>'):
+ if not text or not text.startswith(">"):
raise RuntimeError(
- f"Invalid FASTA response for batch at retstart={rs}: "
- f"response starts with '{text[:50]}'"
+ f"Invalid FASTA response for batch at retstart={rs}: response starts with '{text[:50]}'"
)
return text
-
+
# Use exponential backoff retry
success, fasta_text, error_info = _retry_with_exponential_backoff(
operation_name=f"EFetch FASTA batch {batch_num}/{total_batches}",
@@ -3439,38 +3685,46 @@ def _fetch_fasta_batch(rs=retstart):
),
failed_commands=failed_commands,
)
-
+
if success:
# Write FASTA data to file
fasta_handle.write(fasta_text)
- if not fasta_text.endswith('\n'):
- fasta_handle.write('\n')
-
- seq_count = fasta_text.count('>')
+ if not fasta_text.endswith("\n"):
+ fasta_handle.write("\n")
+
+ seq_count = fasta_text.count(">")
total_downloaded += seq_count
- logger.debug("Batch %d/%d: wrote %d sequences (total: %d)",
- batch_num, total_batches, seq_count, total_downloaded)
+ logger.debug(
+ "Batch %d/%d: wrote %d sequences (total: %d)",
+ batch_num,
+ total_batches,
+ seq_count,
+ total_downloaded,
+ )
else:
batch_failures += 1
- logger.warning("❌ Batch %d/%d failed after retries: %s",
- batch_num, total_batches,
- error_info.get('error', 'unknown'))
-
+ logger.warning(
+ "❌ Batch %d/%d failed after retries: %s",
+ batch_num,
+ total_batches,
+ error_info.get("error", "unknown"),
+ )
+
# Track the failure
_track_failed_operation(
failed_commands,
- 'sequence_batches',
- {'batch_num': batch_num, 'retstart': retstart, 'retmax': retmax},
- error_info if error_info else {'error': 'unknown'}
+ "sequence_batches",
+ {"batch_num": batch_num, "retstart": retstart, "retmax": retmax},
+ error_info if error_info else {"error": "unknown"},
)
-
+
# Respect NCBI rate limits
if retstart + retmax < total:
time.sleep(delay)
-
- except IOError as e:
+
+ except OSError as e:
raise RuntimeError(f"Failed to write FASTA file {fasta_path}: {e}") from e
-
+
# Validate results
if total_downloaded == 0:
# Clean up empty file
@@ -3480,74 +3734,72 @@ def _fetch_fasta_batch(rs=retstart):
f"EPost+EFetch pipeline downloaded 0 sequences out of {total} requested. "
f"All {batch_failures} batches failed."
)
-
+
file_size_mb = os.path.getsize(fasta_path) / BYTES_PER_MB
- logger.info("✅ EPost+EFetch pipeline complete: %d sequences downloaded (%.2f MB)",
- total_downloaded, file_size_mb)
-
+ logger.info("✅ EPost+EFetch pipeline complete: %d sequences downloaded (%.2f MB)", total_downloaded, file_size_mb)
+
if batch_failures > 0:
- logger.warning("⚠️ %d batch(es) failed during download. %d/%d sequences retrieved.",
- batch_failures, total_downloaded, total)
-
+ logger.warning(
+ "⚠️ %d batch(es) failed during download. %d/%d sequences retrieved.", batch_failures, total_downloaded, total
+ )
+
return fasta_path
-def _download_sequences_single_batch(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands=None, api_key=None):
- """
- Download sequences in a single E-utilities request with exponential backoff retries.
-
+def _download_sequences_single_batch(
+ accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, failed_commands=None, api_key=None
+):
+ """Download sequences in a single E-utilities request with exponential backoff retries.
+
This function handles downloading virus sequences for a list of accessions
using a single HTTP request to NCBI E-utilities. It's optimized for
smaller batches (< 200 accessions) to avoid URL length limitations. Includes
exponential backoff retries for transient failures.
-
+
Args:
accessions (list): List of accession numbers to download.
NCBI_EUTILS_BASE_EFETCH (str): Base URL for NCBI E-utilities API.
fasta_path (str): Path where FASTA file should be saved.
failed_commands (dict, optional): Dictionary to track failed operations.
-
- Returns:
+
+ Returns
+ -------
str: Path to the saved FASTA file.
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If the download fails after retries or response is invalid
-
+
Note:
- Validates FASTA format before saving
- Includes extended timeout for large datasets
- Implements exponential backoff retries for transient failures
- Automatically falls back to batching if URL is too long
-
+
Example:
- >>> accessions = ['NC_045512.2', 'MN908947.3']
- >>> path = _download_sequences_single_batch(accessions, BASE_URL, 'output.fasta')
+ >>> accessions = ["NC_045512.2", "MN908947.3"]
+ >>> path = _download_sequences_single_batch(accessions, BASE_URL, "output.fasta")
+
"""
-
# Build accession string (E-utils supports comma-separated IDs)
accession_string = ",".join(accessions)
-
+
def execute_request():
- params = {
- 'db': 'nucleotide',
- 'id': accession_string,
- 'rettype': 'fasta',
- 'retmode': 'text'
- }
+ params = {"db": "nucleotide", "id": accession_string, "rettype": "fasta", "retmode": "text"}
if api_key:
- params['api_key'] = api_key
+ params["api_key"] = api_key
logger.debug("E-utilities URL: %s", NCBI_EUTILS_BASE_EFETCH)
response = requests.get(NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT)
response.raise_for_status()
-
+
# Verify we got FASTA data
- if not response.text.strip().startswith('>'):
+ if not response.text.strip().startswith(">"):
raise RuntimeError(f"Invalid FASTA response: {response.text[:100]}")
-
+
return response.text
-
+
logger.info("Initiating E-utilities request for %d accessions", len(accessions))
-
+
# Use exponential backoff helper for retries
success, response_text, error_info = _retry_with_exponential_backoff(
operation_name=f"E-utilities request ({len(accessions)} accessions)",
@@ -3555,63 +3807,80 @@ def execute_request():
max_retries=API_MAX_RETRIES,
initial_delay=API_INITIAL_RETRY_DELAY,
backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER,
- retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout),
+ retryable_exceptions=(
+ requests.exceptions.ConnectionError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.Timeout,
+ ),
failed_commands=failed_commands,
)
-
+
if not success:
# Check for specific URL length error
- error_msg = error_info['error']
+ error_msg = error_info["error"]
if "414" in error_msg or "Request-URI Too Long" in error_msg:
logger.info("URL too long error detected. Retrying with batch processing...")
# Retry with smaller batches (half of default)
- return _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size=EUTILS_DEFAULT_BATCH_SIZE // 2, failed_commands=failed_commands, api_key=api_key)
-
+ return _download_sequences_batched(
+ accessions,
+ NCBI_EUTILS_BASE_EFETCH,
+ fasta_path,
+ batch_size=EUTILS_DEFAULT_BATCH_SIZE // 2,
+ failed_commands=failed_commands,
+ api_key=api_key,
+ )
+
# Log and track the failure
logger.error("❌ E-utilities request failed after %d retries: %s", API_MAX_RETRIES, error_msg)
-
+
# Track failed operation for later reporting in command summary
retry_url = f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text"
_track_failed_operation(
- failed_commands,
- 'sequence_fetch',
+ failed_commands,
+ "sequence_fetch",
{
- 'operation': 'single_batch_download',
- 'accession_count': len(accessions),
- 'retry_url': retry_url,
+ "operation": "single_batch_download",
+ "accession_count": len(accessions),
+ "retry_url": retry_url,
},
- error_info
+ error_info,
)
-
- raise RuntimeError(f"❌ Failed to download virus sequences via E-utilities after {API_MAX_RETRIES} retries: {error_msg}") from None
-
+
+ raise RuntimeError(
+ f"❌ Failed to download virus sequences via E-utilities after {API_MAX_RETRIES} retries: {error_msg}"
+ ) from None
+
# Save to file
try:
# Count sequences in response
- sequence_count = response_text.count('>')
+ sequence_count = response_text.count(">")
logger.info("Received %d sequences from E-utilities", sequence_count)
-
+
# Write FASTA data to file
- with open(fasta_path, 'w', encoding='utf-8') as f:
+ with open(fasta_path, "w", encoding="utf-8") as f:
f.write(response_text)
-
- logger.info("Successfully saved sequences to: %s (%.2f MB)",
- fasta_path, len(response_text.encode('utf-8')) / 1024 / 1024)
+
+ logger.info(
+ "Successfully saved sequences to: %s (%.2f MB)",
+ fasta_path,
+ len(response_text.encode("utf-8")) / 1024 / 1024,
+ )
return fasta_path
-
- except IOError as e:
+
+ except OSError as e:
logger.error("❌ Failed to save FASTA file: %s", e)
raise RuntimeError(f"❌ Failed to save downloaded sequences: {e}") from e
-def _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands=None, api_key=None):
- """
- Download sequences using multiple batched E-utilities requests with incremental file writing.
-
+def _download_sequences_batched(
+ accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path, batch_size, failed_commands=None, api_key=None
+):
+ """Download sequences using multiple batched E-utilities requests with incremental file writing.
+
This function handles large sequence downloads by splitting them into smaller
batches and writing results incrementally to avoid memory issues. It includes
robust error handling with automatic exponential backoff retries for failed batches.
-
+
Key features:
- Batched requests to avoid URL length limits
- Exponential backoff retries for each batch
@@ -3619,7 +3888,7 @@ def _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path,
- Automatic retry with smaller batch sizes for URL length failures
- Progress tracking and detailed logging
- Graceful handling of partial failures (continues after batch failures)
-
+
Args:
accessions (list): List of accession numbers to download.
NCBI_EUTILS_BASE_EFETCH (str): Base URL for NCBI E-utilities API.
@@ -3627,64 +3896,61 @@ def _download_sequences_batched(accessions, NCBI_EUTILS_BASE_EFETCH, fasta_path,
batch_size (int): Number of accessions per batch.
failed_commands (dict, optional): Dictionary to track failed operations.
api_key (str, optional): NCBI API key for higher rate limits (10 req/sec vs 3).
-
- Returns:
+
+ Returns
+ -------
str: Path to the saved FASTA file containing all downloaded sequences
-
- Raises:
+
+ Raises
+ ------
RuntimeError: If all batches fail or no sequences are downloaded
-
+
Note:
- Respects NCBI rate limits with 0.5s delays between batches
- Implements exponential backoff for individual batch retries
- Automatically reduces batch size for URL length errors
- Continues processing even if some batches fail
- Writes sequences immediately to reduce memory usage
-
+
Example:
- >>> large_accession_list = ['NC_045512.2', 'MN908947.3', ...] # 1000+ accessions
- >>> path = _download_sequences_batched(large_accession_list, BASE_URL, 'out.fasta', 200)
+ >>> large_accession_list = ["NC_045512.2", "MN908947.3", ...] # 1000+ accessions
+ >>> path = _download_sequences_batched(large_accession_list, BASE_URL, "out.fasta", 200)
+
"""
-
# Initialize failed_commands tracking if not already done
- if failed_commands is not None and 'sequence_batches' not in failed_commands:
- failed_commands['sequence_batches'] = []
-
+ if failed_commands is not None and "sequence_batches" not in failed_commands:
+ failed_commands["sequence_batches"] = []
+
# Split accessions into batches
- batches = [accessions[i:i + batch_size] for i in range(0, len(accessions), batch_size)]
- logger.info("Downloading %d accessions in %d batches of size %d",
- len(accessions), len(batches), batch_size)
-
+ batches = [accessions[i : i + batch_size] for i in range(0, len(accessions), batch_size)]
+ logger.info("Downloading %d accessions in %d batches of size %d", len(accessions), len(batches), batch_size)
+
total_downloaded = 0
batch_failed_count = 0
-
+
# Open file once and write batches incrementally to avoid storing all data in memory
try:
- with open(fasta_path, 'w', encoding='utf-8') as f:
- for batch_num, batch_accessions in tqdm(enumerate(batches, 1), total=len(batches), desc="Downloading batches", unit="batch"):
-
+ with open(fasta_path, "w", encoding="utf-8") as f:
+ for batch_num, batch_accessions in tqdm(
+ enumerate(batches, 1), total=len(batches), desc="Downloading batches", unit="batch"
+ ):
# Build accession string for this batch
accession_string = ",".join(batch_accessions)
-
+
def download_batch():
- """Callable for retry helper function"""
- params = {
- 'db': 'nucleotide',
- 'id': accession_string,
- 'rettype': 'fasta',
- 'retmode': 'text'
- }
+ """Callable for retry helper function."""
+ params = {"db": "nucleotide", "id": accession_string, "rettype": "fasta", "retmode": "text"} # noqa: B023
if api_key:
- params['api_key'] = api_key
+ params["api_key"] = api_key
response = requests.get(NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT)
response.raise_for_status()
-
+
# Verify we got FASTA data
- if not response.text.strip().startswith('>'):
+ if not response.text.strip().startswith(">"):
raise RuntimeError(f"Invalid FASTA response: {response.text[:100]}")
-
+
return response.text
-
+
# Use exponential backoff helper for batch retries
success, batch_response_text, error_info = _retry_with_exponential_backoff(
operation_name=f"Batch {batch_num}/{len(batches)} ({len(batch_accessions)} accessions)",
@@ -3692,65 +3958,80 @@ def download_batch():
max_retries=API_MAX_RETRIES,
initial_delay=API_INITIAL_RETRY_DELAY,
backoff_multiplier=API_RETRY_BACKOFF_MULTIPLIER,
- retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError, requests.exceptions.Timeout),
+ retryable_exceptions=(
+ requests.exceptions.ConnectionError,
+ requests.exceptions.HTTPError,
+ requests.exceptions.Timeout,
+ ),
failed_commands=failed_commands,
)
-
+
if success:
# Count sequences in this batch
- batch_sequence_count = batch_response_text.count('>')
+ batch_sequence_count = batch_response_text.count(">")
total_downloaded += batch_sequence_count
-
+
# Write sequences immediately to file (incremental write)
f.write(batch_response_text)
- if not batch_response_text.endswith('\n'):
- f.write('\n') # Ensure proper line endings between batches
+ if not batch_response_text.endswith("\n"):
+ f.write("\n") # Ensure proper line endings between batches
f.flush() # Force write to disk immediately
-
+
# Update progress bar description with current stats
- batch_size_mb = len(batch_response_text.encode('utf-8')) / BYTES_PER_MB
- logger.debug(f"✓ Batch {batch_num}: Downloaded {batch_sequence_count} sequences ({batch_size_mb:.2f} MB)")
-
+ batch_size_mb = len(batch_response_text.encode("utf-8")) / BYTES_PER_MB
+ logger.debug(
+ f"✓ Batch {batch_num}: Downloaded {batch_sequence_count} sequences ({batch_size_mb:.2f} MB)"
+ )
+
else:
# Batch failed after retries
- error_msg = error_info['error']
+ error_msg = error_info["error"]
batch_failed_count += 1
-
+
# Check for URL length error
if "414" in error_msg and batch_size > EUTILS_MIN_BATCH_SIZE_FOR_SPLIT:
- tqdm.write(f"⚠️ WARNING: Batch {batch_num} URL too long (size={batch_size}). Retrying with smaller batch...")
+ tqdm.write(
+ f"⚠️ WARNING: Batch {batch_num} URL too long (size={batch_size}). Retrying with smaller batch..."
+ )
# Recursively retry this batch with smaller size by splitting it further
temp_batch_path = f"temp_batch_{batch_num}.fasta"
try:
_download_sequences_batched(
- batch_accessions, NCBI_EUTILS_BASE_EFETCH, temp_batch_path, batch_size // 2, failed_commands, api_key=api_key
+ batch_accessions,
+ NCBI_EUTILS_BASE_EFETCH,
+ temp_batch_path,
+ batch_size // 2,
+ failed_commands,
+ api_key=api_key,
)
# Read the temporary file and append to main file
- with open(temp_batch_path, 'r', encoding='utf-8') as temp_f:
+ with open(temp_batch_path, encoding="utf-8") as temp_f:
batch_content = temp_f.read()
f.write(batch_content)
- if not batch_content.endswith('\n'):
- f.write('\n')
+ if not batch_content.endswith("\n"):
+ f.write("\n")
f.flush()
# Count sequences in this recovered batch
- recovered_count = batch_content.count('>')
+ recovered_count = batch_content.count(">")
total_downloaded += recovered_count
batch_failed_count -= 1 # This batch succeeded after retry
- tqdm.write(f"✓ Recovered batch {batch_num} with smaller size: {recovered_count} sequences")
+ tqdm.write(
+ f"✓ Recovered batch {batch_num} with smaller size: {recovered_count} sequences"
+ )
os.remove(temp_batch_path) # Clean up temp file
- except Exception as file_error:
+ except Exception as file_error: # noqa: BLE001
tqdm.write(f"❌ Failed to recover batch {batch_num}: {file_error}")
# Track the failed batch
_track_failed_operation(
failed_commands,
- 'sequence_batches',
+ "sequence_batches",
{
- 'batch_num': batch_num,
- 'accession_count': len(batch_accessions),
- 'accessions': batch_accessions,
- 'retry_url': f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text",
+ "batch_num": batch_num,
+ "accession_count": len(batch_accessions),
+ "accessions": batch_accessions,
+ "retry_url": f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text",
},
- error_info
+ error_info,
)
continue
else:
@@ -3758,56 +4039,60 @@ def download_batch():
tqdm.write(f"❌ Batch {batch_num} failed after {API_MAX_RETRIES} retries: {error_msg}")
_track_failed_operation(
failed_commands,
- 'sequence_batches',
+ "sequence_batches",
{
- 'batch_num': batch_num,
- 'accession_count': len(batch_accessions),
- 'accessions': batch_accessions,
- 'retry_url': f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text",
+ "batch_num": batch_num,
+ "accession_count": len(batch_accessions),
+ "accessions": batch_accessions,
+ "retry_url": f"{NCBI_EUTILS_BASE_EFETCH}?db=nucleotide&id={accession_string}&rettype=fasta&retmode=text",
},
- error_info
+ error_info,
)
continue
-
+
# Add small delay between requests to be respectful to NCBI servers
if batch_num < len(batches): # Don't delay after the last batch
time.sleep(EUTILS_INTER_BATCH_DELAY)
-
+
# Check if we downloaded anything
if total_downloaded == 0:
raise RuntimeError("❌ All batches failed. No sequences were downloaded.")
-
+
if batch_failed_count > 0:
- logger.warning(f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences.")
- tqdm.write(f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences.")
-
+ logger.warning(
+ f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences."
+ )
+ tqdm.write(
+ f"⚠️ WARNING: {batch_failed_count} out of {len(batches)} batches failed. Successfully downloaded {total_downloaded} sequences."
+ )
+
file_size = os.path.getsize(fasta_path)
- logger.info("Successfully saved %d sequences to: %s (%.2f MB)",
- total_downloaded, fasta_path, file_size / BYTES_PER_MB)
+ logger.info(
+ "Successfully saved %d sequences to: %s (%.2f MB)", total_downloaded, fasta_path, file_size / BYTES_PER_MB
+ )
return fasta_path
-
- except IOError as e:
+
+ except OSError as e:
logger.error("❌ Failed to write FASTA file: %s", e)
raise RuntimeError(f"❌ Failed to save downloaded sequences: {e}") from e
def _unzip_file(zip_file_path, extract_to_path):
- """
- Extract a ZIP file to a specified directory.
-
+ """Extract a ZIP file to a specified directory.
+
Args:
zip_file_path (str): Path to the ZIP file.
extract_to_path (str): Target directory for extraction.
"""
os.makedirs(extract_to_path, exist_ok=True)
logger.debug("Created extraction directory: %s", extract_to_path)
-
+
try:
with zipfile.ZipFile(zip_file_path, "r") as zip_ref:
zip_ref.extractall(extract_to_path)
file_list = zip_ref.namelist()
logger.info("Extracted %d files from %s", len(file_list), zip_file_path)
-
+
except zipfile.BadZipFile as e:
raise zipfile.BadZipFile(f"Invalid or corrupted ZIP file: {zip_file_path}") from e
except PermissionError as e:
@@ -3817,22 +4102,24 @@ def _unzip_file(zip_file_path, extract_to_path):
def _parse_date(date_str, filtername=""):
- """
- Parse various date formats into a datetime object.
-
+ """Parse various date formats into a datetime object.
+
Args:
date_str (str): Date string to parse (various formats accepted).
filtername (str): Name of the filter/field for error reporting.
-
- Returns:
+
+ Returns
+ -------
datetime: Parsed datetime object, or None if parsing fails.
-
- Raises:
+
+ Raises
+ ------
ValueError: If date parsing fails
-
+
Note:
Uses a default date of year 1500 for incomplete date strings to ensure
proper comparison behavior with minimum date filters.
+
"""
try:
# Use dateutil parser for flexible date parsing
@@ -3840,7 +4127,7 @@ def _parse_date(date_str, filtername=""):
parsed_date = parser.parse(date_str, default=datetime(DATE_PARSE_DEFAULT_YEAR, 1, 1))
logger.debug("Successfully parsed date '%s' as %s", date_str, parsed_date)
return parsed_date
-
+
except (ValueError, TypeError) as exc:
error_msg = (
f"Invalid date detected for argument {filtername}: '{date_str}'.\n"
@@ -3859,53 +4146,55 @@ def _parse_date(date_str, filtername=""):
def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filtername=""):
- """
- Parse partial dates with range-aware handling for comparison.
-
+ """Parse partial dates with range-aware handling for comparison.
+
When comparing partial dates (year-only or year-month) against specific dates,
we need to handle them based on the comparison direction:
-
+
- For min_collection_date comparisons: use the END of the partial range
(e.g., "2015" -> 2015-12-31, "2015-06" -> 2015-06-30, "2021/2022" -> 2022-12-31)
This ensures records from that year/month are included if they COULD be >= min.
-
+
- For max_collection_date comparisons: use the START of the partial range
(e.g., "2015" -> 2015-01-01, "2015-06" -> 2015-06-01, "2021/2022" -> 2021-01-01)
This ensures records from that year/month are included if they COULD be <= max.
-
+
Args:
date_str (str): Date string to parse (various formats).
for_min_comparison (bool): True if comparing against min date, False for max date.
filtername (str): Name of the filter for error messages.
-
- Returns:
+
+ Returns
+ -------
datetime: Parsed datetime object with partial dates adjusted appropriately.
-
- Raises:
+
+ Raises
+ ------
ValueError: If date parsing fails.
- """
+
+ """
if not date_str or not date_str.strip():
raise ValueError(f"Empty date string for {filtername}")
-
+
date_str = date_str.strip()
-
+
# Detect date precision based on format
# Year-only: "2015" (4 digits)
# Year-range: "2021/2022" or "2021-2022" (two 4-digit years)
# Year-month: "2015-06", "2015/06", "Jun 2015", etc.
# Full date: "2015-06-15", "2015/06/15", "Jun 15, 2015", etc.
-
- year_only_pattern = r'^(\d{4})$'
- year_month_pattern = r'^(\d{4})[-/](\d{1,2})$'
- year_range_pattern = r'^(\d{4})[-/](\d{4})$'
+
+ year_only_pattern = r"^(\d{4})$"
+ year_month_pattern = r"^(\d{4})[-/](\d{1,2})$"
+ year_range_pattern = r"^(\d{4})[-/](\d{4})$"
# NCBI API returns date ranges as "[2021 TO 2022]" or "[2021-06 TO 2022-03]"
- bracket_range_pattern = r'^\[(.+?)\s+TO\s+(.+?)\]$'
-
+ bracket_range_pattern = r"^\[(.+?)\s+TO\s+(.+?)\]$"
+
year_match = re.match(year_only_pattern, date_str)
year_month_match = re.match(year_month_pattern, date_str)
year_range_match = re.match(year_range_pattern, date_str)
bracket_range_match = re.match(bracket_range_pattern, date_str, re.IGNORECASE)
-
+
try:
if bracket_range_match:
# Bracket range from NCBI API like "[2021 TO 2022]" or "[2021-06 TO 2022-03]"
@@ -3916,16 +4205,18 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte
end_result = _parse_partial_date_for_range_check(
range_end_str, for_min_comparison=True, filtername=filtername
)
- logger.debug("Parsed bracket-range date '%s' as %s (end of range for min comparison)",
- date_str, end_result)
+ logger.debug(
+ "Parsed bracket-range date '%s' as %s (end of range for min comparison)", date_str, end_result
+ )
return end_result
else:
# For max comparison, use the START of the range
start_result = _parse_partial_date_for_range_check(
range_start_str, for_min_comparison=False, filtername=filtername
)
- logger.debug("Parsed bracket-range date '%s' as %s (start of range for max comparison)",
- date_str, start_result)
+ logger.debug(
+ "Parsed bracket-range date '%s' as %s (start of range for max comparison)", date_str, start_result
+ )
return start_result
elif year_range_match:
@@ -3935,13 +4226,11 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte
if for_min_comparison:
# For min comparison, use end of the range (Dec 31 of end year)
result = datetime(year_end, 12, 31)
- logger.debug("Parsed year-range date '%s' as %s (end of range for min comparison)",
- date_str, result)
+ logger.debug("Parsed year-range date '%s' as %s (end of range for min comparison)", date_str, result)
else:
# For max comparison, use start of the range (Jan 1 of start year)
result = datetime(year_start, 1, 1)
- logger.debug("Parsed year-range date '%s' as %s (start of range for max comparison)",
- date_str, result)
+ logger.debug("Parsed year-range date '%s' as %s (start of range for max comparison)", date_str, result)
return result
elif year_match:
@@ -3950,15 +4239,13 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte
if for_min_comparison:
# For min comparison, use end of year (Dec 31)
result = datetime(year, 12, 31)
- logger.debug("Parsed year-only date '%s' as %s (end of year for min comparison)",
- date_str, result)
+ logger.debug("Parsed year-only date '%s' as %s (end of year for min comparison)", date_str, result)
else:
# For max comparison, use start of year (Jan 1)
result = datetime(year, 1, 1)
- logger.debug("Parsed year-only date '%s' as %s (start of year for max comparison)",
- date_str, result)
+ logger.debug("Parsed year-only date '%s' as %s (start of year for max comparison)", date_str, result)
return result
-
+
elif year_month_match:
# Year-month date like "2015-06"
year = int(year_month_match.group(1))
@@ -3967,18 +4254,16 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte
# For min comparison, use end of month
_, last_day = calendar.monthrange(year, month)
result = datetime(year, month, last_day)
- logger.debug("Parsed year-month date '%s' as %s (end of month for min comparison)",
- date_str, result)
+ logger.debug("Parsed year-month date '%s' as %s (end of month for min comparison)", date_str, result)
else:
# For max comparison, use start of month
result = datetime(year, month, 1)
- logger.debug("Parsed year-month date '%s' as %s (start of month for max comparison)",
- date_str, result)
+ logger.debug("Parsed year-month date '%s' as %s (start of month for max comparison)", date_str, result)
return result
else:
# Full date - use standard parsing
return _parse_date(date_str, filtername=filtername)
-
+
except (ValueError, TypeError) as exc:
error_msg = (
f"Invalid date detected for argument {filtername}: '{date_str}'.\n"
@@ -3990,40 +4275,40 @@ def _parse_partial_date_for_range_check(date_str, for_min_comparison=True, filte
def _write_fasta_record(handle, record):
- """
- Write a single FASTA record to an open file handle.
-
+ """Write a single FASTA record to an open file handle.
+
Args:
handle: Open file handle for writing.
record: FastaRecord object with id, description, and seq attributes.
"""
- if hasattr(record, 'description') and record.description:
+ if hasattr(record, "description") and record.description:
handle.write(f">{record.id} {record.description}\n")
else:
handle.write(f">{record.id}\n")
seq_str = str(record.seq)
for i in range(0, len(seq_str), 70):
- handle.write(seq_str[i:i+70] + '\n')
+ handle.write(seq_str[i : i + 70] + "\n")
def _stream_copy_fasta(input_path, output_path, accession_set=None):
- """
- Stream-copy FASTA records from input to output, optionally filtering by accession set.
-
+ """Stream-copy FASTA records from input to output, optionally filtering by accession set.
+
This avoids loading all sequences into RAM — only one record at a time is in memory.
For large datasets (millions of sequences), this is critical to avoid out-of-memory errors.
-
+
Args:
input_path (str): Path to input FASTA file.
output_path (str): Path to output FASTA file.
accession_set (set, optional): If provided, only copy records whose ID is in this set.
-
- Returns:
+
+ Returns
+ -------
int: Number of records written.
+
"""
count = 0
skipped = 0
- with open(output_path, 'w', encoding='utf-8') as out_handle:
+ with open(output_path, "w", encoding="utf-8") as out_handle:
for record in FastaIO.parse(input_path, "fasta"):
if accession_set is not None and record.id not in accession_set:
skipped += 1
@@ -4032,7 +4317,7 @@ def _stream_copy_fasta(input_path, output_path, accession_set=None):
count += 1
if count % FASTA_STREAM_LOG_INTERVAL == 0:
logger.debug("Streamed %d FASTA records so far...", count)
-
+
if accession_set is not None:
logger.info("Stream-copied %d FASTA records (%d skipped by accession filter)", count, skipped)
else:
@@ -4041,51 +4326,52 @@ def _stream_copy_fasta(input_path, output_path, accession_set=None):
def _load_metadata_dict_from_temp_jsonl(temp_file_path):
- """
- Stream metadata records from a temporary JSONL file and build metadata_dict directly.
-
+ """Stream metadata records from a temporary JSONL file and build metadata_dict directly.
+
This function reads the JSONL file line-by-line, converting each raw API report to the internal metadata format. This avoids loading the entire raw API response list into memory at once.
-
+
The conversion logic mirrors load_metadata_from_api_reports() but processes
records one at a time from disk.
-
+
Args:
temp_file_path (str): Path to the temporary JSONL file containing raw API reports.
-
- Returns:
+
+ Returns
+ -------
dict: Dictionary mapping accession numbers to metadata dictionaries.
Same format as load_metadata_from_api_reports().
+
"""
metadata_dict = {}
processed_count = 0
skipped_count = 0
-
+
if not temp_file_path or not os.path.exists(temp_file_path):
logger.warning("Temporary metadata file not found: %s", temp_file_path)
return metadata_dict
-
+
file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
logger.info("Loading metadata from temp file: %s (%.2f MB)", temp_file_path, file_size_mb)
-
- with open(temp_file_path, 'r', encoding='utf-8') as f:
+
+ with open(temp_file_path, encoding="utf-8") as f:
for line_num, line in enumerate(f, 1):
line = line.strip()
if not line:
continue
-
+
try:
report = json.loads(line)
except json.JSONDecodeError:
skipped_count += 1
logger.debug("Skipping malformed JSON at line %d", line_num)
continue
-
+
# Extract the accession number
accession = report.get("accession")
-
+
if accession:
processed_count += 1
-
+
# Transform API report format to internal format (same as load_metadata_from_api_reports)
metadata = {
"accession": accession,
@@ -4103,8 +4389,8 @@ def _load_metadata_dict_from_temp_jsonl(temp_file_path):
"sourceDatabase": report.get("source_database", ""),
"isolateName": report.get("isolate", {}).get("name", ""),
"isolate": {
- 'collectionDate': report.get("isolate", {}).get("collection_date", ""),
- 'source': report.get("isolate", {}).get("source", ""),
+ "collectionDate": report.get("isolate", {}).get("collection_date", ""),
+ "source": report.get("isolate", {}).get("source", ""),
},
"virusTaxId": report.get("virus", {}).get("tax_id", None),
"virusName": report.get("virus", {}).get("organism_name", ""),
@@ -4119,62 +4405,62 @@ def _load_metadata_dict_from_temp_jsonl(temp_file_path):
"submitterCountry": report.get("submitter", {}).get("country", ""),
"submitterInstitution": report.get("submitter", {}).get("affiliation", ""),
}
-
+
metadata_dict[accession] = metadata
else:
skipped_count += 1
-
+
# Log progress for large files
if processed_count > 0 and processed_count % 500000 == 0:
logger.info(" ... processed %d records from temp file", processed_count)
-
- logger.info("Loaded %d metadata records from temp file (skipped %d)",
- processed_count, skipped_count)
-
+
+ logger.info("Loaded %d metadata records from temp file (skipped %d)", processed_count, skipped_count)
+
return metadata_dict
def _load_cached_metadata_from_jsonl(jsonl_path):
- """
- Load cached metadata from a JSONL file where records are already in internal format.
-
+ """Load cached metadata from a JSONL file where records are already in internal format.
+
Unlike _load_metadata_dict_from_temp_jsonl (which transforms raw API format), this function loads records that are already transformed (from process_cached_download). Each line is a JSON object with 'accession' key and all metadata fields directly.
-
+
Args:
jsonl_path (str): Path to the cached metadata JSONL file.
-
- Returns:
+
+ Returns
+ -------
dict: Dictionary mapping accession numbers to metadata dictionaries.
+
"""
metadata_dict = {}
processed_count = 0
-
+
if not jsonl_path or not os.path.exists(jsonl_path):
logger.warning("Cached metadata JSONL file not found: %s", jsonl_path)
return metadata_dict
-
+
file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024)
logger.info("Loading cached metadata from JSONL: %s (%.2f MB)", jsonl_path, file_size_mb)
-
- with open(jsonl_path, 'r', encoding='utf-8') as f:
+
+ with open(jsonl_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
-
+
try:
metadata = json.loads(line)
except json.JSONDecodeError:
continue
-
+
accession = metadata.get("accession")
if accession:
metadata_dict[accession] = metadata
processed_count += 1
-
+
if processed_count % 500000 == 0:
logger.info(" ... loaded %d cached metadata records", processed_count)
-
+
logger.info("Loaded %d cached metadata records from JSONL", processed_count)
return metadata_dict
@@ -4190,13 +4476,12 @@ def _stream_filter_cached_metadata_from_jsonl(
min_release_date=None,
applied_strategy_filters=None,
):
- """
- Stream cached metadata from a JSONL file, applying filters on-the-fly.
-
+ """Stream cached metadata from a JSONL file, applying filters on-the-fly.
+
This is the memory-efficient equivalent of loading ALL records into a dict
and then calling filter_cached_metadata_for_unused_filters(). Only records
that pass ALL filters are kept in memory.
-
+
Args:
jsonl_path (str): Path to the cached metadata JSONL file.
host (str, optional): Host organism filter.
@@ -4207,192 +4492,192 @@ def _stream_filter_cached_metadata_from_jsonl(
refseq_only (bool, optional): RefSeq only filter.
min_release_date (str, optional): Minimum release date filter.
applied_strategy_filters (list, optional): Filters already applied server-side.
-
- Returns:
+
+ Returns
+ -------
tuple: (metadata_dict, total_records, filter_stats)
- metadata_dict: dict mapping accession to metadata (only passing records)
- total_records: total number of records scanned
- filter_stats: dict with counts of records filtered by each category
+
"""
if applied_strategy_filters is None:
applied_strategy_filters = []
-
+
# Determine which filters to actually apply
filters_active = {}
- if 'host' not in applied_strategy_filters and host:
- filters_active['host'] = host
- if 'complete-only' not in applied_strategy_filters and complete_only:
- filters_active['complete_only'] = True
- if 'annotated' not in applied_strategy_filters and annotated:
- filters_active['annotated'] = True
- if 'lineage' not in applied_strategy_filters and lineage:
- filters_active['lineage'] = lineage
+ if "host" not in applied_strategy_filters and host:
+ filters_active["host"] = host
+ if "complete-only" not in applied_strategy_filters and complete_only:
+ filters_active["complete_only"] = True
+ if "annotated" not in applied_strategy_filters and annotated:
+ filters_active["annotated"] = True
+ if "lineage" not in applied_strategy_filters and lineage:
+ filters_active["lineage"] = lineage
if geographic_location:
- filters_active['geographic_location'] = geographic_location
+ filters_active["geographic_location"] = geographic_location
if refseq_only:
- filters_active['refseq_only'] = True
+ filters_active["refseq_only"] = True
if min_release_date:
- filters_active['min_release_date'] = min_release_date
-
+ filters_active["min_release_date"] = min_release_date
+
# Parse min_release_date once
min_release_date_parsed = None
- if 'min_release_date' in filters_active:
+ if "min_release_date" in filters_active:
min_release_date_parsed = _parse_date(min_release_date, filtername="min_release_date")
-
+
metadata_dict = {}
total_records = 0
filter_stats = {
- 'host': 0,
- 'complete_only': 0,
- 'annotated': 0,
- 'lineage': 0,
- 'geographic_location': 0,
- 'refseq_only': 0,
- 'min_release_date': 0,
+ "host": 0,
+ "complete_only": 0,
+ "annotated": 0,
+ "lineage": 0,
+ "geographic_location": 0,
+ "refseq_only": 0,
+ "min_release_date": 0,
}
-
+
if not jsonl_path or not os.path.exists(jsonl_path):
logger.warning("Cached metadata JSONL not found for streaming filter: %s", jsonl_path)
return metadata_dict, 0, filter_stats
-
+
file_size_mb = os.path.getsize(jsonl_path) / (1024 * 1024)
logger.info("Stream-filtering cached metadata from JSONL: %s (%.2f MB)", jsonl_path, file_size_mb)
if filters_active:
logger.info("Active filters: %s", list(filters_active.keys()))
else:
logger.info("No filters to apply — loading all records")
-
- with open(jsonl_path, 'r', encoding='utf-8') as f:
+
+ with open(jsonl_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
-
+
try:
metadata = json.loads(line)
except json.JSONDecodeError:
continue
-
+
accession = metadata.get("accession")
if not accession:
continue
-
+
total_records += 1
-
+
# Apply filters — skip record if any filter fails
skip = False
-
+
# Host filter
- if not skip and 'host' in filters_active:
- host_name = metadata.get('hostName', '')
+ if not skip and "host" in filters_active:
+ host_name = metadata.get("hostName", "")
if not host_name or host.lower() not in host_name.lower():
- filter_stats['host'] += 1
+ filter_stats["host"] += 1
skip = True
-
+
# Complete-only filter
- if not skip and 'complete_only' in filters_active:
- completeness = metadata.get('completeness', '')
- if not completeness or completeness.lower() != 'complete':
- filter_stats['complete_only'] += 1
+ if not skip and "complete_only" in filters_active:
+ completeness = metadata.get("completeness", "")
+ if not completeness or completeness.lower() != "complete":
+ filter_stats["complete_only"] += 1
skip = True
-
+
# Annotated filter
- if not skip and 'annotated' in filters_active:
- is_annotated = metadata.get('isAnnotated', False)
+ if not skip and "annotated" in filters_active:
+ is_annotated = metadata.get("isAnnotated", False)
if not is_annotated:
- filter_stats['annotated'] += 1
+ filter_stats["annotated"] += 1
skip = True
-
+
# Lineage filter
- if not skip and 'lineage' in filters_active:
- virus_pangolin = metadata.get('virusPangolinClassification', '')
+ if not skip and "lineage" in filters_active:
+ virus_pangolin = metadata.get("virusPangolinClassification", "")
if not virus_pangolin or lineage.lower() not in str(virus_pangolin).lower():
- filter_stats['lineage'] += 1
+ filter_stats["lineage"] += 1
skip = True
-
+
# Geographic location filter
- if not skip and 'geographic_location' in filters_active:
- geo_loc = metadata.get('location', '') or ''
- geo_region = metadata.get('region', '') or ''
- virus_name = metadata.get('virusName', '') or ''
+ if not skip and "geographic_location" in filters_active:
+ geo_loc = metadata.get("location", "") or ""
+ geo_region = metadata.get("region", "") or ""
+ virus_name = metadata.get("virusName", "") or ""
geo_filter = geographic_location.lower()
loc_matches = geo_loc and geo_filter in geo_loc.lower()
region_matches = geo_region and geo_filter in geo_region.lower()
virus_name_matches = virus_name and geo_filter in virus_name.lower()
if not loc_matches and not region_matches and not virus_name_matches:
- filter_stats['geographic_location'] += 1
+ filter_stats["geographic_location"] += 1
skip = True
-
+
# RefSeq only filter
- if not skip and 'refseq_only' in filters_active:
- is_refseq = metadata.get('sourceDatabase', '').lower() == 'refseq'
+ if not skip and "refseq_only" in filters_active:
+ is_refseq = metadata.get("sourceDatabase", "").lower() == "refseq"
if not is_refseq:
- filter_stats['refseq_only'] += 1
+ filter_stats["refseq_only"] += 1
skip = True
-
+
# Minimum release date filter
- if not skip and 'min_release_date' in filters_active and min_release_date_parsed:
- release_date_str = metadata.get('releaseDate', '')
+ if not skip and "min_release_date" in filters_active and min_release_date_parsed:
+ release_date_str = metadata.get("releaseDate", "")
if not release_date_str:
- filter_stats['min_release_date'] += 1
+ filter_stats["min_release_date"] += 1
skip = True
else:
try:
release_date = _parse_date(release_date_str, filtername="releaseDate")
if release_date and release_date < min_release_date_parsed:
- filter_stats['min_release_date'] += 1
+ filter_stats["min_release_date"] += 1
skip = True
except (ValueError, TypeError):
- filter_stats['min_release_date'] += 1
+ filter_stats["min_release_date"] += 1
skip = True
-
+
if not skip:
metadata_dict[accession] = metadata
-
+
# Log progress every 1M records
if total_records % 1000000 == 0:
- logger.info(" ... scanned %d records, %d passing filters so far",
- total_records, len(metadata_dict))
-
- logger.info("Stream-filter complete: scanned %d records, %d passed all filters",
- total_records, len(metadata_dict))
+ logger.info(" ... scanned %d records, %d passing filters so far", total_records, len(metadata_dict))
+
+ logger.info("Stream-filter complete: scanned %d records, %d passed all filters", total_records, len(metadata_dict))
if any(v > 0 for v in filter_stats.values()):
- logger.info("Filter statistics: %s",
- {k: v for k, v in filter_stats.items() if v > 0})
-
+ logger.info("Filter statistics: %s", {k: v for k, v in filter_stats.items() if v > 0})
+
return metadata_dict, total_records, filter_stats
def load_metadata_from_api_reports(api_reports):
- """
- Load metadata from API response reports into a dictionary.
-
+ """Load metadata from API response reports into a dictionary.
+
This function transforms the raw API response format into a standardized
internal metadata format that can be used by the filtering functions.
It maps API field names to the expected internal field names and handles
missing or null values appropriately.
-
+
Args:
api_reports (list): List of virus metadata reports from the NCBI API.
-
- Returns:
+
+ Returns
+ -------
dict: Dictionary mapping accession numbers to metadata dictionaries
Key: accession number (str)
Value: metadata dictionary with standardized field names
+
"""
metadata_dict = {}
processed_count = 0
skipped_count = 0
-
+
logger.debug("Processing %d API reports into metadata dictionary", len(api_reports))
-
+
for report in api_reports:
# Extract the accession number - this serves as our unique identifier
accession = report.get("accession")
-
+
if accession:
processed_count += 1
-
+
# Transform API report format to match expected internal metadata format
# Map API fields to expected internal field names with appropriate defaults
metadata = {
@@ -4400,7 +4685,9 @@ def load_metadata_from_api_reports(api_reports):
"length": report.get("length"), # Sequence length in nucleotides
# "source": "NCBI_REST_API",
"geneCount": report.get("gene_count"), # Number of genes annotated
- "completeness": (report.get("completeness") or "").lower(), # Completeness status (e.g., complete, partial)
+ "completeness": (
+ report.get("completeness") or ""
+ ).lower(), # Completeness status (e.g., complete, partial)
"host": report.get("host", {}), # Host organism details
"hostName": report.get("host", {}).get("organism_name", ""), # Host organism name
"hostTaxId": report.get("host", {}).get("tax_id", None), # Host taxonomy ID
@@ -4414,69 +4701,68 @@ def load_metadata_from_api_reports(api_reports):
"isolateName": report.get("isolate", {}).get("name", ""), # Isolate name
"isolate": {
# 'name': report.get("isolate", {}).get("name", ""),
- 'collectionDate': report.get("isolate", {}).get("collection_date", ""),
- 'source': report.get("isolate", {}).get("source", ""),
+ "collectionDate": report.get("isolate", {}).get("collection_date", ""),
+ "source": report.get("isolate", {}).get("source", ""),
},
"virusTaxId": report.get("virus", {}).get("tax_id", None), # Virus taxonomy and classification
"virusName": report.get("virus", {}).get("organism_name", ""), # Virus name
"isAnnotated": report.get("is_annotated", False), # Whether sequence is annotated
"releaseDate": report.get("release_date", ""), # When sequence was released
- # "sraAccessions": report.get("sra_accessions", []), # SRA read data accessions
- # "bioprojects": report.get("bioprojects", []), # Associated BioProject IDs
- # "biosample": report.get("biosample"), # BioSample ID
+ # "sraAccessions": report.get("sra_accessions", []), # SRA read data accessions
+ # "bioprojects": report.get("bioprojects", []), # Associated BioProject IDs
+ # "biosample": report.get("biosample"), # BioSample ID
"proteinCount": report.get("protein_count"), # Number of proteins
"maturePeptideCount": report.get("mature_peptide_count"), # Number of mature peptides
"segment": report.get("segment"), # Virus segment identifier (e.g., 'HA', 'NA', 'PB1')
"isVaccineStrain": report.get("is_vaccine_strain", False), # Whether this is a vaccine strain
- "virusPangolinClassification" : report.get("virus", {}).get("pangolin_classification", {}), # Pangolin lineage classification
- "submitterName" : report.get("submitter", {}).get("names", ""), # Submitter names
- "submitterCountry" : report.get("submitter", {}).get("country", ""), # Submitter country
- "submitterInstitution" : report.get("submitter", {}).get("affiliation", "") # Submitter institution
+ "virusPangolinClassification": report.get("virus", {}).get(
+ "pangolin_classification", {}
+ ), # Pangolin lineage classification
+ "submitterName": report.get("submitter", {}).get("names", ""), # Submitter names
+ "submitterCountry": report.get("submitter", {}).get("country", ""), # Submitter country
+ "submitterInstitution": report.get("submitter", {}).get("affiliation", ""), # Submitter institution
}
-
+
# Store the metadata using accession as the key
metadata_dict[accession] = metadata
- # logger.debug("Processed metadata for accession: %s (length: %s, host: %s)",
- # accession,
- # metadata.get("length"),
+ # logger.debug("Processed metadata for accession: %s (length: %s, host: %s)",
+ # accession,
+ # metadata.get("length"),
# metadata.get("host", {}).get("organism_name", "Unknown"))
-
+
else:
# Skip reports without accession numbers
skipped_count += 1
logger.warning("Skipping API report without accession number: %s", report)
-
- logger.info("Processed %d metadata records, skipped %d records without accessions",
- processed_count, skipped_count)
-
- return metadata_dict
+ logger.info("Processed %d metadata records, skipped %d records without accessions", processed_count, skipped_count)
+ return metadata_dict
def _check_protein_requirements(record, metadata, has_proteins, proteins_complete):
- """
- Check if a sequence meets protein/gene requirements based on FASTA header.
-
+ """Check if a sequence meets protein/gene requirements based on FASTA header.
+
This function validates whether a virus sequence contains required proteins
or genes by checking the FASTA header. For segmented viruses (like influenza),
this checks segment/protein labels in the sequence description.
-
+
The function extracts the protein/segment portion of the header by:
1. Splitting the description on the isolate name (if available in metadata)
2. Splitting by semicolons to get individual protein/segment parts
3. Using regex to match protein names (case-insensitive, handles quotes/parentheses)
-
+
Args:
record: FastaRecord object containing the sequence and description
metadata (dict): Metadata dictionary for this accession
has_proteins (str/list/None): Required protein(s)/gene(s) to check for
Can be a single string or list of strings
proteins_complete (bool): Whether proteins must be marked as "complete" in header
-
- Returns:
+
+ Returns
+ -------
bool: True if protein requirements are met, False otherwise
-
+
Example:
>>> # Check for HA segment
>>> _check_protein_requirements(record, metadata, "HA", False)
@@ -4484,12 +4770,12 @@ def _check_protein_requirements(record, metadata, has_proteins, proteins_complet
>>> # Check for multiple segments, requiring complete
>>> _check_protein_requirements(record, metadata, ["HA", "NA"], True)
True # Only if both HA and NA are present AND marked "complete"
+
"""
-
# If no protein filter specified and proteins_complete is False, pass through
if has_proteins is None and not proteins_complete:
return True
-
+
# If only proteins_complete is True but no specific proteins required,
# we can't check completion status without knowing which proteins to look for
if has_proteins is None and proteins_complete:
@@ -4501,11 +4787,11 @@ def _check_protein_requirements(record, metadata, has_proteins, proteins_complet
logger.debug("Sequence %s has no protein/gene annotations", record.id)
return False
return True
-
+
# Convert single string to list for uniform processing
if isinstance(has_proteins, str):
has_proteins = [has_proteins]
-
+
try:
# Extract the protein/segment portion of the header
# If isolate name exists in metadata, split on it to get just the protein info
@@ -4516,75 +4802,69 @@ def _check_protein_requirements(record, metadata, has_proteins, proteins_complet
# If sample name was not added to metadata,
# whole header will be searched for protein/segment names
prot_header = record.description
-
+
# Split header into parts by semicolon for checking individual annotations
prot_parts = prot_header.split(";")
-
+
# Check that ALL required proteins are present
for protein in has_proteins:
# Dynamically create regex for each protein with case insensitivity
# Handles optional quotes, parentheses around protein names
regex = rf"(?i)\b['\",]?\(?{re.escape(protein)}\)?['\",]?\b"
-
+
if proteins_complete:
# Only keeping sequences for which proteins are marked as "complete"
- if not any(
- re.search(regex, part) and "complete" in part.lower()
- for part in prot_parts
- ):
- logger.debug("Sequence %s: protein '%s' not found or not complete",
- record.id, protein)
+ if not any(re.search(regex, part) and "complete" in part.lower() for part in prot_parts):
+ logger.debug("Sequence %s: protein '%s' not found or not complete", record.id, protein)
return False
else:
# Just check if protein name appears anywhere in header parts
if not any(re.search(regex, part) for part in prot_parts):
- logger.debug("Sequence %s: required protein '%s' not found in header",
- record.id, protein)
+ logger.debug("Sequence %s: required protein '%s' not found in header", record.id, protein)
return False
-
- logger.debug("Sequence %s passed protein requirements: %s (complete=%s)",
- record.id, has_proteins, proteins_complete)
+
+ logger.debug(
+ "Sequence %s passed protein requirements: %s (complete=%s)", record.id, has_proteins, proteins_complete
+ )
return True
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.warning(
- f"The 'has_proteins' filter could not be applied to sequence {record.id} "
- f"due to the following error:\n{e}"
+ f"The 'has_proteins' filter could not be applied to sequence {record.id} due to the following error:\n{e}"
)
# On error, exclude the sequence (conservative approach)
return False
def _extract_protein_info_from_header(description, metadata=None):
- """
- Extract protein/segment information from FASTA header.
-
+ """Extract protein/segment information from FASTA header.
+
This function extracts the protein/segment portion of the FASTA description
by splitting on the isolate name (if available in metadata). This is particularly
important for segmented viruses like influenza.
-
+
The extraction logic matches the original Laura_OG implementation:
1. If isolate name exists in metadata, split description on it and take the last part
2. Otherwise, use the entire description as the protein/segment info
-
+
Args:
description (str): FASTA header/description line
metadata (dict, optional): Metadata dictionary that may contain isolate name
-
- Returns:
+
+ Returns
+ -------
str: Extracted protein/segment information, or pd.NA if extraction fails
-
+
Example:
>>> _extract_protein_info_from_header(
- ... "NC_001234 A/California/07/2009 HA; complete cds",
- ... {"isolate": {"name": "A/California/07/2009"}}
+ ... "NC_001234 A/California/07/2009 HA; complete cds", {"isolate": {"name": "A/California/07/2009"}}
... )
" HA; complete cds"
+
"""
-
if not description:
return pd.NA
-
+
try:
# If isolate name exists in metadata, split on it to get just the protein info
if metadata is not None:
@@ -4592,12 +4872,12 @@ def _extract_protein_info_from_header(description, metadata=None):
if isolate_name:
prot_header = description.split(isolate_name)[-1]
return prot_header
-
+
# If sample name was not added to metadata,
# whole header will be added as protein/segment description
return description
-
- except Exception:
+
+ except Exception: # noqa: BLE001
return pd.NA
@@ -4609,17 +4889,16 @@ def filter_sequences(
proteins_complete=False,
output_fasta_path=None,
):
- """
- Apply sequence-dependent filters to downloaded sequences.
-
+ """Apply sequence-dependent filters to downloaded sequences.
+
Applies filters requiring actual sequence data (ambiguous character counting,
protein/feature analysis). Metadata-only filters should be applied by
filter_metadata_only before downloading sequences.
-
+
When output_fasta_path is provided, filtered sequences are streamed directly
to the output file instead of accumulating in memory. This is critical for
large datasets (millions of sequences) that would otherwise exhaust system RAM.
-
+
Args:
fna_file (str): Path to FASTA file containing sequences.
metadata_dict (dict): Dictionary mapping accession numbers to metadata.
@@ -4628,29 +4907,31 @@ def filter_sequences(
proteins_complete (bool): Whether proteins must be complete.
output_fasta_path (str, optional): Path to write filtered sequences directly.
When provided, sequences are streamed to disk instead of held in memory.
-
- Returns:
+
+ Returns
+ -------
tuple: (filtered_count, filtered_metadata, protein_headers)
- filtered_count (int): Number of sequences that passed all filters.
- filtered_metadata (list): Metadata dicts for sequences passing filters.
- protein_headers (list): Protein/segment info from FASTA headers.
+
"""
logger.info("Applying sequence-dependent filters...")
- logger.debug("Sequence filters: max_ambiguous=%s, complete=%s, streaming=%s",
- max_ambiguous_chars, proteins_complete, output_fasta_path is not None)
-
+ logger.debug(
+ "Sequence filters: max_ambiguous=%s, complete=%s, streaming=%s",
+ max_ambiguous_chars,
+ proteins_complete,
+ output_fasta_path is not None,
+ )
+
# Initialize lists to store filtered results (metadata is small, kept in memory)
- filtered_metadata = [] # Will store corresponding metadata dictionaries
- protein_headers = [] # Will store protein/segment information from FASTA headers
- filtered_count = 0 # Count of sequences passing all filters
-
+ filtered_metadata = [] # Will store corresponding metadata dictionaries
+ protein_headers = [] # Will store protein/segment information from FASTA headers
+ filtered_count = 0 # Count of sequences passing all filters
+
# Counters for logging filter statistics
total_sequences = 0
- filter_stats = {
- 'seq_length': 0,
- 'ambiguous_chars': 0,
- 'proteins': 0
- }
+ filter_stats = {"seq_length": 0, "ambiguous_chars": 0, "proteins": 0}
# Read and process sequences from the FASTA file
# When output_fasta_path is set, write passing records directly to disk
@@ -4658,27 +4939,27 @@ def filter_sequences(
output_handle = None
try:
if output_fasta_path:
- output_handle = open(output_fasta_path, 'w', encoding='utf-8')
+ output_handle = open(output_fasta_path, "w", encoding="utf-8")
logger.info("Streaming filtered sequences directly to: %s", output_fasta_path)
-
+
for record in FastaIO.parse(fna_file, "fasta"):
total_sequences += 1
record_passes = True
-
+
# Normalize accession by taking only the first part (before space)
- record_accession = record.id.split()[0] if hasattr(record, 'id') else str(record)
-
+ record_accession = record.id.split()[0] if hasattr(record, "id") else str(record)
+
# Count ambiguous characters (N's)
if max_ambiguous_chars is not None:
- ambiguous_count = record.seq.upper().count('N')
+ ambiguous_count = record.seq.upper().count("N")
if ambiguous_count > max_ambiguous_chars:
- filter_stats['ambiguous_chars'] += 1
+ filter_stats["ambiguous_chars"] += 1
record_passes = False
continue
-
+
# Get metadata for this record to check protein information
record_metadata = metadata_dict.get(record_accession, {})
-
+
if proteins_complete:
protein_count = record_metadata.get("proteinCount", 0)
gene_count = record_metadata.get("geneCount", 0)
@@ -4686,21 +4967,20 @@ def filter_sequences(
if gene_count is None or gene_count == 0:
logger.debug("Sequence %s has no protein/gene annotations", record.id)
record_passes = False
- filter_stats['proteins'] += 1
+ filter_stats["proteins"] += 1
continue
-
+
# If sequence passed all filters, keep it and its metadata
if record_passes:
filtered_count += 1
filtered_metadata.append(record_metadata)
-
+
# Write directly to output file if streaming (memory-efficient)
if output_handle:
_write_fasta_record(output_handle, record)
-
+
if filtered_count % FASTA_STREAM_LOG_INTERVAL == 0:
- logger.debug("Processed %d sequences, %d passed filters so far...",
- total_sequences, filtered_count)
+ logger.debug("Processed %d sequences, %d passed filters so far...", total_sequences, filtered_count)
finally:
if output_handle:
output_handle.close()
@@ -4708,11 +4988,11 @@ def filter_sequences(
# Log filtering results
logger.info("Sequence filter results:")
logger.info("- Total sequences processed: %d", total_sequences)
- logger.info("- Filtered out because of sequence length: %d", filter_stats['seq_length'])
- logger.info("- Filtered out because of number of ambiguous characters: %d", filter_stats['ambiguous_chars'])
- logger.info("- Filtered out because of protein requirements: %d", filter_stats['proteins'])
+ logger.info("- Filtered out because of sequence length: %d", filter_stats["seq_length"])
+ logger.info("- Filtered out because of number of ambiguous characters: %d", filter_stats["ambiguous_chars"])
+ logger.info("- Filtered out because of protein requirements: %d", filter_stats["proteins"])
logger.info("- Sequences passing all filters: %d", filtered_count)
-
+
return filtered_count, filtered_metadata, protein_headers, filter_stats
@@ -4743,30 +5023,28 @@ def save_command_summary(
total_after_genbank_filter=None,
total_after_sequence_filter=None,
):
- """
- Save a summary file documenting the command execution and results.
-
+ """Save a summary file documenting the command execution and results.
+
Creates a comprehensive summary including command line, statistics,
output files, and any errors encountered.
"""
-
# Get versions if not provided
if gget_version is None:
gget_version = _get_gget_version()
-
+
summary_file = os.path.join(outfolder, "command_summary.txt")
-
+
try:
- with open(summary_file, 'w', encoding='utf-8') as f:
+ with open(summary_file, "w", encoding="utf-8") as f:
# Header
f.write("=" * 80 + "\n")
f.write("GGET VIRUS COMMAND SUMMARY\n")
f.write("=" * 80 + "\n\n")
-
+
# Timestamp
f.write(f"Execution Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
f.write(f"Output Folder: {outfolder}\n\n")
-
+
# Version information
f.write("-" * 80 + "\n")
f.write("SOFTWARE VERSIONS\n")
@@ -4775,13 +5053,13 @@ def save_command_summary(
if datasets_version is not None:
f.write(f"{datasets_version}\n")
f.write("\n")
-
+
# Command line
f.write("-" * 80 + "\n")
f.write("COMMAND LINE\n")
f.write("-" * 80 + "\n")
f.write(f"{command_line}\n\n")
-
+
# Execution status
f.write("-" * 80 + "\n")
f.write("EXECUTION STATUS\n")
@@ -4796,7 +5074,7 @@ def save_command_summary(
f.write("✗ Command failed\n")
if error_message:
f.write(f"Error: {error_message}\n\n")
-
+
# Runtime
if runtime_seconds is not None:
f.write("-" * 80 + "\n")
@@ -4805,31 +5083,33 @@ def save_command_summary(
hours, remainder = divmod(int(runtime_seconds), 3600)
minutes, seconds = divmod(remainder, 60)
if hours > 0:
- f.write(f"Total wall-clock time: {hours}h {minutes}m {seconds}s ({runtime_seconds:.1f} seconds)\n\n")
+ f.write(
+ f"Total wall-clock time: {hours}h {minutes}m {seconds}s ({runtime_seconds:.1f} seconds)\n\n"
+ )
elif minutes > 0:
f.write(f"Total wall-clock time: {minutes}m {seconds}s ({runtime_seconds:.1f} seconds)\n\n")
else:
f.write(f"Total wall-clock time: {runtime_seconds:.1f} seconds\n\n")
-
+
# Memory usage
if memory_info:
f.write("-" * 80 + "\n")
f.write("MEMORY USAGE\n")
f.write("-" * 80 + "\n")
- if memory_info.get('rss_mb') is not None:
+ if memory_info.get("rss_mb") is not None:
f.write(f"Process RSS (resident memory): {memory_info['rss_mb']:.1f} MB\n")
- if memory_info.get('vms_mb') is not None:
+ if memory_info.get("vms_mb") is not None:
f.write(f"Process VMS (virtual memory): {memory_info['vms_mb']:.1f} MB\n")
- if memory_info.get('percent') is not None:
+ if memory_info.get("percent") is not None:
f.write(f"Process memory percent: {memory_info['percent']:.1f}%\n")
- if memory_info.get('total_mb') is not None:
+ if memory_info.get("total_mb") is not None:
f.write(f"System total memory: {memory_info['total_mb']:.0f} MB\n")
- if memory_info.get('available_mb') is not None:
+ if memory_info.get("available_mb") is not None:
f.write(f"System available memory: {memory_info['available_mb']:.0f} MB\n")
- if memory_info.get('system_percent') is not None:
+ if memory_info.get("system_percent") is not None:
f.write(f"System memory used: {memory_info['system_percent']:.1f}%\n")
f.write("\n")
-
+
# Statistics
f.write("-" * 80 + "\n")
f.write("SEQUENCE STATISTICS\n")
@@ -4848,14 +5128,14 @@ def save_command_summary(
if total_after_sequence_filter is not None:
f.write(f"After sequence filtering: {total_after_sequence_filter}\n")
f.write(f"Final sequences (after all filters): {total_final_sequences}\n\n")
-
+
# Filter breakdown by stage
- any_filter_stats = (metadata_filter_stats or genbank_filter_stats or sequence_filter_stats)
+ any_filter_stats = metadata_filter_stats or genbank_filter_stats or sequence_filter_stats
if any_filter_stats:
f.write("-" * 80 + "\n")
f.write("FILTER BREAKDOWN BY STAGE\n")
f.write("-" * 80 + "\n")
-
+
if metadata_filter_stats:
active_meta = {k: v for k, v in metadata_filter_stats.items() if v > 0}
if active_meta:
@@ -4864,7 +5144,7 @@ def save_command_summary(
f.write(f" {filter_name}: {count}\n")
else:
f.write("\nMetadata filtering: no records excluded\n")
-
+
if genbank_filter_stats:
active_gb = {k: v for k, v in genbank_filter_stats.items() if v > 0}
if active_gb:
@@ -4873,7 +5153,7 @@ def save_command_summary(
f.write(f" {filter_name}: {count}\n")
else:
f.write("\nGenBank metadata filtering: no records excluded\n")
-
+
if sequence_filter_stats:
active_seq = {k: v for k, v in sequence_filter_stats.items() if v > 0}
if active_seq:
@@ -4882,9 +5162,9 @@ def save_command_summary(
f.write(f" {filter_name}: {count}\n")
else:
f.write("\nSequence filtering: no records excluded\n")
-
+
f.write("\n")
-
+
# Partial metadata recovery information
if partial_metadata_file:
f.write("-" * 80 + "\n")
@@ -4892,20 +5172,22 @@ def save_command_summary(
f.write("-" * 80 + "\n")
f.write(f"Partial metadata saved: {partial_metadata_file}\n")
if recovery_command:
- f.write(f"\nRecovery command:\n")
+ f.write("\nRecovery command:\n")
f.write(f" {recovery_command}\n")
f.write("\n")
-
+
# Detailed statistics from metadata
if filtered_metadata and len(filtered_metadata) > 0:
f.write("-" * 80 + "\n")
f.write("DETAILED STATISTICS\n")
f.write("-" * 80 + "\n")
-
+
# Unique hosts
hosts = set()
for meta in filtered_metadata:
- host_name = meta.get('host', {}).get('organism_name') if isinstance(meta.get('host'), dict) else None
+ host_name = (
+ meta.get("host", {}).get("organism_name") if isinstance(meta.get("host"), dict) else None
+ )
if host_name:
hosts.add(host_name)
f.write(f"Unique hosts: {len(hosts)}\n")
@@ -4917,11 +5199,11 @@ def save_command_summary(
for host in sorted(hosts)[:20]:
f.write(f" - {host}\n")
f.write("\n")
-
+
# Unique geographic locations
locations = set()
for meta in filtered_metadata:
- location = meta.get('location')
+ location = meta.get("location")
if location:
locations.add(location)
f.write(f"Unique geographic locations: {len(locations)}\n")
@@ -4933,37 +5215,37 @@ def save_command_summary(
for loc in sorted(locations)[:20]:
f.write(f" - {loc}\n")
f.write("\n")
-
+
# Sequence length statistics
- lengths = [meta.get('length') for meta in filtered_metadata if meta.get('length')]
+ lengths = [meta.get("length") for meta in filtered_metadata if meta.get("length")]
if lengths:
f.write(f"Sequence length range: {min(lengths)} - {max(lengths)} bp\n")
f.write(f"Average sequence length: {sum(lengths) / len(lengths):.0f} bp\n\n")
-
+
# Completeness breakdown
completeness_counts = {}
for meta in filtered_metadata:
- comp = meta.get('completeness', 'unknown')
+ comp = meta.get("completeness", "unknown")
completeness_counts[comp] = completeness_counts.get(comp, 0) + 1
f.write("Completeness breakdown:\n")
for comp, count in sorted(completeness_counts.items()):
f.write(f" - {comp}: {count}\n")
f.write("\n")
-
+
# Source database breakdown
source_counts = {}
for meta in filtered_metadata:
- source = meta.get('sourceDatabase', 'unknown')
+ source = meta.get("sourceDatabase", "unknown")
source_counts[source] = source_counts.get(source, 0) + 1
f.write("Source database breakdown:\n")
for source, count in sorted(source_counts.items()):
f.write(f" - {source}: {count}\n")
f.write("\n")
-
+
# Submitter countries
countries = set()
for meta in filtered_metadata:
- country = meta.get('submitterCountry')
+ country = meta.get("submitterCountry")
if country:
countries.add(country)
f.write(f"Unique submitter countries: {len(countries)}\n")
@@ -4975,7 +5257,7 @@ def save_command_summary(
for country in sorted(countries)[:20]:
f.write(f" - {country}\n")
f.write("\n")
-
+
# Output files
f.write("-" * 80 + "\n")
f.write("OUTPUT FILES\n")
@@ -4990,101 +5272,113 @@ def save_command_summary(
else:
f.write("No output files generated\n")
f.write("\n")
-
+
# Failed operations - if any occurred
if failed_commands:
has_failures = False
-
+
# Check for API timeouts
- if failed_commands.get('api_timeout'):
+ if failed_commands.get("api_timeout"):
if not has_failures:
f.write("-" * 80 + "\n")
f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY REQUIRED\n")
f.write("-" * 80 + "\n")
has_failures = True
- timeout_info = failed_commands['api_timeout']
- f.write(f"\n📍 API TIMEOUT:\n")
+ timeout_info = failed_commands["api_timeout"]
+ f.write("\n📍 API TIMEOUT:\n")
f.write(f" Error: {timeout_info.get('error', 'Unknown')}\n")
f.write(f" URL: {timeout_info.get('url', 'Unknown')}\n")
- f.write(f" Recommendation: Try again later or use different filters\n\n")
-
+ f.write(" Recommendation: Try again later or use different filters\n\n")
+
# Check for empty API response
- if failed_commands.get('empty_response'):
+ if failed_commands.get("empty_response"):
if not has_failures:
f.write("-" * 80 + "\n")
f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY REQUIRED\n")
f.write("-" * 80 + "\n")
has_failures = True
- empty_resp_info = failed_commands['empty_response']
- f.write(f"\n📍 EMPTY API RESPONSE:\n")
+ empty_resp_info = failed_commands["empty_response"]
+ f.write("\n📍 EMPTY API RESPONSE:\n")
f.write(f" Error: {empty_resp_info.get('error', 'Unknown')}\n")
- f.write(f" Recommendation: Check your virus identifier or try different filter parameters\n\n")
-
+ f.write(" Recommendation: Check your virus identifier or try different filter parameters\n\n")
+
# Check for failed API batches
- if failed_commands.get('api_batches'):
+ if failed_commands.get("api_batches"):
if not has_failures:
f.write("-" * 80 + "\n")
f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY REQUIRED\n")
f.write("-" * 80 + "\n")
has_failures = True
f.write(f"\n📍 FAILED METADATA BATCHES ({len(failed_commands['api_batches'])} batches):\n")
- for batch_info in failed_commands['api_batches'][:5]: # Show first 5
- f.write(f"\n Batch {batch_info.get('batch_num', '?')}: {batch_info.get('accession_count', '?')} accessions\n")
+ for batch_info in failed_commands["api_batches"][:5]: # Show first 5
+ f.write(
+ f"\n Batch {batch_info.get('batch_num', '?')}: {batch_info.get('accession_count', '?')} accessions\n"
+ )
f.write(f" Error: {batch_info.get('error', 'Unknown')}\n")
f.write(f" API URL: {batch_info.get('api_url', 'Unknown')}\n")
- if len(failed_commands['api_batches']) > 5:
+ if len(failed_commands["api_batches"]) > 5:
f.write(f"\n ... and {len(failed_commands['api_batches']) - 5} more failed batches\n")
f.write("\n")
-
+
# Check for pagination errors/timeouts
- if failed_commands.get('pagination_timeouts') or failed_commands.get('pagination_errors'):
+ if failed_commands.get("pagination_timeouts") or failed_commands.get("pagination_errors"):
has_failures = True
- if not (failed_commands.get('api_batches') or failed_commands.get('api_timeout')):
+ if not (failed_commands.get("api_batches") or failed_commands.get("api_timeout")):
f.write("-" * 80 + "\n")
f.write("⚠️ FAILED OPERATIONS - PARTIAL RESULTS OBTAINED\n")
f.write("-" * 80 + "\n")
-
- if failed_commands.get('pagination_timeouts'):
+
+ if failed_commands.get("pagination_timeouts"):
f.write(f"\n📍 PAGINATION TIMEOUTS ({len(failed_commands['pagination_timeouts'])} pages):\n")
- for page_info in failed_commands['pagination_timeouts'][:3]:
- f.write(f" Page {page_info.get('page', '?')}: {page_info.get('records_retrieved', 0)} records retrieved\n")
+ for page_info in failed_commands["pagination_timeouts"][:3]:
+ f.write(
+ f" Page {page_info.get('page', '?')}: {page_info.get('records_retrieved', 0)} records retrieved\n"
+ )
f.write(f" Error: {page_info.get('error', 'Unknown')}\n")
-
- if failed_commands.get('pagination_errors'):
+
+ if failed_commands.get("pagination_errors"):
f.write(f"\n📍 PAGINATION ERRORS ({len(failed_commands['pagination_errors'])} pages):\n")
- for page_info in failed_commands['pagination_errors'][:3]:
- f.write(f" Page {page_info.get('page', '?')}: {page_info.get('error_type', 'Unknown')} error\n")
+ for page_info in failed_commands["pagination_errors"][:3]:
+ f.write(
+ f" Page {page_info.get('page', '?')}: {page_info.get('error_type', 'Unknown')} error\n"
+ )
f.write(f" Error: {page_info.get('error', 'Unknown')}\n")
-
+
# Check for sequence download failures
- if failed_commands.get('sequence_batches'):
+ if failed_commands.get("sequence_batches"):
has_failures = True
- if not (failed_commands.get('api_batches') or failed_commands.get('api_timeout') or
- failed_commands.get('pagination_timeouts') or failed_commands.get('pagination_errors')):
+ if not (
+ failed_commands.get("api_batches")
+ or failed_commands.get("api_timeout")
+ or failed_commands.get("pagination_timeouts")
+ or failed_commands.get("pagination_errors")
+ ):
f.write("-" * 80 + "\n")
f.write("⚠️ FAILED OPERATIONS - MANUAL RETRY AVAILABLE\n")
f.write("-" * 80 + "\n")
-
- f.write(f"\n📍 FAILED SEQUENCE DOWNLOAD BATCHES ({len(failed_commands['sequence_batches'])} batches):\n")
- for batch_info in failed_commands['sequence_batches'][:5]:
+
+ f.write(
+ f"\n📍 FAILED SEQUENCE DOWNLOAD BATCHES ({len(failed_commands['sequence_batches'])} batches):\n"
+ )
+ for batch_info in failed_commands["sequence_batches"][:5]:
f.write(f"\n Batch {batch_info.get('batch_num', '?')}\n")
f.write(f" Error: {batch_info.get('error', 'Unknown')}\n")
f.write(f" Retry URL: {batch_info.get('retry_url', 'Unknown')}\n")
- if len(failed_commands['sequence_batches']) > 5:
+ if len(failed_commands["sequence_batches"]) > 5:
f.write(f"\n ... and {len(failed_commands['sequence_batches']) - 5} more failed batches\n")
-
+
# Check for single sequence fetch failures
- if failed_commands.get('sequence_fetch'):
+ if failed_commands.get("sequence_fetch"):
has_failures = True
f.write(f"\n📍 SEQUENCE FETCH FAILURES ({len(failed_commands['sequence_fetch'])} operations):\n")
- for fetch_info in failed_commands['sequence_fetch'][:3]:
+ for fetch_info in failed_commands["sequence_fetch"][:3]:
f.write(f"\n Operation: {fetch_info.get('operation', 'Unknown')}\n")
f.write(f" Accessions: {fetch_info.get('accession_count', '?')}\n")
f.write(f" Error: {fetch_info.get('error', 'Unknown')}\n")
f.write(f" Retry URL: {fetch_info.get('retry_url', 'Unknown')}\n")
- if len(failed_commands['sequence_fetch']) > 3:
+ if len(failed_commands["sequence_fetch"]) > 3:
f.write(f"\n ... and {len(failed_commands['sequence_fetch']) - 3} more failures\n")
-
+
if has_failures:
f.write("\n💡 RECOVERY INSTRUCTIONS:\n")
f.write(" 1. Copy the URL from above and paste it into your browser\n")
@@ -5092,10 +5386,10 @@ def save_command_summary(
f.write(" 3. Retry the command with updated filters (e.g., stricter date ranges)\n")
f.write(" 4. If the issue persists, NCBI servers may be temporarily unavailable\n")
if partial_metadata_file:
- f.write(f"\n 5. RESUME with baseline deduplication:\n")
+ f.write("\n 5. RESUME with baseline deduplication:\n")
f.write(f" {recovery_command}\n")
f.write("\n")
-
+
# Footer
f.write("=" * 80 + "\n")
f.write("END OF SUMMARY\n")
@@ -5104,158 +5398,155 @@ def save_command_summary(
logger.info("=" * 60)
logger.info("✅ Command summary saved: %s", summary_file)
return summary_file
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.error("Failed to save command summary: %s", e)
logger.error("Traceback: %s", traceback.format_exc())
return None
def merge_metadata_csvs(genbank_csv_path, standard_csv_path):
- """
- Merge standard metadata CSV into GenBank metadata CSV.
-
+ """Merge standard metadata CSV into GenBank metadata CSV.
+
Where GenBank data is missing, fills in values from the standard metadata CSV.
Does not overwrite any existing data in the GenBank CSV.
-
+
Args:
genbank_csv_path (str): Path to the GenBank metadata CSV file
standard_csv_path (str): Path to the standard metadata CSV file
-
- Returns:
+
+ Returns
+ -------
bool: True if merge was successful, False otherwise
+
"""
try:
if not os.path.exists(standard_csv_path):
logger.debug("Standard metadata CSV not found, skipping merge: %s", standard_csv_path)
return False
-
+
logger.info("Merging standard metadata into GenBank metadata...")
-
+
# Read both CSV files - use dtype=str to avoid type conversion issues
genbank_df = pd.read_csv(genbank_csv_path, dtype=str)
standard_df = pd.read_csv(standard_csv_path, dtype=str)
-
+
logger.debug("GenBank CSV: %d rows × %d columns", len(genbank_df), len(genbank_df.columns))
logger.debug("Standard CSV: %d rows × %d columns", len(standard_df), len(standard_df.columns))
-
+
# Create a mapping from accession to standard metadata for quick lookup
standard_by_accession = {}
- if 'accession' in standard_df.columns:
+ if "accession" in standard_df.columns:
for _, row in standard_df.iterrows():
- acc = row['accession']
- if pd.notna(acc) and str(acc).strip() and str(acc) != 'nan':
+ acc = row["accession"]
+ if pd.notna(acc) and str(acc).strip() and str(acc) != "nan":
standard_by_accession[str(acc)] = row
-
+
logger.debug("Indexed %d accessions from standard metadata", len(standard_by_accession))
-
+
# Fill missing values in genbank_df from standard_df
rows_updated = 0
columns_updated = 0
-
+
for idx, row in genbank_df.iterrows():
- accession = str(row['accession']).strip() if pd.notna(row['accession']) else None
-
- if accession and accession != 'nan' and accession in standard_by_accession:
+ accession = str(row["accession"]).strip() if pd.notna(row["accession"]) else None
+
+ if accession and accession != "nan" and accession in standard_by_accession:
standard_row = standard_by_accession[accession]
-
+
# For each column in genbank_df, if the value is NaN/empty, fill from standard
for col in genbank_df.columns:
if col in standard_row.index:
genbank_val = str(row[col]).strip() if pd.notna(row[col]) else None
standard_val = str(standard_row[col]).strip() if pd.notna(standard_row[col]) else None
-
+
# Fill if genbank is empty but standard has data
- if (not genbank_val or genbank_val == 'nan') and standard_val and standard_val != 'nan':
+ if (not genbank_val or genbank_val == "nan") and standard_val and standard_val != "nan":
genbank_df.at[idx, col] = standard_val
columns_updated += 1
-
+
if columns_updated > 0:
rows_updated += 1
-
+
# Save the merged dataframe back to the genbank CSV
- genbank_df.to_csv(genbank_csv_path, index=False, encoding='utf-8')
-
- logger.info("✅ Metadata merge complete: updated %d cells across %d rows",
- columns_updated, rows_updated)
+ genbank_df.to_csv(genbank_csv_path, index=False, encoding="utf-8")
+
+ logger.info("✅ Metadata merge complete: updated %d cells across %d rows", columns_updated, rows_updated)
logger.debug("Merged GenBank CSV: %d rows × %d columns", len(genbank_df), len(genbank_df.columns))
-
+
return True
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to merge metadata CSVs: %s", e)
logger.debug("Exception details:", exc_info=True)
return False
def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_file):
- """
- Save filtered metadata to a CSV file with a specific column order.
-
+ """Save filtered metadata to a CSV file with a specific column order.
+
This function creates a comprehensive CSV file containing all relevant metadata
for the filtered virus sequences. The output format is designed to be compatible
with downstream analysis tools like Delphy.
-
+
Args:
filtered_metadata (list): List of metadata dictionaries for filtered sequences
protein_headers (list): List of protein/segment information extracted from headers
output_metadata_file (str): Path to the output CSV file
-
+
Note:
The column order is specifically designed to match requirements for
phylogenetic analysis tools and provides a standardized format.
"""
-
logger.info("Preparing metadata for CSV output...")
- logger.debug("Processing %d metadata records with %d protein headers",
- len(filtered_metadata), len(protein_headers))
+ logger.debug("Processing %d metadata records with %d protein headers", len(filtered_metadata), len(protein_headers))
# Define the column order for the output CSV
# This order prioritizes the most commonly used fields and matches
# the format expected by downstream analysis tools
columns = [
- "accession", # Primary identifier (lowercase for Delphy compatibility)
- "Organism Name", # Virus species/strain name
- "GenBank/RefSeq", # Source database (GenBank or RefSeq)
- "Submitters", # Names of sequence submitters
- "Organization", # Submitting organization/institution
- "Submitter Country", # Country of submitting organization
- "Release date", # Date when sequence was released to public databases
- "Isolate", # Isolate/sample identifier
- "Virus Lineage", # Taxonomic lineage of the virus
- "Length", # Sequence length in base pairs
- "Nuc Completeness", # Completeness status (complete/partial)
+ "accession", # Primary identifier (lowercase for Delphy compatibility)
+ "Organism Name", # Virus species/strain name
+ "GenBank/RefSeq", # Source database (GenBank or RefSeq)
+ "Submitters", # Names of sequence submitters
+ "Organization", # Submitting organization/institution
+ "Submitter Country", # Country of submitting organization
+ "Release date", # Date when sequence was released to public databases
+ "Isolate", # Isolate/sample identifier
+ "Virus Lineage", # Taxonomic lineage of the virus
+ "Length", # Sequence length in base pairs
+ "Nuc Completeness", # Completeness status (complete/partial)
"Proteins/Segments", # Protein/segment information from FASTA headers
- "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
+ "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
"Is Vaccine Strain", # Whether this sequence is from a vaccine strain
"Geographic Region", # Geographic region where sample was collected
- "Geographic Location",# Specific geographic location
- "Host", # Host organism name
- "Host Lineage", # Taxonomic lineage of host organism
- "Lab Host", # Whether sample was lab-passaged
- "Tissue/Specimen/Source", # Sample source/tissue type
- "Collection Date", # Date when sample was collected
- "Sample Name", # Sample identifier
- "Annotated", # Whether sequence has annotation data
- "SRA Accessions", # Associated SRA (sequencing) accessions
- "Bioprojects", # Associated BioProject identifiers
- "Biosample", # BioSample identifier
- "Protein count", # Number of proteins annotated
- "Gene count", # Number of genes annotated
- "Mature Peptide Count", # Number of mature peptides annotated
+ "Geographic Location", # Specific geographic location
+ "Host", # Host organism name
+ "Host Lineage", # Taxonomic lineage of host organism
+ "Lab Host", # Whether sample was lab-passaged
+ "Tissue/Specimen/Source", # Sample source/tissue type
+ "Collection Date", # Date when sample was collected
+ "Sample Name", # Sample identifier
+ "Annotated", # Whether sequence has annotation data
+ "SRA Accessions", # Associated SRA (sequencing) accessions
+ "Bioprojects", # Associated BioProject identifiers
+ "Biosample", # BioSample identifier
+ "Protein count", # Number of proteins annotated
+ "Gene count", # Number of genes annotated
+ "Mature Peptide Count", # Number of mature peptides annotated
# Additional GenBank columns
- "definition", # GenBank sequence definition
- "strain", # Strain information
- "isolation_source", # Source of isolation
- "create_date", # GenBank creation date
- "update_date", # GenBank update date
- "assembly_name", # Assembly name
- "authors", # Publication authors
- "title", # Publication title
- "journal", # Publication journal
- "pubmed_id", # PubMed ID
- "reference_count", # Number of references
- "comment", # Additional comments
+ "definition", # GenBank sequence definition
+ "strain", # Strain information
+ "isolation_source", # Source of isolation
+ "create_date", # GenBank creation date
+ "update_date", # GenBank update date
+ "assembly_name", # Assembly name
+ "authors", # Publication authors
+ "title", # Publication title
+ "journal", # Publication journal
+ "pubmed_id", # PubMed ID
+ "reference_count", # Number of references
+ "comment", # Additional comments
]
logger.debug("Using column order: %s", columns)
@@ -5263,63 +5554,54 @@ def save_metadata_to_csv(filtered_metadata, protein_headers, output_metadata_fil
# Process metadata in chunks for memory efficiency on large datasets
total_records = len(filtered_metadata)
chunk_size = METADATA_CSV_CHUNK_SIZE
-
+
logger.info("Processing %d metadata records (chunk_size=%d)...", total_records, chunk_size)
-
+
def _build_row(i, metadata):
"""Build a single row dictionary from metadata."""
return {
# Primary identifiers
"accession": metadata.get("accession", pd.NA),
"Organism Name": metadata.get("virus", {}).get("organism_name", pd.NA),
-
# Database and submission information
"GenBank/RefSeq": metadata.get("sourceDatabase", pd.NA),
- "Submitters": ", ".join(metadata.get("submitter", {}).get("names", [])) if metadata.get("submitter", {}).get("names") else pd.NA,
+ "Submitters": ", ".join(metadata.get("submitter", {}).get("names", []))
+ if metadata.get("submitter", {}).get("names")
+ else pd.NA,
"Organization": metadata.get("submitter", {}).get("affiliation", pd.NA),
"Submitter Country": metadata.get("submitter", {}).get("country", pd.NA),
"Release date": metadata.get("releaseDate", "").split("T")[0] if metadata.get("releaseDate") else pd.NA,
-
# Sample and isolate information
"Isolate": metadata.get("isolate", {}).get("name", pd.NA),
"Sample Name": metadata.get("isolate", {}).get("name", pd.NA),
-
# Virus classification
"Virus Lineage": metadata.get("virus", {}).get("lineage", []),
-
# Sequence characteristics
"Length": metadata.get("length", pd.NA),
"Nuc Completeness": metadata.get("completeness", pd.NA),
"Proteins/Segments": protein_headers[i] if i < len(protein_headers) else pd.NA,
- "Segment": metadata.get("segment", pd.NA),
+ "Segment": metadata.get("segment", pd.NA),
"Is Vaccine Strain": metadata.get("isVaccineStrain", metadata.get("is_vaccine_strain", pd.NA)),
-
# Geographic information
"Geographic Region": metadata.get("region", pd.NA),
"Geographic Location": metadata.get("location", pd.NA),
-
# Host information
"Host": metadata.get("host", {}).get("organism_name", pd.NA),
"Host Lineage": metadata.get("host", {}).get("lineage", []),
"Lab Host": metadata.get("labHost", pd.NA),
-
# Sample source information
"Tissue/Specimen/Source": metadata.get("isolate", {}).get("source", pd.NA),
"Collection Date": metadata.get("isolate", {}).get("collection_date", pd.NA),
-
# Annotation and quality information
"Annotated": metadata.get("isAnnotated", pd.NA),
-
# Associated database records
"SRA Accessions": metadata.get("sraAccessions", []),
"Bioprojects": metadata.get("bioprojects", []),
"Biosample": metadata.get("biosample", pd.NA),
-
# Counts
"Gene count": metadata.get("geneCount"),
"Protein count": metadata.get("proteinCount"),
"Mature Peptide Count": metadata.get("maturePeptideCount"),
-
# GenBank-specific columns (not available from NCBI API metadata)
"definition": pd.NA,
"strain": pd.NA,
@@ -5339,32 +5621,32 @@ def _build_row(i, metadata):
try:
first_chunk = True
rows_written = 0
-
+
for chunk_start in range(0, total_records, chunk_size):
chunk_end = min(chunk_start + chunk_size, total_records)
chunk_data = []
-
+
for i in range(chunk_start, chunk_end):
chunk_data.append(_build_row(i, filtered_metadata[i]))
-
+
df_chunk = pd.DataFrame(chunk_data, columns=columns)
-
+
# First chunk writes header, subsequent chunks append without header
if first_chunk:
- df_chunk.to_csv(output_metadata_file, index=False, mode='w')
+ df_chunk.to_csv(output_metadata_file, index=False, mode="w")
first_chunk = False
else:
- df_chunk.to_csv(output_metadata_file, index=False, mode='a', header=False)
-
+ df_chunk.to_csv(output_metadata_file, index=False, mode="a", header=False)
+
rows_written += len(df_chunk)
-
+
# Free chunk memory
del chunk_data
del df_chunk
-
+
if total_records > chunk_size:
logger.debug("CSV progress: %d/%d rows written", rows_written, total_records)
-
+
logger.info("Successfully saved metadata CSV to: %s", output_metadata_file)
logger.debug("CSV file contains %d rows and %d columns", rows_written, len(columns))
except Exception as e:
@@ -5373,27 +5655,27 @@ def _build_row(i, metadata):
def check_min_max(min_val, max_val, filtername, date=False):
- """
- Validate that minimum and maximum values are in the correct order.
-
+ """Validate that minimum and maximum values are in the correct order.
+
Args:
min_val: Minimum value (can be numeric or date string).
max_val: Maximum value (can be numeric or date string).
filtername (str): Name of the filter for error reporting.
date (bool): Whether the values are dates that need parsing.
-
- Raises:
+
+ Raises
+ ------
ValueError: If minimum value is greater than maximum value
-
+
Example:
check_min_max(100, 50, "sequence length") # Raises ValueError
check_min_max(100, 200, "sequence length") # No error
+
"""
# Only perform validation if both values are provided
if min_val is not None and max_val is not None:
- logger.debug("Validating min/max values for %s: min=%s, max=%s",
- filtername, min_val, max_val)
-
+ logger.debug("Validating min/max values for %s: min=%s, max=%s", filtername, min_val, max_val)
+
if date:
try:
min_val = _parse_date(min_val)
@@ -5402,19 +5684,20 @@ def check_min_max(min_val, max_val, filtername, date=False):
except Exception as e:
logger.error("❌ Failed to parse dates for validation: %s", e)
raise ValueError(f"Invalid date format in {filtername} filters") from e
-
+
if min_val > max_val:
error_msg = f"Min value ({min_val}) cannot be greater than max value ({max_val}) for {filtername}."
logger.error("❌ Validation failed: %s", error_msg)
raise ValueError(error_msg)
-
+
logger.debug("Min/max validation passed for %s", filtername)
# =============================================================================
-# ESEARCH PRE-FILTERING
+# ESEARCH PRE-FILTERING
# =============================================================================
+
def _esearch_prefilter_genbank(
virus_taxid,
metadata_filtered_accessions,
@@ -5426,13 +5709,12 @@ def _esearch_prefilter_genbank(
max_seq_length=None,
api_key=None,
):
- """
- Use NCBI ESearch to pre-filter accessions BEFORE fetching full GenBank XML.
-
+ """Use NCBI ESearch to pre-filter accessions BEFORE fetching full GenBank XML.
+
Instead of fetching full GenBank XML for ALL metadata-filtered accessions, we use ESearch to find only the subset that might pass the GenBank- dependent filters. This typically reduces the set from hundreds of thousands to a few hundred, making the GenBank fetch near-instant.
-
+
Strategy: Build an ESearch query that is a SUPERSET of what the exact GenBank XML parsing will select. Use broad matching to avoid false negatives. Then intersect with the metadata-filtered accessions and fetch GenBank XML only for the intersection.
-
+
Args:
virus_taxid (str/int): NCBI taxonomy ID for the virus (e.g. 11676 for HIV-1).
metadata_filtered_accessions (list): Accessions that passed metadata filters.
@@ -5443,37 +5725,39 @@ def _esearch_prefilter_genbank(
min_seq_length (int, optional): Minimum sequence length.
max_seq_length (int, optional): Maximum sequence length.
api_key (str, optional): NCBI API key for higher rate limits.
-
- Returns:
+
+ Returns
+ -------
set: Set of accession numbers that might pass GenBank filters,
or None if pre-filtering couldn't be performed (fall back to full fetch).
+
"""
# Build ESearch query terms
query_parts = []
-
+
# Always include organism constraint
if virus_taxid:
query_parts.append(f"txid{virus_taxid}[Organism]")
-
+
# NOTE: We deliberately do NOT use "proviral" as an ESearch term here.
# ESearch free-text "proviral" only matches sequences that contain the WORD
- # "proviral" in their GenBank text (title, comment, keywords). Many sequences that ARE proviral (especially complete genomes like AF004394.1, U69593.1) do NOT contain this word in their text. The NCBI Virus web interface uses a different mechanism (structured metadata/mol_type qualifier) to identify proviral sequences.
+ # "proviral" in their GenBank text (title, comment, keywords). Many sequences that ARE proviral (especially complete genomes like AF004394.1, U69593.1) do NOT contain this word in their text. The NCBI Virus web interface uses a different mechanism (structured metadata/mol_type qualifier) to identify proviral sequences.
# The provirus filter is correctly applied later in filter_genbank_metadata() by checking the actual GenBank XML features/qualifiers.
-
+
# Has_proteins filter: search for protein name
if has_proteins:
# Use broad "All Fields" matching for protein name to ensure superset
# The exact filtering will happen later on the GenBank XML
query_parts.append(f'"{has_proteins}"')
-
+
# Genotype filter: search for genotype string
if genotype:
query_parts.append(f'"{genotype}"')
-
+
# Molecule type filter
if gen_mol_type:
query_parts.append(f'"{gen_mol_type}"[Molecule Type]')
-
+
# Sequence length filter
if min_seq_length and max_seq_length:
query_parts.append(f"{min_seq_length}:{max_seq_length}[SLEN]")
@@ -5481,107 +5765,111 @@ def _esearch_prefilter_genbank(
query_parts.append(f"{min_seq_length}:99999999[SLEN]")
elif max_seq_length:
query_parts.append(f"1:{max_seq_length}[SLEN]")
-
+
# Need at least organism + one GenBank filter for pre-filtering to be useful
if len(query_parts) < 2:
logger.info("ESearch pre-filter: not enough filter criteria for useful pre-filtering")
return None
-
+
search_query = " AND ".join(query_parts)
logger.info("ESearch pre-filter query: %s", search_query)
-
+
try:
# Step 1: ESearch to get count and WebEnv
params = {
- 'db': 'nucleotide',
- 'term': search_query,
- 'retmax': 0,
- 'usehistory': 'y',
+ "db": "nucleotide",
+ "term": search_query,
+ "retmax": 0,
+ "usehistory": "y",
}
if api_key:
- params['api_key'] = api_key
-
- response = requests.get(NCBI_EUTILS_BASE_ESEARCH, params=params, timeout=60,
- headers={'User-Agent': 'gget/1.0'})
+ params["api_key"] = api_key
+
+ response = requests.get(NCBI_EUTILS_BASE_ESEARCH, params=params, timeout=60, headers={"User-Agent": "gget/1.0"})
response.raise_for_status()
root = ET.fromstring(response.text)
-
- count_elem = root.find('.//Count')
- web_env_elem = root.find('.//WebEnv')
- query_key_elem = root.find('.//QueryKey')
-
+
+ count_elem = root.find(".//Count")
+ web_env_elem = root.find(".//WebEnv")
+ query_key_elem = root.find(".//QueryKey")
+
if count_elem is None or web_env_elem is None:
logger.warning("ESearch pre-filter: could not parse response")
return None
-
+
total_count = int(count_elem.text)
web_env = web_env_elem.text
query_key = query_key_elem.text
-
+
logger.info("ESearch pre-filter: found %d accessions matching GenBank criteria", total_count)
-
+
# If the pre-filter results are TOO large (>50K), it's not useful
# Fall back to the full fetch method
if total_count > 50000:
- logger.info("ESearch pre-filter: %d results is too large for effective pre-filtering (>50K), skipping", total_count)
+ logger.info(
+ "ESearch pre-filter: %d results is too large for effective pre-filtering (>50K), skipping", total_count
+ )
return None
-
+
if total_count == 0:
logger.info("ESearch pre-filter: NO accessions match the GenBank criteria")
return set()
-
+
# Step 2: Fetch all matching accessions using EFetch rettype=acc (lightweight!)
all_esearch_accessions = []
batch_size = 10000
-
+
for retstart in range(0, total_count, batch_size):
time.sleep(0.35 if not api_key else 0.1)
-
+
fetch_params = {
- 'db': 'nucleotide',
- 'WebEnv': web_env,
- 'query_key': query_key,
- 'retmax': batch_size,
- 'retstart': retstart,
- 'rettype': 'acc',
- 'retmode': 'text',
+ "db": "nucleotide",
+ "WebEnv": web_env,
+ "query_key": query_key,
+ "retmax": batch_size,
+ "retstart": retstart,
+ "rettype": "acc",
+ "retmode": "text",
}
if api_key:
- fetch_params['api_key'] = api_key
-
- resp = requests.get(NCBI_EUTILS_BASE_EFETCH, params=fetch_params, timeout=120,
- headers={'User-Agent': 'gget/1.0'})
+ fetch_params["api_key"] = api_key
+
+ resp = requests.get(
+ NCBI_EUTILS_BASE_EFETCH, params=fetch_params, timeout=120, headers={"User-Agent": "gget/1.0"}
+ )
resp.raise_for_status()
-
- batch_accs = [a.strip() for a in resp.text.strip().split('\n') if a.strip()]
+
+ batch_accs = [a.strip() for a in resp.text.strip().split("\n") if a.strip()]
all_esearch_accessions.extend(batch_accs)
- logger.debug("ESearch pre-filter: fetched %d accessions (retstart=%d)",
- len(batch_accs), retstart)
-
+ logger.debug("ESearch pre-filter: fetched %d accessions (retstart=%d)", len(batch_accs), retstart)
+
logger.info("ESearch pre-filter: retrieved %d accession numbers total", len(all_esearch_accessions))
-
+
# Step 3: Intersect with our metadata-filtered accessions
esearch_set = set(all_esearch_accessions)
metadata_set = set(metadata_filtered_accessions)
intersection = esearch_set & metadata_set
-
+
logger.info("ESearch pre-filter RESULTS:")
logger.info(" ESearch matches: %d", len(esearch_set))
logger.info(" Metadata-filtered: %d", len(metadata_set))
logger.info(" Intersection (candidates): %d", len(intersection))
- logger.info(" Reduction: %.1f%% (from %d to %d accessions for GenBank fetch)",
- (1 - len(intersection) / len(metadata_set)) * 100 if metadata_set else 0,
- len(metadata_set), len(intersection))
-
+ logger.info(
+ " Reduction: %.1f%% (from %d to %d accessions for GenBank fetch)",
+ (1 - len(intersection) / len(metadata_set)) * 100 if metadata_set else 0,
+ len(metadata_set),
+ len(intersection),
+ )
+
return intersection
-
+
except requests.exceptions.RequestException as e:
logger.warning("ESearch pre-filter failed (network error): %s. Falling back to full fetch.", e)
return None
except ET.ParseError as e:
logger.warning("ESearch pre-filter failed (parse error): %s. Falling back to full fetch.", e)
return None
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("ESearch pre-filter failed (unexpected): %s. Falling back to full fetch.", e)
return None
@@ -5590,82 +5878,79 @@ def _esearch_prefilter_genbank(
# EPOST + EFETCH HELPER FUNCTIONS (NCBI-recommended for large datasets)
# =============================================================================
+
def _epost_accessions(accessions, api_key=None):
- """
- Upload accession numbers to NCBI History Server using EPost.
-
+ """Upload accession numbers to NCBI History Server using EPost.
+
EPost allows uploading large numbers of UIDs to the server, which assigns
them a WebEnv and query_key for subsequent EFetch requests. This avoids
URL length limitations that restrict direct efetch calls to ~200 accessions.
-
+
Args:
accessions (list): List of accession numbers to upload.
api_key (str, optional): NCBI API key for higher rate limits.
-
- Returns:
+
+ Returns
+ -------
tuple: (web_env, query_key) for use in subsequent EFetch calls,
or (None, None) if upload failed.
+
"""
logger.info("Uploading %d accessions to NCBI History Server via EPost...", len(accessions))
-
+
epost_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi"
-
+
# For EPost, we send IDs in the POST body to avoid URL length limits
# Can handle tens of thousands of accessions in one request
accession_string = ",".join(accessions)
-
+
params = {
- 'db': 'nucleotide',
+ "db": "nucleotide",
}
if api_key:
- params['api_key'] = api_key
-
+ params["api_key"] = api_key
+
# POST body contains the accession list
- data = {
- 'id': accession_string
- }
-
- headers = {'User-Agent': 'gget/1.0'}
-
+ data = {"id": accession_string}
+
+ headers = {"User-Agent": "gget/1.0"}
+
try:
# Make POST request with accessions in body
- response = requests.post(
- epost_url,
- params=params,
- data=data,
- headers=headers,
- timeout=EUTILS_TIMEOUT
- )
+ response = requests.post(epost_url, params=params, data=data, headers=headers, timeout=EUTILS_TIMEOUT)
response.raise_for_status()
-
+
# Parse the XML response to extract WebEnv and query_key
# Example response:
#
# 1
# NCID_01_...
#
-
+
root = ET.fromstring(response.text)
-
- query_key_elem = root.find('.//QueryKey')
- web_env_elem = root.find('.//WebEnv')
-
+
+ query_key_elem = root.find(".//QueryKey")
+ web_env_elem = root.find(".//WebEnv")
+
if query_key_elem is not None and web_env_elem is not None:
query_key = query_key_elem.text
web_env = web_env_elem.text
- logger.info("✅ EPost successful: QueryKey=%s, WebEnv=%s...",
- query_key, web_env[:30] if len(web_env) > 30 else web_env)
+ logger.info(
+ "✅ EPost successful: QueryKey=%s, WebEnv=%s...",
+ query_key,
+ web_env[:30] if len(web_env) > 30 else web_env,
+ )
return web_env, query_key
else:
# Check for error message
- error_elem = root.find('.//ERROR')
+ error_elem = root.find(".//ERROR")
if error_elem is not None:
logger.error("❌ EPost error: %s", error_elem.text)
else:
logger.error("❌ EPost failed: Could not parse WebEnv/QueryKey from response")
logger.debug("Response: %s", response.text[:500])
return None, None
-
+
except requests.exceptions.RequestException as e:
logger.error("❌ EPost request failed: %s", e)
return None, None
@@ -5675,13 +5960,12 @@ def _epost_accessions(accessions, api_key=None):
def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, failed_log_path=None):
- """
- Fetch GenBank records using History Server reference (WebEnv/query_key).
-
+ """Fetch GenBank records using History Server reference (WebEnv/query_key).
+
This is the NCBI-recommended method for large datasets. After uploading UIDs
via EPost, use this function to retrieve records in batches using pagination
(retstart/retmax).
-
+
Args:
web_env (str): WebEnv string from EPost.
query_key (str): Query key from EPost.
@@ -5689,25 +5973,26 @@ def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, fai
retmax (int): Maximum number of records to retrieve in this batch.
api_key (str, optional): NCBI API key for higher rate limits.
failed_log_path (str, optional): Path to log failed requests.
-
- Returns:
+
+ Returns
+ -------
tuple: (metadata_dict, xml_text) where metadata_dict maps accessions to
parsed metadata, and xml_text is the raw XML response.
+
"""
-
params = {
- 'db': 'nucleotide',
- 'WebEnv': web_env,
- 'query_key': query_key,
- 'retstart': retstart,
- 'retmax': retmax,
- 'rettype': 'gb',
- 'retmode': 'xml',
- 'complexity': GENBANK_COMPLEXITY,
+ "db": "nucleotide",
+ "WebEnv": web_env,
+ "query_key": query_key,
+ "retstart": retstart,
+ "retmax": retmax,
+ "rettype": "gb",
+ "retmode": "xml",
+ "complexity": GENBANK_COMPLEXITY,
}
if api_key:
- params['api_key'] = api_key
-
+ params["api_key"] = api_key
+
# Create a requests.Session with retry logic
session = requests.Session()
try:
@@ -5715,7 +6000,7 @@ def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, fai
total=GENBANK_RETRY_ATTEMPTS,
backoff_factor=HTTP_INITIAL_BACKOFF,
status_forcelist=HTTP_RETRY_STATUS_CODES,
- allowed_methods=frozenset(['GET', 'POST'])
+ allowed_methods=frozenset(["GET", "POST"]),
)
except TypeError:
# Fallback for older urllib3 versions
@@ -5723,44 +6008,45 @@ def _efetch_with_history(web_env, query_key, retstart, retmax, api_key=None, fai
total=GENBANK_RETRY_ATTEMPTS,
backoff_factor=HTTP_INITIAL_BACKOFF,
status_forcelist=HTTP_RETRY_STATUS_CODES,
- method_whitelist=frozenset(['GET', 'POST'])
+ method_whitelist=frozenset(["GET", "POST"]),
)
-
+
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)
-
- headers = {'Connection': 'close', 'User-Agent': 'gget/1.0'}
-
+
+ headers = {"Connection": "close", "User-Agent": "gget/1.0"}
+
max_attempts = HTTP_MAX_LOCAL_RETRIES
attempt = 0
backoff = HTTP_INITIAL_BACKOFF
-
+
while attempt < max_attempts:
try:
- logger.debug("EFetch with history: retstart=%d, retmax=%d (attempt %d)",
- retstart, retmax, attempt + 1)
-
+ logger.debug("EFetch with history: retstart=%d, retmax=%d (attempt %d)", retstart, retmax, attempt + 1)
+
response = session.get(NCBI_EUTILS_BASE_EFETCH, params=params, timeout=EUTILS_TIMEOUT, headers=headers)
response.raise_for_status()
-
+
# Verify we got XML data
- if not response.text.strip().startswith(' 100 # Use EPost for any significant number
optimized_batch_size = 500 if use_epost_method else batch_size # EPost method allows larger batches
-
+
# Optimize delay based on API key: 10 req/sec with key vs 3 req/sec without
# With API key: 0.1s delay allows ~10 req/sec
# Without API key: 0.35s delay allows ~3 req/sec
effective_delay = 0.1 if api_key else delay
- logger.info("Using delay of %.2fs between requests (API key: %s)",
- effective_delay, "yes" if api_key else "no")
-
+ logger.info("Using delay of %.2fs between requests (API key: %s)", effective_delay, "yes" if api_key else "no")
+
try:
# Open temp file for incremental XML writing
- xml_file = open(temp_xml_path, 'w', encoding='utf-8')
+ xml_file = open(temp_xml_path, "w", encoding="utf-8")
xml_file.write("\n")
-
+
# Open temp JSONL file for incremental metadata writing
- metadata_jsonl_file = open(temp_metadata_jsonl_path, 'w', encoding='utf-8')
-
+ metadata_jsonl_file = open(temp_metadata_jsonl_path, "w", encoding="utf-8")
+
if use_epost_method:
# ===== EPost + EFetch with History Server =====
EPOST_CHUNK_SIZE = 2000 # Tuned to avoid History Server session timeout
-
+
if len(accessions) > EPOST_CHUNK_SIZE:
- epost_chunks = [accessions[i:i + EPOST_CHUNK_SIZE]
- for i in range(0, len(accessions), EPOST_CHUNK_SIZE)]
+ epost_chunks = [
+ accessions[i : i + EPOST_CHUNK_SIZE] for i in range(0, len(accessions), EPOST_CHUNK_SIZE)
+ ]
else:
epost_chunks = [accessions]
-
- logger.info("Using optimized EPost+EFetch workflow (efetch_batch=%d, epost_chunks=%d of up to %d each)",
- optimized_batch_size, len(epost_chunks), EPOST_CHUNK_SIZE)
-
+
+ logger.info(
+ "Using optimized EPost+EFetch workflow (efetch_batch=%d, epost_chunks=%d of up to %d each)",
+ optimized_batch_size,
+ len(epost_chunks),
+ EPOST_CHUNK_SIZE,
+ )
+
# Calculate total EFetch batches across all EPost chunks for progress tracking
overall_batch_num = 0
total_batches_all_chunks = sum(
- (len(chunk) + optimized_batch_size - 1) // optimized_batch_size
- for chunk in epost_chunks
+ (len(chunk) + optimized_batch_size - 1) // optimized_batch_size for chunk in epost_chunks
)
-
+
# Determine GC and memory logging frequency based on total batches
gc_frequency = max(20, total_batches_all_chunks // 20)
memory_log_frequency = max(50, total_batches_all_chunks // 10)
-
+
epost_failures = [] # Track chunks that fail EPost for direct-URL fallback
-
+
for chunk_idx, chunk_accessions in enumerate(epost_chunks):
- logger.info("EPost chunk %d/%d: uploading %d accessions to History Server...",
- chunk_idx + 1, len(epost_chunks), len(chunk_accessions))
-
+ logger.info(
+ "EPost chunk %d/%d: uploading %d accessions to History Server...",
+ chunk_idx + 1,
+ len(epost_chunks),
+ len(chunk_accessions),
+ )
+
# Step 1: Upload this chunk to NCBI History Server via EPost
web_env, query_key = _epost_accessions(chunk_accessions, api_key=api_key)
-
+
if web_env and query_key:
- logger.info("✅ EPost chunk %d/%d successful: uploaded %d accessions",
- chunk_idx + 1, len(epost_chunks), len(chunk_accessions))
-
+ logger.info(
+ "✅ EPost chunk %d/%d successful: uploaded %d accessions",
+ chunk_idx + 1,
+ len(epost_chunks),
+ len(chunk_accessions),
+ )
+
# Step 2: Fetch data for this chunk using WebEnv/query_key with pagination
chunk_total = len(chunk_accessions)
num_batches = (chunk_total + optimized_batch_size - 1) // optimized_batch_size
-
+
for batch_num in range(num_batches):
overall_batch_num += 1
retstart = batch_num * optimized_batch_size
- logger.info("Processing GenBank batch %d/%d (chunk %d/%d, retstart=%d, retmax=%d)",
- overall_batch_num, total_batches_all_chunks,
- chunk_idx + 1, len(epost_chunks),
- retstart, optimized_batch_size)
-
+ logger.info(
+ "Processing GenBank batch %d/%d (chunk %d/%d, retstart=%d, retmax=%d)",
+ overall_batch_num,
+ total_batches_all_chunks,
+ chunk_idx + 1,
+ len(epost_chunks),
+ retstart,
+ optimized_batch_size,
+ )
+
try:
batch_metadata, batch_xml_text = _efetch_with_history(
web_env=web_env,
@@ -5926,9 +6237,9 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
retstart=retstart,
retmax=optimized_batch_size,
api_key=api_key,
- failed_log_path=failed_log_path
+ failed_log_path=failed_log_path,
)
-
+
if batch_metadata:
# Stream parsed metadata to temp JSONL (not held in RAM)
for acc, meta in batch_metadata.items():
@@ -5940,7 +6251,7 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
del batch_metadata
else:
batch_count = 0
-
+
if batch_xml_text:
cleaned_xml = _clean_xml_declarations(batch_xml_text)
xml_file.write(cleaned_xml + "\n")
@@ -5948,56 +6259,64 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
xml_written = True
del batch_xml_text
del cleaned_xml
- logger.info("Batch %d/%d: Successfully retrieved metadata for %d accessions",
- overall_batch_num, total_batches_all_chunks,
- batch_count)
+ logger.info(
+ "Batch %d/%d: Successfully retrieved metadata for %d accessions",
+ overall_batch_num,
+ total_batches_all_chunks,
+ batch_count,
+ )
else:
- logger.warning("Batch %d/%d returned no data",
- overall_batch_num, total_batches_all_chunks)
+ logger.warning(
+ "Batch %d/%d returned no data", overall_batch_num, total_batches_all_chunks
+ )
# Track failed accessions for potential retry
batch_start = batch_num * optimized_batch_size
batch_end = min(batch_start + optimized_batch_size, chunk_total)
failed_batches.append(chunk_accessions[batch_start:batch_end])
-
+
# Periodic garbage collection
if overall_batch_num % gc_frequency == 0:
_force_garbage_collection(f"after batch {overall_batch_num}/{total_batches_all_chunks}")
-
+
# Periodic memory logging
if overall_batch_num % memory_log_frequency == 0:
_log_memory_usage(f"GenBank batch {overall_batch_num}/{total_batches_all_chunks}")
-
+
# Delay between requests (respect NCBI rate limits)
if overall_batch_num < total_batches_all_chunks and effective_delay > 0:
time.sleep(effective_delay)
-
- except Exception as e:
- logger.error("⚠️ Batch %d/%d failed: %s",
- overall_batch_num, total_batches_all_chunks, e)
+
+ except Exception as e: # noqa: BLE001
+ logger.error("⚠️ Batch %d/%d failed: %s", overall_batch_num, total_batches_all_chunks, e)
batch_start = batch_num * optimized_batch_size
batch_end = min(batch_start + optimized_batch_size, chunk_total)
failed_batches.append(chunk_accessions[batch_start:batch_end])
continue
-
+
# Brief delay between EPost chunks to be respectful to NCBI
if chunk_idx < len(epost_chunks) - 1:
time.sleep(1.0)
-
+
else:
- logger.warning("EPost chunk %d/%d failed, will use direct fetch for %d accessions",
- chunk_idx + 1, len(epost_chunks), len(chunk_accessions))
+ logger.warning(
+ "EPost chunk %d/%d failed, will use direct fetch for %d accessions",
+ chunk_idx + 1,
+ len(epost_chunks),
+ len(chunk_accessions),
+ )
epost_failures.extend(chunk_accessions)
-
+
# If any EPost chunks failed entirely, fall back to direct URL method for those
if epost_failures:
- logger.info("Falling back to direct URL method for %d accessions from failed EPost chunks",
- len(epost_failures))
- direct_batches = [epost_failures[i:i + batch_size]
- for i in range(0, len(epost_failures), batch_size)]
+ logger.info(
+ "Falling back to direct URL method for %d accessions from failed EPost chunks", len(epost_failures)
+ )
+ direct_batches = [epost_failures[i : i + batch_size] for i in range(0, len(epost_failures), batch_size)]
for dbatch_num, dbatch_accessions in enumerate(direct_batches, 1):
try:
batch_metadata, batch_xml_text = _fetch_genbank_batch(
- dbatch_accessions, failed_log_path=failed_log_path)
+ dbatch_accessions, failed_log_path=failed_log_path
+ )
if batch_metadata:
for acc, meta in batch_metadata.items():
metadata_jsonl_file.write(json.dumps({"accession": acc, "metadata": meta}) + "\n")
@@ -6014,42 +6333,46 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
del cleaned_xml
if dbatch_num < len(direct_batches) and effective_delay > 0:
time.sleep(effective_delay)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.error("⚠️ Direct fallback batch %d failed: %s", dbatch_num, e)
failed_batches.append(dbatch_accessions)
continue
-
+
# Check if we got ANY data from EPost method
if total_metadata_written == 0 and not epost_failures:
logger.warning("EPost method returned no data, falling back to direct fetch method")
use_epost_method = False # Fall through to traditional method
-
+
if not use_epost_method:
# ===== FALLBACK METHOD: Traditional direct URL batching =====
logger.info("Using traditional direct URL method (batch_size=%d)", batch_size)
-
+
# Split accessions into batches to avoid URL length limits
if len(accessions) > batch_size:
- batches = [accessions[i:i + batch_size] for i in range(0, len(accessions), batch_size)]
- logger.info("Processing %d accessions in %d batches of size %d",
- len(accessions), len(batches), batch_size)
+ batches = [accessions[i : i + batch_size] for i in range(0, len(accessions), batch_size)]
+ logger.info(
+ "Processing %d accessions in %d batches of size %d", len(accessions), len(batches), batch_size
+ )
else:
batches = [accessions]
logger.info("Processing %d accessions in 1 batch", len(accessions))
-
+
# Determine GC and memory logging frequency based on total batches
gc_frequency = max(50, len(batches) // 20) # GC roughly every 5% of batches
memory_log_frequency = max(100, len(batches) // 10) # Log memory every 10%
-
+
# Process each batch
for batch_num, batch_accessions in enumerate(batches, 1):
- logger.info("Processing GenBank batch %d/%d (%d accessions)",
- batch_num, len(batches), len(batch_accessions))
-
+ logger.info(
+ "Processing GenBank batch %d/%d (%d accessions)", batch_num, len(batches), len(batch_accessions)
+ )
+
try:
# Fetch GenBank XML data using E-utilities efetch
- batch_metadata, batch_xml_text = _fetch_genbank_batch(batch_accessions, failed_log_path=failed_log_path)
-
+ batch_metadata, batch_xml_text = _fetch_genbank_batch(
+ batch_accessions, failed_log_path=failed_log_path
+ )
+
if batch_metadata:
# Stream parsed metadata to temp JSONL (not held in RAM)
for acc, meta in batch_metadata.items():
@@ -6068,37 +6391,38 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
xml_file.write(cleaned_xml + "\n")
xml_file.flush() # Ensure data is written to disk
xml_written = True
-
+
# Clear batch_xml_text from memory
del batch_xml_text
del cleaned_xml
-
- logger.info("Batch %d: Successfully retrieved metadata for %d accessions",
- batch_num, batch_count)
+
+ logger.info(
+ "Batch %d: Successfully retrieved metadata for %d accessions", batch_num, batch_count
+ )
else:
# Batch failed, add to failed_batches for retry
logger.warning("Batch %d returned no data, will retry later", batch_num)
failed_batches.append(batch_accessions)
-
+
# Periodic garbage collection to prevent memory buildup
if batch_num % gc_frequency == 0:
_force_garbage_collection(f"after batch {batch_num}/{len(batches)}")
-
+
# Periodic memory logging
if batch_num % memory_log_frequency == 0:
_log_memory_usage(f"GenBank batch {batch_num}/{len(batches)}")
-
+
# Add delay between requests to be respectful to NCBI servers
if batch_num < len(batches) and effective_delay > 0:
logger.debug("Adding %.1f second delay before next batch", effective_delay)
time.sleep(effective_delay)
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.error("⚠️ Batch %d failed: %s", batch_num, e)
failed_batches.append(batch_accessions)
logger.info("Added batch %d to retry list", batch_num)
continue
-
+
# Retry failed batches at the end
if failed_batches:
logger.info("Retrying %d failed batches", len(failed_batches))
@@ -6123,24 +6447,29 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
del xml
del cleaned_xml
logger.info("Successfully retried batch with %d accessions", len(batch_accessions))
- except Exception as e:
+ except Exception: # noqa: BLE001
logger.warning("Final retry failed for batch: %s", batch_accessions)
-
+
if retry_success:
- logger.info("Successfully recovered %d/%d failed batches on retry", len(retry_success), len(failed_batches))
+ logger.info(
+ "Successfully recovered %d/%d failed batches on retry", len(retry_success), len(failed_batches)
+ )
# ===== DETECT AND RETRY SILENTLY DROPPED ACCESSIONS =====
# The NCBI history server sometimes silently drops individual accessions from batch responses without raising errors. Detect these and retry them individually with direct URL fetch to maximize completeness.
# This also catches accessions lost due to EPost/EFetch position mismatch (server internal ordering differs from posting order) after batch retries.
silently_dropped = set(accessions) - seen_accessions
if silently_dropped:
- logger.info("🔄 Detected %d accessions silently dropped by NCBI history server — retrying with direct fetch",
- len(silently_dropped))
+ logger.info(
+ "🔄 Detected %d accessions silently dropped by NCBI history server — retrying with direct fetch",
+ len(silently_dropped),
+ )
dropped_list = sorted(silently_dropped)
# Use batch size of 200 for direct URL retry (efficient for large sets, small enough to avoid URL length limits ~8KB for 200 accessions)
direct_batch_size = min(200, len(dropped_list))
- direct_batches = [dropped_list[i:i + direct_batch_size]
- for i in range(0, len(dropped_list), direct_batch_size)]
+ direct_batches = [
+ dropped_list[i : i + direct_batch_size] for i in range(0, len(dropped_list), direct_batch_size)
+ ]
recovered_count = 0
for dbatch_num, dbatch_accessions in enumerate(direct_batches, 1):
try:
@@ -6162,31 +6491,32 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
del cleaned_xml
if dbatch_num < len(direct_batches) and effective_delay > 0:
time.sleep(effective_delay)
- except Exception as e:
- logger.warning("Direct retry failed for dropped accessions %s: %s",
- dbatch_accessions, e)
+ except Exception as e: # noqa: BLE001
+ logger.warning("Direct retry failed for dropped accessions %s: %s", dbatch_accessions, e)
if recovered_count:
- logger.info("✅ Recovered %d/%d silently dropped accessions via direct fetch",
- recovered_count, len(silently_dropped))
+ logger.info(
+ "✅ Recovered %d/%d silently dropped accessions via direct fetch",
+ recovered_count,
+ len(silently_dropped),
+ )
else:
- logger.warning("Could not recover any of the %d silently dropped accessions",
- len(silently_dropped))
+ logger.warning("Could not recover any of the %d silently dropped accessions", len(silently_dropped))
# Close XML wrapper
xml_file.write("\n")
xml_file.close()
xml_file = None
-
+
# Close metadata JSONL file
if metadata_jsonl_file is not None:
metadata_jsonl_file.close()
metadata_jsonl_file = None
-
+
# Move temp file to final location
if xml_written:
shutil.move(temp_xml_path, genbank_full_xml_path)
logger.debug("Saved full GenBank XML to: %s", genbank_full_xml_path)
-
+
# Convert XML to CSV (memory-efficient chunked processing)
_log_memory_usage("before XML to CSV conversion")
_genbank_xml_to_csv(genbank_full_xml_path, genbank_full_csv_path)
@@ -6195,7 +6525,7 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
logger.warning("No GenBank XML content retrieved to save")
if os.path.exists(temp_xml_path):
os.remove(temp_xml_path)
-
+
except Exception as e:
logger.error("Error during GenBank metadata fetch: %s", e)
raise
@@ -6218,16 +6548,17 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
except OSError as e:
logger.debug("Failed to clean up temp_xml_path (os.remove): %s", e)
- logger.info("GenBank metadata retrieval complete: %d/%d accessions processed",
- total_metadata_written, len(accessions))
-
+ logger.info(
+ "GenBank metadata retrieval complete: %d/%d accessions processed", total_metadata_written, len(accessions)
+ )
+
# Load all metadata from temp JSONL file into dict for return
# This is fine because GenBank metadata is fetched only for the post-filter subset
# (typically 1K-50K records, not millions)
all_metadata = {}
if os.path.exists(temp_metadata_jsonl_path) and total_metadata_written > 0:
try:
- with open(temp_metadata_jsonl_path, 'r', encoding='utf-8') as f:
+ with open(temp_metadata_jsonl_path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
@@ -6238,23 +6569,23 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
if acc and meta:
all_metadata[acc] = meta
logger.info("Loaded %d GenBank metadata records from temp file", len(all_metadata))
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.error("Failed to load GenBank metadata from temp JSONL: %s", e)
-
+
# Clean up temp JSONL file
if os.path.exists(temp_metadata_jsonl_path):
try:
os.remove(temp_metadata_jsonl_path)
except OSError as e:
logger.debug("Failed to clean up temp_metadata_jsonl_path (os.remove): %s", e)
-
+
# Final memory log and GC
_force_garbage_collection("GenBank fetch complete")
_log_memory_usage("GenBank fetch complete")
if not all_metadata:
logger.warning("No GenBank metadata was successfully retrieved")
-
+
missing_accessions = set(accessions) - set(all_metadata.keys())
if missing_accessions:
logger.info("❌ The following accessions could not be downloaded:")
@@ -6264,39 +6595,39 @@ def fetch_genbank_metadata(accessions, genbank_full_xml_path, genbank_full_csv_p
if len(missing_accessions) > 10:
logger.info(f" ... and {len(missing_accessions) - 10} more")
logger.info(f"A log of failed batches and efetch URLs is saved at: {failed_log_path}")
-
+
# Return both metadata and the path to the failed batches log for summary tracking
return all_metadata, failed_log_path if os.path.exists(failed_log_path) else None
def _fetch_genbank_batch(accessions, failed_log_path=None):
- """
- Fetch GenBank metadata for a single batch of accessions.
-
+ """Fetch GenBank metadata for a single batch of accessions.
+
Includes retry logic with exponential backoff and automatic batch splitting
for problematic requests.
-
+
Args:
accessions (list): List of accession numbers for this batch.
failed_log_path (str, optional): Path to log file for failed batches.
-
- Returns:
+
+ Returns
+ -------
tuple: (metadata_dict, xml_text) where metadata_dict maps accessions to
parsed metadata, and xml_text is the raw XML response.
+
"""
-
# Build E-utilities efetch URL for GenBank XML format
# base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
accession_string = ",".join(accessions)
-
+
params = {
- 'db': 'nucleotide', # Nucleotide database (includes virus sequences)
- 'id': accession_string, # Comma-separated accession numbers
- 'rettype': 'gb', # GenBank format
- 'retmode': 'xml', # XML output for structured parsing
- 'complexity': GENBANK_COMPLEXITY,
+ "db": "nucleotide", # Nucleotide database (includes virus sequences)
+ "id": accession_string, # Comma-separated accession numbers
+ "rettype": "gb", # GenBank format
+ "retmode": "xml", # XML output for structured parsing
+ "complexity": GENBANK_COMPLEXITY,
}
-
+
# Create a requests.Session with urllib3 Retry/HTTPAdapter for robust retries
session = requests.Session()
try:
@@ -6304,7 +6635,7 @@ def _fetch_genbank_batch(accessions, failed_log_path=None):
total=GENBANK_RETRY_ATTEMPTS,
backoff_factor=HTTP_INITIAL_BACKOFF,
status_forcelist=HTTP_RETRY_STATUS_CODES,
- allowed_methods=frozenset(['GET', 'POST'])
+ allowed_methods=frozenset(["GET", "POST"]),
)
except TypeError:
# Fallback for older urllib3 versions that use method_whitelist
@@ -6312,14 +6643,14 @@ def _fetch_genbank_batch(accessions, failed_log_path=None):
total=GENBANK_RETRY_ATTEMPTS,
backoff_factor=HTTP_INITIAL_BACKOFF,
status_forcelist=HTTP_RETRY_STATUS_CODES,
- method_whitelist=frozenset(['GET', 'POST'])
+ method_whitelist=frozenset(["GET", "POST"]),
)
adapter = HTTPAdapter(max_retries=retry_strategy)
session.mount("https://", adapter)
session.mount("http://", adapter)
- headers = {'Connection': 'close', 'User-Agent': 'gget/1.0'}
+ headers = {"Connection": "close", "User-Agent": "gget/1.0"}
# Local retry loop for transient chunk/connection errors with exponential backoff
max_attempts = HTTP_MAX_LOCAL_RETRIES
attempt = 0
@@ -6330,7 +6661,10 @@ def _fetch_genbank_batch(accessions, failed_log_path=None):
try:
logger.debug("Making E-utilities request for %d accessions (attempt %d)", len(accessions), attempt + 1)
logger.debug("Request URL: %s", NCBI_EUTILS_BASE_EFETCH)
- logger.debug("Request parameters: %s", {k: (v[:50] + '...' if isinstance(v, str) and len(v) > 50 else v) for k, v in params.items()})
+ logger.debug(
+ "Request parameters: %s",
+ {k: (v[:50] + "..." if isinstance(v, str) and len(v) > 50 else v) for k, v in params.items()},
+ )
# Use POST instead of GET for EFetch to avoid 414 URI Too Long errors.
# NCBI E-utilities supports POST for all requests, and POST puts the
@@ -6341,7 +6675,7 @@ def _fetch_genbank_batch(accessions, failed_log_path=None):
response.raise_for_status()
# Verify we got XML data
- if not response.text.strip().startswith('>> xml = '\n\n...'
>>> _clean_xml_declarations(xml)
'...'
+
"""
cleaned_lines = []
for line in xml_text.splitlines():
@@ -6445,31 +6787,31 @@ def _clean_xml_declarations(xml_text):
def _local_name(tag):
- """
- Return the local name of an XML tag (strip namespace if present).
-
+ """Return the local name of an XML tag (strip namespace if present).
+
XML tags may include namespace prefixes (e.g., '{http://namespace}TagName').
This helper function extracts just the tag name without the namespace.
-
+
Args:
tag (str): XML tag string, potentially with namespace.
-
- Returns:
+
+ Returns
+ -------
str: Tag name without namespace prefix.
-
+
Example:
- >>> _local_name('{http://www.ncbi.nlm.nih.gov}GBSeq')
+ >>> _local_name("{http://www.ncbi.nlm.nih.gov}GBSeq")
'GBSeq'
- >>> _local_name('GBSeq')
+ >>> _local_name("GBSeq")
'GBSeq'
+
"""
- return tag.split('}')[-1] if '}' in tag else tag
+ return tag.split("}")[-1] if "}" in tag else tag
def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None):
- """
- Convert GenBank XML to CSV with streaming and dynamic qualifier columns.
-
+ """Convert GenBank XML to CSV with streaming and dynamic qualifier columns.
+
Args:
xml_path (str): Path to input GenBank XML file.
csv_path (str): Path to output CSV file.
@@ -6478,16 +6820,15 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None):
# Apply default chunk size if not specified
if chunk_size is None:
chunk_size = GENBANK_XML_CHUNK_SIZE
-
+
qualifier_names = set()
rows = []
header_written = False
- csv_file = open(csv_path, "w", newline='', encoding='utf-8')
- writer = None
+ csv_file = open(csv_path, "w", newline="", encoding="utf-8")
# Stream-parse XML
- for event, elem in ET.iterparse(xml_path, events=("end",)):
+ for _event, elem in ET.iterparse(xml_path, events=("end",)):
if _local_name(elem.tag) == "GBSeq":
# # Skip protein sequences (AA type) - we only want nucleotide sequences
# moltype_elem = elem.findtext(".//GBSeq_moltype", "").strip()
@@ -6495,7 +6836,7 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None):
# logger.debug("Skipping protein sequence (AA type) in XML to CSV conversion")
# elem.clear()
# continue
-
+
accession = elem.findtext(".//GBSeq_accession-version", "").strip()
sequence = elem.findtext(".//GBSeq_sequence", "").strip()
features = elem.findall(".//GBFeature")
@@ -6540,7 +6881,7 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None):
"Interval_to": interval_to,
"Interval_accession": interval_acc,
"order": order_flag,
- "sequence": "" # leave blank for now
+ "sequence": "", # leave blank for now
}
for qn in qualifier_names:
@@ -6579,9 +6920,8 @@ def _genbank_xml_to_csv(xml_path, csv_path, chunk_size=None):
def _save_genbank_xml_and_csv(xml_content, xml_file_name, csv_file_name):
- """
- Save GenBank XML content and convert to CSV.
-
+ """Save GenBank XML content and convert to CSV.
+
Args:
xml_content (str): Raw XML content from E-utilities.
xml_file_name (str): Path for XML output file.
@@ -6608,24 +6948,25 @@ def _save_genbank_xml_and_csv(xml_content, xml_file_name, csv_file_name):
def _parse_genbank_xml(xml_content):
- """
- Parse GenBank XML response and extract high-level metadata fields.
+ """Parse GenBank XML response and extract high-level metadata fields.
+
This function processes the GenBank XML format returned by E-utilities efetch
and extracts key metadata including collection dates, geographic information,
host details, publication references, and sequence features.
-
+
Args:
xml_content (str): Raw XML content from E-utilities efetch.
-
- Returns:
+
+ Returns
+ -------
dict: Dictionary mapping accession numbers to metadata dictionaries
-
+
Note:
Uses xml.etree.ElementTree for parsing to avoid external dependencies.
The GenBank XML schema is documented by NCBI and contains structured
information about sequence records.
+
"""
-
# Parse the XML content
try:
root = ET.fromstring(xml_content)
@@ -6635,127 +6976,127 @@ def _parse_genbank_xml(xml_content):
logger.error("❌ XML parsing failed: %s", e)
logger.debug("XML content preview: %s", xml_content[:500])
raise RuntimeError(f"Invalid XML format in GenBank response: {e}") from e
-
+
metadata_dict = {}
-
+
# Process each GenBank sequence record in the XML
- for gbseq in root.findall('.//GBSeq'):
+ for gbseq in root.findall(".//GBSeq"):
try:
# # Skip protein sequences (AA type) - we only want nucleotide sequences
# moltype_elem = gbseq.find('GBSeq_moltype')
# if moltype_elem is not None and moltype_elem.text == 'AA':
# logger.debug("Skipping protein sequence (AA type)")
# continue
-
+
# Extract accession number as the primary key
- accession_elem = gbseq.find('GBSeq_accession-version')
+ accession_elem = gbseq.find("GBSeq_accession-version")
if accession_elem is None:
- accession_elem = gbseq.find('GBSeq_primary-accession')
-
+ accession_elem = gbseq.find("GBSeq_primary-accession")
+
if accession_elem is None:
logger.warning("Skipping GenBank record without accession number")
continue
-
+
accession = accession_elem.text
logger.debug("Processing GenBank record: %s", accession)
-
+
# Initialize metadata dictionary for this record
metadata = {
- 'accession': accession,
- 'genbank_data': {} # Store GenBank-specific fields
+ "accession": accession,
+ "genbank_data": {}, # Store GenBank-specific fields
}
-
+
# Extract basic sequence information
- length_elem = gbseq.find('GBSeq_length')
- metadata['genbank_data']['sequence_length'] = int(length_elem.text) if length_elem is not None else None
-
- organism_elem = gbseq.find('GBSeq_organism')
- metadata['genbank_data']['organism'] = organism_elem.text if organism_elem is not None else ""
-
- definition_elem = gbseq.find('GBSeq_definition')
- metadata['genbank_data']['definition'] = definition_elem.text if definition_elem is not None else ""
-
+ length_elem = gbseq.find("GBSeq_length")
+ metadata["genbank_data"]["sequence_length"] = int(length_elem.text) if length_elem is not None else None
+
+ organism_elem = gbseq.find("GBSeq_organism")
+ metadata["genbank_data"]["organism"] = organism_elem.text if organism_elem is not None else ""
+
+ definition_elem = gbseq.find("GBSeq_definition")
+ metadata["genbank_data"]["definition"] = definition_elem.text if definition_elem is not None else ""
+
# Extract taxonomy information
- taxonomy_elem = gbseq.find('GBSeq_taxonomy')
- metadata['genbank_data']['taxonomy'] = taxonomy_elem.text if taxonomy_elem is not None else ""
-
+ taxonomy_elem = gbseq.find("GBSeq_taxonomy")
+ metadata["genbank_data"]["taxonomy"] = taxonomy_elem.text if taxonomy_elem is not None else ""
+
# Extract creation and update dates
- create_date_elem = gbseq.find('GBSeq_create-date')
- metadata['genbank_data']['create_date'] = create_date_elem.text if create_date_elem is not None else ""
-
- update_date_elem = gbseq.find('GBSeq_update-date')
- metadata['genbank_data']['update_date'] = update_date_elem.text if update_date_elem is not None else ""
-
+ create_date_elem = gbseq.find("GBSeq_create-date")
+ metadata["genbank_data"]["create_date"] = create_date_elem.text if create_date_elem is not None else ""
+
+ update_date_elem = gbseq.find("GBSeq_update-date")
+ metadata["genbank_data"]["update_date"] = update_date_elem.text if update_date_elem is not None else ""
+
# Extract references (publications)
references = []
- for ref in gbseq.findall('.//GBReference'):
+ for ref in gbseq.findall(".//GBReference"):
ref_data = {}
-
- title_elem = ref.find('GBReference_title')
- ref_data['title'] = title_elem.text if title_elem is not None else ""
-
- authors_elem = ref.find('GBReference_authors')
+
+ title_elem = ref.find("GBReference_title")
+ ref_data["title"] = title_elem.text if title_elem is not None else ""
+
+ authors_elem = ref.find("GBReference_authors")
if authors_elem is not None:
- authors = [a.text for a in authors_elem.findall('GBAuthor') if a.text]
- ref_data['authors'] = ', '.join(authors)
+ authors = [a.text for a in authors_elem.findall("GBAuthor") if a.text]
+ ref_data["authors"] = ", ".join(authors)
else:
- ref_data['authors'] = ""
+ ref_data["authors"] = ""
# ref_data['authors'] = authors_elem.text if authors_elem is not None else ""
-
- journal_elem = ref.find('GBReference_journal')
- ref_data['journal'] = journal_elem.text if journal_elem is not None else ""
-
- pubmed_elem = ref.find('GBReference_pubmed')
- ref_data['pubmed_id'] = pubmed_elem.text if pubmed_elem is not None else ""
-
+
+ journal_elem = ref.find("GBReference_journal")
+ ref_data["journal"] = journal_elem.text if journal_elem is not None else ""
+
+ pubmed_elem = ref.find("GBReference_pubmed")
+ ref_data["pubmed_id"] = pubmed_elem.text if pubmed_elem is not None else ""
+
if any(ref_data.values()): # Only add if we got some reference data
references.append(ref_data)
-
- metadata['genbank_data']['references'] = references
-
+
+ metadata["genbank_data"]["references"] = references
+
# Extract features (collection_date, geographic location, host, etc.)
features_data = {}
gene_count = 0
mature_peptide_count = 0
products = []
-
- for feature in gbseq.findall('.//GBFeature'):
- feature_key_elem = feature.find('GBFeature_key')
+
+ for feature in gbseq.findall(".//GBFeature"):
+ feature_key_elem = feature.find("GBFeature_key")
if feature_key_elem is None:
continue
-
+
feature_key = feature_key_elem.text
-
+
# Count genes and mature peptides
- if feature_key == 'gene':
+ if feature_key == "gene":
gene_count += 1
- elif feature_key == 'mat_peptide':
+ elif feature_key == "mat_peptide":
mature_peptide_count += 1
-
+
# Extract qualifiers for this feature
feature_qualifiers = {}
has_proviral = False
- for qual in feature.findall('.//GBQualifier'):
- qual_name_elem = qual.find('GBQualifier_name')
- qual_value_elem = qual.find('GBQualifier_value')
-
+ for qual in feature.findall(".//GBQualifier"):
+ qual_name_elem = qual.find("GBQualifier_name")
+ qual_value_elem = qual.find("GBQualifier_value")
+
if qual_name_elem is not None:
qual_name = qual_name_elem.text
# Handle qualifiers without values (e.g., proviral)
qual_value = qual_value_elem.text if qual_value_elem is not None else ""
feature_qualifiers[qual_name] = qual_value
-
+
# Track proviral flag (presence indicates proviral)
- if qual_name == 'proviral':
+ if qual_name == "proviral":
has_proviral = True
-
+
# Collect product names for has_proteins filter
- if qual_name == 'product' and qual_value:
+ if qual_name == "product" and qual_value:
products.append(qual_value)
-
+
if has_proviral:
- feature_qualifiers['_has_proviral'] = True
-
+ feature_qualifiers["_has_proviral"] = True
+
if feature_qualifiers:
# Store multiple features of same type (e.g., multiple CDS)
if feature_key not in features_data:
@@ -6764,238 +7105,233 @@ def _parse_genbank_xml(xml_content):
features_data[feature_key].append(feature_qualifiers)
else:
features_data[feature_key] = [features_data[feature_key], feature_qualifiers]
-
+
# Extract specific fields of interest from source feature
- source_feature = features_data.get('source', {})
+ source_feature = features_data.get("source", {})
if isinstance(source_feature, list):
source_feature = source_feature[0] # Use first source if multiple
-
- metadata['genbank_data']['collection_date'] = source_feature.get('collection_date', '')
- metadata['genbank_data']['geographic_location'] = source_feature.get('geo_loc_name', '')
- metadata['genbank_data']['host'] = source_feature.get('host', '')
- metadata['genbank_data']['isolation_source'] = source_feature.get('isolation_source', '')
- metadata['genbank_data']['strain'] = source_feature.get('strain', '')
- metadata['genbank_data']['isolate'] = source_feature.get('isolate', '')
+
+ metadata["genbank_data"]["collection_date"] = source_feature.get("collection_date", "")
+ metadata["genbank_data"]["geographic_location"] = source_feature.get("geo_loc_name", "")
+ metadata["genbank_data"]["host"] = source_feature.get("host", "")
+ metadata["genbank_data"]["isolation_source"] = source_feature.get("isolation_source", "")
+ metadata["genbank_data"]["strain"] = source_feature.get("strain", "")
+ metadata["genbank_data"]["isolate"] = source_feature.get("isolate", "")
# metadata['genbank_data']['collected_by'] = source_feature.get('collected_by', '')
# metadata['genbank_data']['specimen_voucher'] = source_feature.get('specimen_voucher', '')
-
+
# Extract additional GenBank-specific fields for filtering
- metadata['genbank_data']['proviral'] = source_feature.get('_has_proviral', False) or 'proviral' in source_feature
- metadata['genbank_data']['mol_type'] = source_feature.get('mol_type', '')
- metadata['genbank_data']['serotype'] = source_feature.get('serotype', '')
- metadata['genbank_data']['gene_count'] = gene_count
- metadata['genbank_data']['mature_peptide_count'] = mature_peptide_count
- metadata['genbank_data']['products'] = products
-
+ metadata["genbank_data"]["proviral"] = (
+ source_feature.get("_has_proviral", False) or "proviral" in source_feature
+ )
+ metadata["genbank_data"]["mol_type"] = source_feature.get("mol_type", "")
+ metadata["genbank_data"]["serotype"] = source_feature.get("serotype", "")
+ metadata["genbank_data"]["gene_count"] = gene_count
+ metadata["genbank_data"]["mature_peptide_count"] = mature_peptide_count
+ metadata["genbank_data"]["products"] = products
+
# Extract genotype from note field or serotype
- genotype = ''
- note_value = source_feature.get('note', '')
+ genotype = ""
+ note_value = source_feature.get("note", "")
if note_value:
# Look for genotype pattern in note (e.g., "genotype: IV" or "genotype=H5N1")
- genotype_match = re.search(r'genotype[:\s=]+([^\s;,]+)', note_value, re.IGNORECASE)
+ genotype_match = re.search(r"genotype[:\s=]+([^\s;,]+)", note_value, re.IGNORECASE)
if genotype_match:
genotype = genotype_match.group(1).strip()
- if not genotype and source_feature.get('serotype'):
- genotype = source_feature.get('serotype', '')
- metadata['genbank_data']['genotype'] = genotype
-
+ if not genotype and source_feature.get("serotype"):
+ genotype = source_feature.get("serotype", "")
+ metadata["genbank_data"]["genotype"] = genotype
+
# Store all features for potential future use
- metadata['genbank_data']['all_features'] = features_data
-
+ metadata["genbank_data"]["all_features"] = features_data
+
# Extract comment field (often contains additional metadata)
- comment_elem = gbseq.find('GBSeq_comment')
+ comment_elem = gbseq.find("GBSeq_comment")
comment_text = comment_elem.text if comment_elem is not None else ""
- metadata['genbank_data']['comment'] = comment_text
-
+ metadata["genbank_data"]["comment"] = comment_text
+
# Parse assembly name from comment if present (used in some studies)
assembly_name = ""
if comment_text:
- assembly_match = re.search(r'Assembly Name :: (\S+)', comment_text)
+ assembly_match = re.search(r"Assembly Name :: (\S+)", comment_text)
if assembly_match:
assembly_name = assembly_match.group(1)
- metadata['genbank_data']['assembly_name'] = assembly_name
-
+ metadata["genbank_data"]["assembly_name"] = assembly_name
+
# Store the metadata for this accession
metadata_dict[accession] = metadata
-
- logger.debug("Extracted GenBank metadata for %s: organism=%s, collection_date=%s, geographic_location=%s",
- accession,
- metadata['genbank_data']['organism'],
- metadata['genbank_data']['collection_date'],
- metadata['genbank_data']['geographic_location'])
-
- except Exception as e:
- logger.warning("❌ Failed to parse GenBank record %s: %s",
- accession if 'accession' in locals() else 'unknown', e)
+
+ logger.debug(
+ "Extracted GenBank metadata for %s: organism=%s, collection_date=%s, geographic_location=%s",
+ accession,
+ metadata["genbank_data"]["organism"],
+ metadata["genbank_data"]["collection_date"],
+ metadata["genbank_data"]["geographic_location"],
+ )
+
+ except Exception as e: # noqa: BLE001
+ logger.warning(
+ "❌ Failed to parse GenBank record %s: %s", accession if "accession" in locals() else "unknown", e
+ )
continue
-
+
logger.info("✅ Successfully parsed GenBank metadata for %d records", len(metadata_dict))
return metadata_dict
def save_genbank_metadata_to_csv(genbank_metadata, output_file, virus_metadata=None):
- """
- Save GenBank metadata to a CSV file with the same column headers as the standard metadata CSV.
-
+ """Save GenBank metadata to a CSV file with the same column headers as the standard metadata CSV.
+
Args:
genbank_metadata (dict): Dictionary mapping accessions to GenBank metadata
output_file (str): Path to the output CSV file
virus_metadata (list, optional): List of virus metadata dictionaries to merge
-
+
Note:
The CSV format uses the same column headers as save_metadata_to_csv to ensure
consistency between the two output files, making them directly comparable.
"""
-
logger.info("Preparing GenBank metadata for CSV output...")
logger.debug("Processing %d GenBank records", len(genbank_metadata))
-
+
# Use the same column order as save_metadata_to_csv for consistency
columns = [
- "accession", # Primary identifier (lowercase for Delphy compatibility)
- "Organism Name", # Virus species/strain name
- "GenBank/RefSeq", # Source database (GenBank or RefSeq)
- "Submitters", # Names of sequence submitters
- "Organization", # Submitting organization/institution
- "Submitter Country", # Country of submitting organization
- "Release date", # Date when sequence was released to public databases
- "Isolate", # Isolate/sample identifier
- "Virus Lineage", # Taxonomic lineage of the virus
- "Length", # Sequence length in base pairs
- "Nuc Completeness", # Completeness status (complete/partial)
+ "accession", # Primary identifier (lowercase for Delphy compatibility)
+ "Organism Name", # Virus species/strain name
+ "GenBank/RefSeq", # Source database (GenBank or RefSeq)
+ "Submitters", # Names of sequence submitters
+ "Organization", # Submitting organization/institution
+ "Submitter Country", # Country of submitting organization
+ "Release date", # Date when sequence was released to public databases
+ "Isolate", # Isolate/sample identifier
+ "Virus Lineage", # Taxonomic lineage of the virus
+ "Length", # Sequence length in base pairs
+ "Nuc Completeness", # Completeness status (complete/partial)
"Proteins/Segments", # Protein/segment information from FASTA headers
- "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
+ "Segment", # Virus segment identifier (e.g., 'HA', 'NA', '4', '6')
"Is Vaccine Strain", # Whether this sequence is from a vaccine strain
"Geographic Region", # Geographic region where sample was collected
- "Geographic Location",# Specific geographic location
- "Host", # Host organism name
- "Host Lineage", # Taxonomic lineage of host organism
- "Lab Host", # Whether sample was lab-passaged
- "Tissue/Specimen/Source", # Sample source/tissue type
- "Collection Date", # Date when sample was collected
- "Sample Name", # Sample identifier
- "Annotated", # Whether sequence has annotation data
- "SRA Accessions", # Associated SRA (sequencing) accessions
- "Bioprojects", # Associated BioProject identifiers
- "Biosample", # BioSample identifier
- "Protein count", # Number of proteins annotated
- "Gene count", # Number of genes annotated
- "Mature Peptide Count", # Number of mature peptides annotated
+ "Geographic Location", # Specific geographic location
+ "Host", # Host organism name
+ "Host Lineage", # Taxonomic lineage of host organism
+ "Lab Host", # Whether sample was lab-passaged
+ "Tissue/Specimen/Source", # Sample source/tissue type
+ "Collection Date", # Date when sample was collected
+ "Sample Name", # Sample identifier
+ "Annotated", # Whether sequence has annotation data
+ "SRA Accessions", # Associated SRA (sequencing) accessions
+ "Bioprojects", # Associated BioProject identifiers
+ "Biosample", # BioSample identifier
+ "Protein count", # Number of proteins annotated
+ "Gene count", # Number of genes annotated
+ "Mature Peptide Count", # Number of mature peptides annotated
# Additional GenBank columns
- "definition", # GenBank sequence definition
- "strain", # Strain information
- "isolation_source", # Source of isolation
- "create_date", # GenBank creation date
- "update_date", # GenBank update date
- "assembly_name", # Assembly name
- "authors", # Publication authors
- "title", # Publication title
- "journal", # Publication journal
- "pubmed_id", # PubMed ID
- "reference_count", # Number of references
- "comment", # Additional comments
+ "definition", # GenBank sequence definition
+ "strain", # Strain information
+ "isolation_source", # Source of isolation
+ "create_date", # GenBank creation date
+ "update_date", # GenBank update date
+ "assembly_name", # Assembly name
+ "authors", # Publication authors
+ "title", # Publication title
+ "journal", # Publication journal
+ "pubmed_id", # PubMed ID
+ "reference_count", # Number of references
+ "comment", # Additional comments
]
-
+
logger.debug("Using column order: %s", columns)
-
+
# Prepare data for DataFrame creation
data_for_df = []
-
+
for accession, metadata in genbank_metadata.items():
logger.debug("Processing GenBank metadata for accession: %s", accession)
-
- genbank_data = metadata.get('genbank_data', {})
-
+
+ genbank_data = metadata.get("genbank_data", {})
+
# Extract publication information (use first reference if available)
- references = genbank_data.get('references', [])
+ references = genbank_data.get("references", [])
first_ref = references[0] if references else {}
-
+
# Build the row dictionary with the same column structure as save_metadata_to_csv
row = {
# Primary identifier
"accession": accession,
-
# Organism and database information
- "Organism Name": genbank_data.get('organism', pd.NA),
- "GenBank/RefSeq": metadata.get('sourceDatabase', pd.NA),
-
+ "Organism Name": genbank_data.get("organism", pd.NA),
+ "GenBank/RefSeq": metadata.get("sourceDatabase", pd.NA),
# Submission information
- "Submitters": metadata.get('submitter', {}).get('names', []) if metadata.get('submitter', {}).get('names') else pd.NA,
- "Organization": metadata.get('submitter', {}).get('affiliation', pd.NA),
- "Submitter Country": metadata.get('submitter', {}).get('country', pd.NA),
- "Release date": metadata.get('releaseDate', '').split('T')[0] if metadata.get('releaseDate') else pd.NA,
-
+ "Submitters": metadata.get("submitter", {}).get("names", [])
+ if metadata.get("submitter", {}).get("names")
+ else pd.NA,
+ "Organization": metadata.get("submitter", {}).get("affiliation", pd.NA),
+ "Submitter Country": metadata.get("submitter", {}).get("country", pd.NA),
+ "Release date": metadata.get("releaseDate", "").split("T")[0] if metadata.get("releaseDate") else pd.NA,
# Sample and isolate information
- "Isolate": genbank_data.get('isolate', pd.NA),
- "Sample Name": genbank_data.get('isolate', pd.NA),
-
+ "Isolate": genbank_data.get("isolate", pd.NA),
+ "Sample Name": genbank_data.get("isolate", pd.NA),
# Virus classification
- "Virus Lineage": genbank_data.get('taxonomy', pd.NA),
-
+ "Virus Lineage": genbank_data.get("taxonomy", pd.NA),
# Sequence characteristics
- "Length": genbank_data.get('sequence_length', pd.NA),
- "Nuc Completeness": metadata.get('completeness', pd.NA),
+ "Length": genbank_data.get("sequence_length", pd.NA),
+ "Nuc Completeness": metadata.get("completeness", pd.NA),
"Proteins/Segments": pd.NA, # Not available from GenBank XML parsing
- "Segment": metadata.get('segment', pd.NA), # Virus segment identifier
- "Is Vaccine Strain": metadata.get('isVaccineStrain', metadata.get('is_vaccine_strain', pd.NA)),
-
+ "Segment": metadata.get("segment", pd.NA), # Virus segment identifier
+ "Is Vaccine Strain": metadata.get("isVaccineStrain", metadata.get("is_vaccine_strain", pd.NA)),
# Geographic information
- "Geographic Region": metadata.get('region', pd.NA),
- "Geographic Location": genbank_data.get('geographic_location', pd.NA),
-
+ "Geographic Region": metadata.get("region", pd.NA),
+ "Geographic Location": genbank_data.get("geographic_location", pd.NA),
# Host information
- "Host": genbank_data.get('host', pd.NA),
- "Host Lineage": metadata.get('host', {}).get('lineage', []) if isinstance(metadata.get('host'), dict) else pd.NA,
- "Lab Host": metadata.get('labHost', pd.NA),
-
+ "Host": genbank_data.get("host", pd.NA),
+ "Host Lineage": metadata.get("host", {}).get("lineage", [])
+ if isinstance(metadata.get("host"), dict)
+ else pd.NA,
+ "Lab Host": metadata.get("labHost", pd.NA),
# Sample source information
- "Tissue/Specimen/Source": genbank_data.get('isolation_source', pd.NA),
- "Collection Date": genbank_data.get('collection_date', pd.NA),
-
+ "Tissue/Specimen/Source": genbank_data.get("isolation_source", pd.NA),
+ "Collection Date": genbank_data.get("collection_date", pd.NA),
# Annotation and quality information
- "Annotated": metadata.get('isAnnotated', pd.NA),
-
+ "Annotated": metadata.get("isAnnotated", pd.NA),
# Associated database records
- "SRA Accessions": metadata.get('sraAccessions', pd.NA),
- "Bioprojects": metadata.get('bioprojects', pd.NA),
- "Biosample": metadata.get('biosample', pd.NA),
-
+ "SRA Accessions": metadata.get("sraAccessions", pd.NA),
+ "Bioprojects": metadata.get("bioprojects", pd.NA),
+ "Biosample": metadata.get("biosample", pd.NA),
# Counts
- "Gene count": metadata.get('geneCount', pd.NA),
- "Protein count": metadata.get('proteinCount', pd.NA),
- "Mature Peptide Count": metadata.get('maturePeptideCount', pd.NA),
-
+ "Gene count": metadata.get("geneCount", pd.NA),
+ "Protein count": metadata.get("proteinCount", pd.NA),
+ "Mature Peptide Count": metadata.get("maturePeptideCount", pd.NA),
# GenBank-specific columns
- "definition": genbank_data.get('definition', pd.NA),
- "strain": genbank_data.get('strain', pd.NA),
- "isolation_source": genbank_data.get('isolation_source', pd.NA),
- "create_date": genbank_data.get('create_date', pd.NA),
- "update_date": genbank_data.get('update_date', pd.NA),
- "assembly_name": genbank_data.get('assembly_name', pd.NA),
- "authors": first_ref.get('authors', pd.NA),
- "title": first_ref.get('title', pd.NA),
- "journal": first_ref.get('journal', pd.NA),
- "pubmed_id": first_ref.get('pubmed_id', pd.NA),
+ "definition": genbank_data.get("definition", pd.NA),
+ "strain": genbank_data.get("strain", pd.NA),
+ "isolation_source": genbank_data.get("isolation_source", pd.NA),
+ "create_date": genbank_data.get("create_date", pd.NA),
+ "update_date": genbank_data.get("update_date", pd.NA),
+ "assembly_name": genbank_data.get("assembly_name", pd.NA),
+ "authors": first_ref.get("authors", pd.NA),
+ "title": first_ref.get("title", pd.NA),
+ "journal": first_ref.get("journal", pd.NA),
+ "pubmed_id": first_ref.get("pubmed_id", pd.NA),
"reference_count": len(references) if references else pd.NA,
- "comment": genbank_data.get('comment', pd.NA),
+ "comment": genbank_data.get("comment", pd.NA),
}
-
+
data_for_df.append(row)
-
+
logger.info("Creating DataFrame with %d rows and %d columns", len(data_for_df), len(columns))
-
+
# Create DataFrame with the specified column order
df = pd.DataFrame(data_for_df, columns=columns)
# Write DataFrame to CSV file
try:
- df.to_csv(output_file, index=False, encoding='utf-8')
+ df.to_csv(output_file, index=False, encoding="utf-8")
logger.info("✅ GenBank metadata successfully saved to: %s", output_file)
- logger.info("CSV file contains %d rows and %d columns", len(df), len(df.columns))
+ logger.info("CSV file contains %d rows and %d columns", len(df), len(df.columns))
except Exception as e:
logger.error("❌ Failed to save GenBank metadata CSV: %s", e)
raise RuntimeError(f"❌ Failed to save GenBank metadata to {output_file}: {e}") from e
-
+
def filter_cached_metadata_for_unused_filters(
metadata_dict,
@@ -7008,13 +7344,12 @@ def filter_cached_metadata_for_unused_filters(
min_release_date=None,
applied_strategy_filters=None,
):
- """
- Apply filters that were not used in the cached download strategy.
-
+ """Apply filters that were not used in the cached download strategy.
+
This is Step 3 of the cached download pipeline. It applies:
1. Server-side filters not used in the successful cached strategy: host, complete_only, annotated, lineage
2. API-only filters that couldn't be applied server-side: geographic_location, refseq_only, min_release_date
-
+
Args:
metadata_dict (dict): Dictionary mapping accession numbers to metadata from cached download.
host (str, optional): Host organism filter (not applied if in applied_strategy_filters).
@@ -7026,129 +7361,135 @@ def filter_cached_metadata_for_unused_filters(
min_release_date (str, optional): Minimum release date filter (API-only, always applied if specified).
applied_strategy_filters (list, optional): List of filter names that were applied during cached strategy.
Includes: 'host', 'complete-only', 'annotated', 'lineage'
-
- Returns:
+
+ Returns
+ -------
tuple: (filtered_accessions, filtered_metadata_list)
- """
- if applied_strategy_filters is None and geographic_location is None and refseq_only is None and min_release_date is None:
+ """
+ if (
+ applied_strategy_filters is None
+ and geographic_location is None
+ and refseq_only is None
+ and min_release_date is None
+ ):
logger.debug("No filters specified for post-cached-download filtering. Returning all metadata unchanged.")
return list(metadata_dict.keys()), list(metadata_dict.values())
-
+
if applied_strategy_filters is None:
applied_strategy_filters = []
-
- logger.info("="*60)
+
+ logger.info("=" * 60)
logger.info("STEP 3b: Applying post-cached-download filters")
- logger.info("="*60)
+ logger.info("=" * 60)
logger.debug("Filters applied during cached strategy: %s", applied_strategy_filters)
-
+
# Determine which filters to apply based on what wasn't used in strategy
filters_to_apply = {}
-
+
# Server-side filters (only apply if not already applied in strategy)
- if 'host' not in applied_strategy_filters and host:
- filters_to_apply['host'] = host
+ if "host" not in applied_strategy_filters and host:
+ filters_to_apply["host"] = host
logger.debug("Will apply host filter: %s (not used in cached strategy)", host)
-
- if 'complete-only' not in applied_strategy_filters and complete_only:
- filters_to_apply['complete_only'] = complete_only
+
+ if "complete-only" not in applied_strategy_filters and complete_only:
+ filters_to_apply["complete_only"] = complete_only
logger.debug("Will apply complete-only filter (not used in cached strategy)")
-
- if 'annotated' not in applied_strategy_filters and annotated:
- filters_to_apply['annotated'] = annotated
+
+ if "annotated" not in applied_strategy_filters and annotated:
+ filters_to_apply["annotated"] = annotated
logger.debug("Will apply annotated filter (not used in cached strategy)")
-
- if 'lineage' not in applied_strategy_filters and lineage:
- filters_to_apply['lineage'] = lineage
+
+ if "lineage" not in applied_strategy_filters and lineage:
+ filters_to_apply["lineage"] = lineage
logger.debug("Will apply lineage filter: %s (not used in cached strategy)", lineage)
-
+
# API-only filters (always apply if specified since they're never in cached strategy)
if geographic_location:
- filters_to_apply['geographic_location'] = geographic_location
+ filters_to_apply["geographic_location"] = geographic_location
logger.debug("Will apply geographic_location filter: %s (API-only)", geographic_location)
-
+
if refseq_only:
- filters_to_apply['refseq_only'] = refseq_only
+ filters_to_apply["refseq_only"] = refseq_only
logger.debug("Will apply refseq_only filter (API-only)")
-
+
if min_release_date:
- filters_to_apply['min_release_date'] = min_release_date
+ filters_to_apply["min_release_date"] = min_release_date
logger.debug("Will apply min_release_date filter: %s (API-only)", min_release_date)
-
+
# If no filters to apply, return all metadata unchanged
if not filters_to_apply:
logger.debug("No post-cached-download filters to apply. All %d records will proceed.", len(metadata_dict))
return list(metadata_dict.keys()), list(metadata_dict.values())
-
+
logger.info("Applying post-cached-download filters: %s", list(filters_to_apply.keys()))
-
+
# Parse min_release_date once for efficiency
min_release_date_parsed = None
- if 'min_release_date' in filters_to_apply:
+ if "min_release_date" in filters_to_apply:
min_release_date_parsed = _parse_date(min_release_date, filtername="min_release_date")
-
+
# Apply filters to each metadata record
filtered_accessions = []
filtered_metadata_list = []
filter_stats = {
- 'host': 0,
- 'complete_only': 0,
- 'annotated': 0,
- 'lineage': 0,
- 'geographic_location': 0,
- 'refseq_only': 0,
- 'min_release_date': 0,
+ "host": 0,
+ "complete_only": 0,
+ "annotated": 0,
+ "lineage": 0,
+ "geographic_location": 0,
+ "refseq_only": 0,
+ "min_release_date": 0,
}
-
+
for accession, metadata in metadata_dict.items():
# Apply each filter - if any fails, skip this record
-
+
# Host filter
- if 'host' in filters_to_apply:
- host_name = metadata.get('hostName', '')
+ if "host" in filters_to_apply:
+ host_name = metadata.get("hostName", "")
if not host_name:
logger.debug("Skipping %s: missing host metadata", accession)
- filter_stats['host'] += 1
+ filter_stats["host"] += 1
continue
if host.lower() not in host_name.lower():
logger.debug("Skipping %s: host '%s' does not match '%s'", accession, host_name, host)
- filter_stats['host'] += 1
+ filter_stats["host"] += 1
continue
-
+
# Complete-only filter
- if 'complete_only' in filters_to_apply:
- nuc_completeness = metadata.get('completeness', '')
- if not nuc_completeness or nuc_completeness.lower() != 'complete':
+ if "complete_only" in filters_to_apply:
+ nuc_completeness = metadata.get("completeness", "")
+ if not nuc_completeness or nuc_completeness.lower() != "complete":
logger.debug("Skipping %s: completeness '%s' != 'complete'", accession, nuc_completeness)
- filter_stats['complete_only'] += 1
+ filter_stats["complete_only"] += 1
continue
-
+
# Annotated filter
- if 'annotated' in filters_to_apply:
- is_annotated = metadata.get('isAnnotated', False)
+ if "annotated" in filters_to_apply:
+ is_annotated = metadata.get("isAnnotated", False)
if not is_annotated:
logger.debug("Skipping %s: not annotated", accession)
- filter_stats['annotated'] += 1
+ filter_stats["annotated"] += 1
continue
-
+
# Lineage filter (SARS-CoV-2 specific)
- if 'lineage' in filters_to_apply:
- virus_pangolin = metadata.get('virusPangolinClassification', '')
+ if "lineage" in filters_to_apply:
+ virus_pangolin = metadata.get("virusPangolinClassification", "")
if not virus_pangolin or lineage.lower() not in virus_pangolin.lower():
logger.debug("Skipping %s: lineage '%s' does not match '%s'", accession, virus_pangolin, lineage)
- filter_stats['lineage'] += 1
+ filter_stats["lineage"] += 1
continue
-
+
# Geographic location filter (API-only)
- if 'geographic_location' in filters_to_apply:
- geo_loc = metadata.get('location', '') or ''
- geo_region = metadata.get('region', '') or ''
- virus_name = metadata.get('virusName', '') or ''
+ if "geographic_location" in filters_to_apply:
+ geo_loc = metadata.get("location", "") or ""
+ geo_region = metadata.get("region", "") or ""
+ virus_name = metadata.get("virusName", "") or ""
# Only skip if ALL location sources are empty
if not geo_loc and not geo_region and not virus_name:
logger.debug("Skipping %s: missing location, region, and virusName metadata", accession)
- filter_stats['geographic_location'] += 1
+ filter_stats["geographic_location"] += 1
continue
# Check if filter matches location, region, or virusName (fallback for older records)
geo_filter = geographic_location.lower()
@@ -7157,46 +7498,54 @@ def filter_cached_metadata_for_unused_filters(
# Also check virusName as fallback (e.g., "B/USA/65/2002" contains location in name)
virus_name_matches = virus_name and geo_filter in virus_name.lower()
if not loc_matches and not region_matches and not virus_name_matches:
- logger.debug("Skipping %s: geo_location '%s', region '%s', virusName '%s' do not match '%s'", accession, geo_loc, geo_region, virus_name, geographic_location)
- filter_stats['geographic_location'] += 1
+ logger.debug(
+ "Skipping %s: geo_location '%s', region '%s', virusName '%s' do not match '%s'",
+ accession,
+ geo_loc,
+ geo_region,
+ virus_name,
+ geographic_location,
+ )
+ filter_stats["geographic_location"] += 1
continue
-
+
# RefSeq only filter (API-only)
- if 'refseq_only' in filters_to_apply:
- is_refseq = metadata.get('sourceDatabase', '').lower() == 'refseq'
+ if "refseq_only" in filters_to_apply:
+ is_refseq = metadata.get("sourceDatabase", "").lower() == "refseq"
if not is_refseq:
logger.debug("Skipping %s: not RefSeq (refseq_only=True)", accession)
- filter_stats['refseq_only'] += 1
+ filter_stats["refseq_only"] += 1
continue
-
+
# Minimum release date filter (API-only)
- if 'min_release_date' in filters_to_apply:
- release_date_str = metadata.get('releaseDate', '')
+ if "min_release_date" in filters_to_apply:
+ release_date_str = metadata.get("releaseDate", "")
if not release_date_str:
logger.debug("Skipping %s: missing release date metadata", accession)
- filter_stats['min_release_date'] += 1
+ filter_stats["min_release_date"] += 1
continue
release_date = _parse_date(release_date_str.split("T")[0], filtername="release_date")
if not release_date or (min_release_date_parsed and release_date < min_release_date_parsed):
logger.debug("Skipping %s: release date %s < min %s", accession, release_date, min_release_date_parsed)
- filter_stats['min_release_date'] += 1
+ filter_stats["min_release_date"] += 1
continue
-
+
# All filters passed
filtered_accessions.append(accession)
filtered_metadata_list.append(metadata)
-
+
# Log comprehensive filtering statistics
- logger.info("✅ Post-cached-download filtering complete: %d -> %d records",
- len(metadata_dict), len(filtered_accessions))
-
+ logger.info(
+ "✅ Post-cached-download filtering complete: %d -> %d records", len(metadata_dict), len(filtered_accessions)
+ )
+
total_filtered = sum(filter_stats.values())
if total_filtered > 0:
logger.info("Filter statistics (records excluded):")
for filter_name, count in filter_stats.items():
if count > 0:
logger.info(" %s: %d records", filter_name, count)
-
+
return filtered_accessions, filtered_metadata_list
@@ -7227,23 +7576,42 @@ def filter_metadata_only(
geographic_location=None,
host=None,
):
- """
- Filter metadata records based on metadata-only criteria.
-
+ """Filter metadata records based on metadata-only criteria.
+
Applies filters that can be evaluated using only metadata, reducing the
number of accessions before downloading sequences. Sequence-dependent
filters are deferred to post-download filtering.
-
+
Args:
metadata_dict (dict): Dictionary mapping accession numbers to metadata.
(other args): Filter criteria - same as filter_sequences.
-
- Returns:
+
+ Returns
+ -------
tuple: (filtered_accessions, filtered_metadata_list)
+
"""
-
logger.info("Starting metadata-only filtering process...")
- logger.debug("Applying metadata-only filters: seq_length(%s-%s), completeness(%s), lab_passaged(%s), annotated(%s), submitter_country(%s), collection_date(%s-%s), source_database(%s), max_release_date(%s), protein_count(%s-%s), segment(%s), vaccine_strain(%s), submitter_name(%s), submitter_institution(%s), isolate(%s)", min_seq_length, max_seq_length, nuc_completeness, lab_passaged, annotated, submitter_country, min_collection_date, max_collection_date, source_database, max_release_date, min_protein_count, max_protein_count, segment, vaccine_strain, submitter_name, submitter_institution, isolate)
+ logger.debug(
+ "Applying metadata-only filters: seq_length(%s-%s), completeness(%s), lab_passaged(%s), annotated(%s), submitter_country(%s), collection_date(%s-%s), source_database(%s), max_release_date(%s), protein_count(%s-%s), segment(%s), vaccine_strain(%s), submitter_name(%s), submitter_institution(%s), isolate(%s)",
+ min_seq_length,
+ max_seq_length,
+ nuc_completeness,
+ lab_passaged,
+ annotated,
+ submitter_country,
+ min_collection_date,
+ max_collection_date,
+ source_database,
+ max_release_date,
+ min_protein_count,
+ max_protein_count,
+ segment,
+ vaccine_strain,
+ submitter_name,
+ submitter_institution,
+ isolate,
+ )
# Convert date filters to datetime objects for proper comparison
# Parse user-provided filter dates with appropriate partial date handling:
@@ -7252,19 +7620,23 @@ def filter_metadata_only(
min_collection_date = (
_parse_partial_date_for_range_check(
min_collection_date, for_min_comparison=False, filtername="min_collection_date"
- ) if min_collection_date else None
+ )
+ if min_collection_date
+ else None
)
max_collection_date = (
_parse_partial_date_for_range_check(
max_collection_date, for_min_comparison=True, filtername="max_collection_date"
- ) if max_collection_date else None
+ )
+ if max_collection_date
+ else None
)
max_release_date = (
- _parse_partial_date_for_range_check(
- max_release_date, for_min_comparison=True, filtername="max_release_date"
- ) if max_release_date else None
+ _parse_partial_date_for_range_check(max_release_date, for_min_comparison=True, filtername="max_release_date")
+ if max_release_date
+ else None
)
-
+
if min_collection_date:
logger.debug("Parsed min_collection_date: %s", min_collection_date)
if max_collection_date:
@@ -7275,55 +7647,55 @@ def filter_metadata_only(
# Initialize lists to store filtered results
filtered_accessions = []
filtered_metadata_list = []
-
+
# Counters for logging filter statistics
total_sequences = len(metadata_dict)
filter_stats = {
- 'seq_length': 0,
+ "seq_length": 0,
# 'gene_count': 0,
- 'completeness': 0,
- 'lab_passaged': 0,
- 'annotated': 0,
- 'submitter_country': 0,
- 'collection_date': 0,
- 'source_database': 0,
- 'release_date': 0,
+ "completeness": 0,
+ "lab_passaged": 0,
+ "annotated": 0,
+ "submitter_country": 0,
+ "collection_date": 0,
+ "source_database": 0,
+ "release_date": 0,
# 'mature_peptide_count': 0,
- 'protein_count': 0,
- 'segment': 0,
- 'vaccine_strain': 0,
- 'submitter_name': 0,
- 'submitter_institution': 0,
- 'isolate': 0,
- 'isolation_source': 0,
- 'geographic_location': 0,
- 'host': 0,
+ "protein_count": 0,
+ "segment": 0,
+ "vaccine_strain": 0,
+ "submitter_name": 0,
+ "submitter_institution": 0,
+ "isolate": 0,
+ "isolation_source": 0,
+ "geographic_location": 0,
+ "host": 0,
}
logger.info("Processing %d metadata records...", total_sequences)
-
+
for accession, metadata in metadata_dict.items():
# logger.debug("Processing metadata for: %s", accession)
-
+
# Apply filters sequentially - each filter can exclude the record
# If any filter fails, we continue to the next record
-
+
# FILTER 1: Sequence length filters
if min_seq_length is not None or max_seq_length is not None:
sequence_length = metadata.get("length")
if sequence_length is None:
logger.debug("Skipping %s: missing length metadata", accession)
- filter_stats['seq_length'] += 1
+ filter_stats["seq_length"] += 1
continue
-
+
if min_seq_length is not None and sequence_length < min_seq_length:
logger.debug("Skipping %s: length %d < min %d", accession, sequence_length, min_seq_length)
- filter_stats['seq_length'] += 1
+ filter_stats["seq_length"] += 1
continue
-
+
if max_seq_length is not None and sequence_length > max_seq_length:
logger.debug("Skipping %s: length %d > max %d", accession, sequence_length, max_seq_length)
- filter_stats['seq_length'] += 1
+ filter_stats["seq_length"] += 1
continue
# FILTER 2: Gene count filters
@@ -7333,12 +7705,12 @@ def filter_metadata_only(
# logger.debug("Skipping %s: missing gene count metadata", accession)
# filter_stats['gene_count'] += 1
# continue
-
+
# if min_gene_count is not None and gene_count < min_gene_count:
# logger.debug("Skipping %s: gene count %d < min %d", accession, gene_count, min_gene_count)
# filter_stats['gene_count'] += 1
# continue
-
+
# if max_gene_count is not None and gene_count > max_gene_count:
# logger.debug("Skipping %s: gene count %d > max %d", accession, gene_count, max_gene_count)
# filter_stats['gene_count'] += 1
@@ -7349,13 +7721,14 @@ def filter_metadata_only(
completeness_status = metadata.get("completeness")
if completeness_status is None:
logger.debug("Skipping %s: missing completeness metadata", accession)
- filter_stats['completeness'] += 1
+ filter_stats["completeness"] += 1
continue
-
+
if completeness_status.lower() != nuc_completeness.lower():
- logger.debug("Skipping %s: completeness '%s' != required '%s'",
- accession, completeness_status, nuc_completeness)
- filter_stats['completeness'] += 1
+ logger.debug(
+ "Skipping %s: completeness '%s' != required '%s'", accession, completeness_status, nuc_completeness
+ )
+ filter_stats["completeness"] += 1
continue
# FILTER 4: Lab passaging status filter
@@ -7363,14 +7736,14 @@ def filter_metadata_only(
from_lab = metadata.get("isLabHost")
if not from_lab:
logger.debug("Skipping %s: not lab-passaged (required)", accession)
- filter_stats['lab_passaged'] += 1
+ filter_stats["lab_passaged"] += 1
continue
if lab_passaged is False:
from_lab = metadata.get("isLabHost")
if from_lab:
logger.debug("Skipping %s: is lab-passaged (excluded)", accession)
- filter_stats['lab_passaged'] += 1
+ filter_stats["lab_passaged"] += 1
continue
# FILTER 5: Annotation status filter
@@ -7380,135 +7753,151 @@ def filter_metadata_only(
is_annotated = metadata.get("isAnnotated")
if is_annotated:
logger.debug("Skipping %s: is annotated (excluded when annotated=False)", accession)
- filter_stats['annotated'] += 1
+ filter_stats["annotated"] += 1
continue
# FILTER 6: Submitter country filter
if submitter_country is not None:
- submitter_country_value = "_".join(
- (metadata.get("submitterCountry") or "").split(" ")
- ).lower()
-
+ submitter_country_value = "_".join((metadata.get("submitterCountry") or "").split(" ")).lower()
+
if not submitter_country_value:
logger.debug("Skipping %s: missing submitter country", accession)
- filter_stats['submitter_country'] += 1
+ filter_stats["submitter_country"] += 1
continue
-
+
if submitter_country_value != submitter_country.lower():
- logger.debug("Skipping %s: submitter country '%s' != required '%s'",
- accession, submitter_country_value, submitter_country.lower())
- filter_stats['submitter_country'] += 1
+ logger.debug(
+ "Skipping %s: submitter country '%s' != required '%s'",
+ accession,
+ submitter_country_value,
+ submitter_country.lower(),
+ )
+ filter_stats["submitter_country"] += 1
continue
# FILTER 7: Submitter name filter
if submitter_name is not None:
# Convert submitter_name to list if it's a string
submitter_name_list = [submitter_name] if isinstance(submitter_name, str) else submitter_name
-
+
# Get submitter name from metadata
metadata_submitter_name = metadata.get("submitterName", "")
if not metadata_submitter_name:
- filter_stats['submitter_name'] += 1
+ filter_stats["submitter_name"] += 1
continue
-
+
# Build set of acceptable submitter name values (case-insensitive)
acceptable_names = {s.lower().strip() for s in submitter_name_list}
-
+
# Check if metadata submitter name matches any acceptable value (case-insensitive)
metadata_submitter_name_lower = str(metadata_submitter_name).lower().strip()
if metadata_submitter_name_lower not in acceptable_names:
- logger.debug("Skipping %s: submitter name '%s' not in required list %s",
- accession, metadata_submitter_name, submitter_name_list)
- filter_stats['submitter_name'] += 1
+ logger.debug(
+ "Skipping %s: submitter name '%s' not in required list %s",
+ accession,
+ metadata_submitter_name,
+ submitter_name_list,
+ )
+ filter_stats["submitter_name"] += 1
continue
# FILTER 8: Submitter institution filter
if submitter_institution is not None:
# Convert submitter_institution to list if it's a string
- submitter_institution_list = [submitter_institution] if isinstance(submitter_institution, str) else submitter_institution
-
+ submitter_institution_list = (
+ [submitter_institution] if isinstance(submitter_institution, str) else submitter_institution
+ )
+
# Get submitter institution from metadata
metadata_submitter_institution = metadata.get("submitterInstitution", "")
if not metadata_submitter_institution:
- filter_stats['submitter_institution'] += 1
+ filter_stats["submitter_institution"] += 1
continue
-
+
# Build set of acceptable submitter institution values (case-insensitive)
acceptable_institutions = {s.lower().strip() for s in submitter_institution_list}
-
+
# Check if metadata submitter institution matches any acceptable value (case-insensitive)
metadata_submitter_institution_lower = str(metadata_submitter_institution).lower().strip()
if metadata_submitter_institution_lower not in acceptable_institutions:
- logger.debug("Skipping %s: submitter institution '%s' not in required list %s",
- accession, metadata_submitter_institution, submitter_institution_list)
- filter_stats['submitter_institution'] += 1
+ logger.debug(
+ "Skipping %s: submitter institution '%s' not in required list %s",
+ accession,
+ metadata_submitter_institution,
+ submitter_institution_list,
+ )
+ filter_stats["submitter_institution"] += 1
continue
# FILTER 9: isolate filter
if isolate is not None:
# Convert isolate to list if it's a string
isolate_list = [isolate] if isinstance(isolate, str) else isolate
-
+
# Get isolate from metadata
metadata_isolate = metadata.get("isolateName", "")
if not metadata_isolate:
- filter_stats['isolate'] += 1
+ filter_stats["isolate"] += 1
continue
-
+
# Build set of acceptable isolate values (case-insensitive)
acceptable_isolates = {s.lower().strip() for s in isolate_list}
-
+
# Check if metadata isolate matches any acceptable value (case-insensitive)
metadata_isolate_lower = str(metadata_isolate).lower().strip()
if metadata_isolate_lower not in acceptable_isolates:
- logger.debug("Skipping %s: isolate '%s' not in required list %s",
- accession, metadata_isolate, isolate_list)
- filter_stats['isolate'] += 1
+ logger.debug(
+ "Skipping %s: isolate '%s' not in required list %s", accession, metadata_isolate, isolate_list
+ )
+ filter_stats["isolate"] += 1
continue
# FILTER 10: isolation source filter
if isolation_source is not None:
# Convert isolation_source to list if it's a string
isolation_source_list = [isolation_source] if isinstance(isolation_source, str) else isolation_source
-
+
# Get isolation source from metadata
metadata_isolation_source = metadata.get("isolate", {}).get("source", "")
if not metadata_isolation_source:
- filter_stats['isolation_source'] += 1
+ filter_stats["isolation_source"] += 1
continue
-
+
# Build set of acceptable isolation source values (case-insensitive)
acceptable_isolation_sources = {s.lower().strip() for s in isolation_source_list}
-
+
# Check if metadata isolation source matches any acceptable value (case-insensitive)
metadata_isolation_source_lower = str(metadata_isolation_source).lower().strip()
if metadata_isolation_source_lower not in acceptable_isolation_sources:
- logger.debug("Skipping %s: isolation source '%s' not in required list %s",
- accession, metadata_isolation_source, isolation_source_list)
- filter_stats['isolation_source'] += 1
+ logger.debug(
+ "Skipping %s: isolation source '%s' not in required list %s",
+ accession,
+ metadata_isolation_source,
+ isolation_source_list,
+ )
+ filter_stats["isolation_source"] += 1
continue
-
# FILTER 11: Collection date range filter
if min_collection_date is not None or max_collection_date is not None:
date_str = metadata.get("isolate", {}).get("collectionDate", "")
-
+
# Skip records with empty or missing collection date
if not date_str:
logger.debug("Skipping %s: missing or empty collection date", accession)
- filter_stats['collection_date'] += 1
+ filter_stats["collection_date"] += 1
continue
-
+
# Handle partial dates (year-only or year-month) appropriately for range checks
# For min_collection_date: use end of partial range (record COULD be >= min)
# For max_collection_date: use start of partial range (record COULD be <= max)
-
+
skip_record = False
-
+
if min_collection_date:
try:
# Parse with for_min_comparison=True to use end of range for partial dates
@@ -7517,20 +7906,25 @@ def filter_metadata_only(
)
except ValueError:
logger.debug("Skipping %s: invalid collection date format '%s'", accession, date_str)
- filter_stats['collection_date'] += 1
+ filter_stats["collection_date"] += 1
continue
-
+
if date_for_min is None:
logger.debug("Skipping %s: missing or invalid collection date '%s'", accession, date_str)
- filter_stats['collection_date'] += 1
+ filter_stats["collection_date"] += 1
continue
-
+
if date_for_min < min_collection_date:
- logger.debug("Skipping %s: collection date %s (from '%s') < min %s",
- accession, date_for_min, date_str, min_collection_date)
- filter_stats['collection_date'] += 1
+ logger.debug(
+ "Skipping %s: collection date %s (from '%s') < min %s",
+ accession,
+ date_for_min,
+ date_str,
+ min_collection_date,
+ )
+ filter_stats["collection_date"] += 1
skip_record = True
-
+
if not skip_record and max_collection_date:
try:
# Parse with for_min_comparison=False to use start of range for partial dates
@@ -7539,35 +7933,40 @@ def filter_metadata_only(
)
except ValueError:
logger.debug("Skipping %s: invalid collection date format '%s'", accession, date_str)
- filter_stats['collection_date'] += 1
+ filter_stats["collection_date"] += 1
continue
-
+
if date_for_max is None:
logger.debug("Skipping %s: missing or invalid collection date '%s'", accession, date_str)
- filter_stats['collection_date'] += 1
+ filter_stats["collection_date"] += 1
continue
-
+
if date_for_max > max_collection_date:
- logger.debug("Skipping %s: collection date %s (from '%s') > max %s",
- accession, date_for_max, date_str, max_collection_date)
- filter_stats['collection_date'] += 1
+ logger.debug(
+ "Skipping %s: collection date %s (from '%s') > max %s",
+ accession,
+ date_for_max,
+ date_str,
+ max_collection_date,
+ )
+ filter_stats["collection_date"] += 1
skip_record = True
-
+
if skip_record:
continue
# FILTER 12: Maximum release date filter
if max_release_date is not None:
release_date_str = metadata.get("releaseDate")
-
+
if not release_date_str:
logger.debug("Skipping %s: missing release date", accession)
- filter_stats['release_date'] += 1
+ filter_stats["release_date"] += 1
continue
-
+
# Strip time portion if present (e.g., "2024-02-14T00:00:00Z" -> "2024-02-14")
release_date_str_clean = release_date_str.split("T")[0]
-
+
try:
# For max comparison, use START of range for partial dates
# so that records that COULD be within range are included
@@ -7576,39 +7975,44 @@ def filter_metadata_only(
)
except ValueError:
logger.debug("Skipping %s: invalid release date format '%s'", accession, release_date_str)
- filter_stats['release_date'] += 1
+ filter_stats["release_date"] += 1
continue
-
+
if release_date_value is None:
logger.debug("Skipping %s: invalid release date '%s'", accession, release_date_str)
- filter_stats['release_date'] += 1
+ filter_stats["release_date"] += 1
continue
-
+
if release_date_value > max_release_date:
- logger.debug("Skipping %s: release date %s (from '%s') > max %s",
- accession, release_date_value, release_date_str, max_release_date)
- filter_stats['release_date'] += 1
+ logger.debug(
+ "Skipping %s: release date %s (from '%s') > max %s",
+ accession,
+ release_date_value,
+ release_date_str,
+ max_release_date,
+ )
+ filter_stats["release_date"] += 1
continue
# FILTER 13: Mature peptide count filters
# if min_mature_peptide_count is not None or max_mature_peptide_count is not None:
# mature_peptide_count = metadata.get("maturePeptideCount")
-
+
# if mature_peptide_count is None:
# logger.debug("Skipping %s: missing mature peptide count", accession)
# filter_stats['mature_peptide_count'] += 1
# continue
-
- # if (min_mature_peptide_count is not None and
+
+ # if (min_mature_peptide_count is not None and
# mature_peptide_count < min_mature_peptide_count):
- # logger.debug("Skipping %s: mature peptide count %d < min %d",
+ # logger.debug("Skipping %s: mature peptide count %d < min %d",
# accession, mature_peptide_count, min_mature_peptide_count)
# filter_stats['mature_peptide_count'] += 1
# continue
-
- # if (max_mature_peptide_count is not None and
+
+ # if (max_mature_peptide_count is not None and
# mature_peptide_count > max_mature_peptide_count):
- # logger.debug("Skipping %s: mature peptide count %d > max %d",
+ # logger.debug("Skipping %s: mature peptide count %d > max %d",
# accession, mature_peptide_count, max_mature_peptide_count)
# filter_stats['mature_peptide_count'] += 1
# continue
@@ -7616,45 +8020,44 @@ def filter_metadata_only(
# FILTER 14: Protein count filters
if min_protein_count is not None or max_protein_count is not None:
protein_count = metadata.get("proteinCount")
-
+
if protein_count is None:
logger.debug("Skipping %s: missing protein count", accession)
- filter_stats['protein_count'] += 1
+ filter_stats["protein_count"] += 1
continue
-
+
if min_protein_count is not None and protein_count < min_protein_count:
- logger.debug("Skipping %s: protein count %d < min %d",
- accession, protein_count, min_protein_count)
- filter_stats['protein_count'] += 1
+ logger.debug("Skipping %s: protein count %d < min %d", accession, protein_count, min_protein_count)
+ filter_stats["protein_count"] += 1
continue
-
+
if max_protein_count is not None and protein_count > max_protein_count:
- logger.debug("Skipping %s: protein count %d > max %d",
- accession, protein_count, max_protein_count)
- filter_stats['protein_count'] += 1
+ logger.debug("Skipping %s: protein count %d > max %d", accession, protein_count, max_protein_count)
+ filter_stats["protein_count"] += 1
continue
# FILTER 15: Segment filter - simple case-insensitive matching
if segment is not None:
# Convert segment to list if it's a string
segment_list = [segment] if isinstance(segment, str) else segment
-
+
# Get segment from metadata
metadata_segment = metadata.get("segment")
-
+
if not metadata_segment:
- filter_stats['segment'] += 1
+ filter_stats["segment"] += 1
continue
-
+
# Build set of acceptable segment values (case-insensitive)
acceptable_segments = {s.lower().strip() for s in segment_list}
-
+
# Check if metadata segment matches any acceptable value (case-insensitive)
metadata_segment_lower = str(metadata_segment).lower().strip()
if metadata_segment_lower not in acceptable_segments:
- logger.debug("Skipping %s: segment '%s' not in required list %s",
- accession, metadata_segment, segment_list)
- filter_stats['segment'] += 1
+ logger.debug(
+ "Skipping %s: segment '%s' not in required list %s", accession, metadata_segment, segment_list
+ )
+ filter_stats["segment"] += 1
continue
# FILTER 16: Vaccine strain filter
@@ -7663,12 +8066,12 @@ def filter_metadata_only(
if vaccine_strain:
if not is_vaccine:
logger.debug("Skipping %s: not a vaccine strain (required)", accession)
- filter_stats['vaccine_strain'] += 1
+ filter_stats["vaccine_strain"] += 1
continue
- if vaccine_strain == False:
+ if not vaccine_strain:
if is_vaccine:
logger.debug("Skipping %s: is a vaccine strain (not allowed)", accession)
- filter_stats['vaccine_strain'] += 1
+ filter_stats["vaccine_strain"] += 1
continue
# FILTER 17: Source database filter
@@ -7676,104 +8079,119 @@ def filter_metadata_only(
source_db = (metadata.get("sourceDatabase") or "").lower()
if not source_db:
logger.debug("Skipping %s: missing source database", accession)
- filter_stats['source_database'] += 1
+ filter_stats["source_database"] += 1
continue
-
+
if source_db != source_database.lower():
- logger.debug("Skipping %s: source database '%s' != required '%s'",
- accession, source_db, source_database.lower())
- filter_stats['source_database'] += 1
+ logger.debug(
+ "Skipping %s: source database '%s' != required '%s'", accession, source_db, source_database.lower()
+ )
+ filter_stats["source_database"] += 1
continue
# FILTER 18: Geographic location filter (deferred from API when server-side filter fails)
if geographic_location is not None:
# Convert geographic_location to list if it's a string
geo_location_list = [geographic_location] if isinstance(geographic_location, str) else geographic_location
-
+
# Get geographic location from metadata - stored in "location", "region", and "virusName" fields
metadata_location = metadata.get("location", "") or ""
metadata_region = metadata.get("region", "") or ""
metadata_virus_name = metadata.get("virusName", "") or ""
-
+
# Only skip if ALL location sources are empty
if not metadata_location and not metadata_region and not metadata_virus_name:
logger.debug("Skipping %s: missing location, region, and virusName", accession)
- filter_stats['geographic_location'] += 1
+ filter_stats["geographic_location"] += 1
continue
-
+
# Build set of acceptable location values (case-insensitive)
# Normalize the filter values: remove special chars and create variations
acceptable_locations = set()
for loc in geo_location_list:
loc_normalized = loc.lower().strip()
# Remove common separators and create variations
- loc_normalized = loc_normalized.replace('-', ' ').replace('_', ' ').replace('+', ' ')
+ loc_normalized = loc_normalized.replace("-", " ").replace("_", " ").replace("+", " ")
acceptable_locations.add(loc_normalized)
# Also add version without spaces for matching
- acceptable_locations.add(loc_normalized.replace(' ', ''))
-
+ acceptable_locations.add(loc_normalized.replace(" ", ""))
+
# Normalize metadata location and region for comparison
metadata_location_lower = str(metadata_location).lower().strip()
- metadata_location_normalized = metadata_location_lower.replace('-', ' ').replace('_', ' ')
- metadata_location_no_spaces = metadata_location_normalized.replace(' ', '')
-
+ metadata_location_normalized = metadata_location_lower.replace("-", " ").replace("_", " ")
+ metadata_location_no_spaces = metadata_location_normalized.replace(" ", "")
+
metadata_region_lower = str(metadata_region).lower().strip()
- metadata_region_normalized = metadata_region_lower.replace('-', ' ').replace('_', ' ')
- metadata_region_no_spaces = metadata_region_normalized.replace(' ', '')
+ metadata_region_normalized = metadata_region_lower.replace("-", " ").replace("_", " ")
+ metadata_region_no_spaces = metadata_region_normalized.replace(" ", "")
metadata_virus_name_lower = str(metadata_virus_name).lower().strip()
-
+
# Check for partial/substring match in location, region, OR virusName
location_match = False
for acceptable_loc in acceptable_locations:
# Check location field
- if metadata_location_normalized and (acceptable_loc in metadata_location_normalized or acceptable_loc in metadata_location_no_spaces):
+ if metadata_location_normalized and (
+ acceptable_loc in metadata_location_normalized or acceptable_loc in metadata_location_no_spaces
+ ):
location_match = True
break
# Also check if metadata location is contained in acceptable location
- if metadata_location_normalized and (metadata_location_normalized in acceptable_loc or metadata_location_no_spaces in acceptable_loc):
+ if metadata_location_normalized and (
+ metadata_location_normalized in acceptable_loc or metadata_location_no_spaces in acceptable_loc
+ ):
location_match = True
break
# Check region field
- if metadata_region_normalized and (acceptable_loc in metadata_region_normalized or acceptable_loc in metadata_region_no_spaces):
+ if metadata_region_normalized and (
+ acceptable_loc in metadata_region_normalized or acceptable_loc in metadata_region_no_spaces
+ ):
location_match = True
break
# Also check if metadata region is contained in acceptable location
- if metadata_region_normalized and (metadata_region_normalized in acceptable_loc or metadata_region_no_spaces in acceptable_loc):
+ if metadata_region_normalized and (
+ metadata_region_normalized in acceptable_loc or metadata_region_no_spaces in acceptable_loc
+ ):
location_match = True
break
# Check virusName as fallback (for older records where location is in the name)
if metadata_virus_name_lower and acceptable_loc in metadata_virus_name_lower:
location_match = True
break
-
+
if not location_match:
- logger.debug("Skipping %s: location '%s', region '%s', virusName '%s' do not match required '%s'",
- accession, metadata_location, metadata_region, metadata_virus_name, geo_location_list)
- filter_stats['geographic_location'] += 1
+ logger.debug(
+ "Skipping %s: location '%s', region '%s', virusName '%s' do not match required '%s'",
+ accession,
+ metadata_location,
+ metadata_region,
+ metadata_virus_name,
+ geo_location_list,
+ )
+ filter_stats["geographic_location"] += 1
continue
# FILTER 19: Host filter (deferred from API when server-side filter fails)
if host is not None:
# Convert host to list if it's a string
host_list = [host] if isinstance(host, str) else host
-
+
# Get host from metadata - stored in "hostName" field
metadata_host = metadata.get("hostName", "") or ""
-
+
if not metadata_host:
logger.debug("Skipping %s: missing hostName", accession)
- filter_stats['host'] += 1
+ filter_stats["host"] += 1
continue
-
+
# Build set of acceptable host values (case-insensitive)
acceptable_hosts = set()
for h in host_list:
h_normalized = h.lower().strip()
acceptable_hosts.add(h_normalized)
-
+
# Normalize metadata host for comparison
metadata_host_lower = str(metadata_host).lower().strip()
-
+
# Check for partial/substring match in host
host_match = False
for acceptable_host in acceptable_hosts:
@@ -7781,22 +8199,23 @@ def filter_metadata_only(
if acceptable_host in metadata_host_lower or metadata_host_lower in acceptable_host:
host_match = True
break
-
+
if not host_match:
- logger.debug("Skipping %s: hostName '%s' does not match required '%s'",
- accession, metadata_host, host_list)
- filter_stats['host'] += 1
+ logger.debug(
+ "Skipping %s: hostName '%s' does not match required '%s'", accession, metadata_host, host_list
+ )
+ filter_stats["host"] += 1
continue
# If we reach this point, the metadata record has passed all filters
filtered_accessions.append(accession)
filtered_metadata_list.append(metadata)
-
+
logger.debug("Metadata %s passed all filters", accession)
# Log comprehensive filtering statistics
num_filtered = len(filtered_accessions)
-
+
if num_filtered == 0:
# Simplified output when nothing passes filters
logger.info("=================================")
@@ -7814,7 +8233,7 @@ def filter_metadata_only(
logger.info("Metadata-only filtering complete:")
logger.info(" Total metadata records: %d", total_sequences)
logger.info(" Records passing filters: %d", num_filtered)
-
+
# Log detailed filter statistics if any records were filtered out
total_filtered = sum(filter_stats.values())
if total_filtered > 0:
@@ -7822,7 +8241,7 @@ def filter_metadata_only(
for filter_name, count in filter_stats.items():
if count > 0:
logger.info(" %s: %d records", filter_name, count)
-
+
return filtered_accessions, filtered_metadata_list, filter_stats
@@ -7838,13 +8257,12 @@ def filter_genbank_metadata(
gen_mol_type=None,
env_source=None,
):
- """
- Filter accessions based on GenBank-specific metadata fields.
-
+ """Filter accessions based on GenBank-specific metadata fields.
+
This function filters accessions using metadata extracted from GenBank XML
that is not available in the standard NCBI Datasets API. These fields require
fetching full GenBank records via E-utilities.
-
+
Filtering is done using the parsed genbank_metadata dictionary where each
accession maps to metadata containing a 'genbank_data' sub-dictionary with:
- gene_count: Number of gene features
@@ -7856,7 +8274,7 @@ def filter_genbank_metadata(
- isolation_source: Sample isolation source (for env_source filter)
- host: Host information (used to determine env_source when empty)
- comment: GenBank comment field (may contain env_source keywords)
-
+
Args:
genbank_metadata (dict): Dictionary mapping accession numbers to metadata
with 'genbank_data' sub-dictionary from _parse_genbank_xml.
@@ -7874,29 +8292,39 @@ def filter_genbank_metadata(
env_source (str or list, optional): Environmental source keywords to match.
Searches isolation_source and note fields for keywords like 'sewage',
'ocean water', 'sea', etc. when host is empty or not human.
-
- Returns:
+
+ Returns
+ -------
list: List of accession numbers that passed all GenBank-based filters.
-
+
Example:
>>> genbank_filtered = filter_genbank_metadata(
... genbank_metadata=genbank_data,
... min_gene_count=5,
... provirus=False,
- ... genotype=['H5N1', 'H5N8'],
- ... has_proteins='hemagglutinin',
+ ... genotype=["H5N1", "H5N8"],
+ ... has_proteins="hemagglutinin",
... )
+
"""
-
logger.info("Starting GenBank metadata filtering...")
- logger.debug("GenBank filters: gene_count(%s-%s), mature_peptide(%s-%s), provirus=%s, genotype=%s, has_proteins=%s, mol_type=%s, env_source=%s",
- min_gene_count, max_gene_count, min_mature_peptide_count, max_mature_peptide_count,
- provirus, genotype, has_proteins, gen_mol_type, env_source)
-
+ logger.debug(
+ "GenBank filters: gene_count(%s-%s), mature_peptide(%s-%s), provirus=%s, genotype=%s, has_proteins=%s, mol_type=%s, env_source=%s",
+ min_gene_count,
+ max_gene_count,
+ min_mature_peptide_count,
+ max_mature_peptide_count,
+ provirus,
+ genotype,
+ has_proteins,
+ gen_mol_type,
+ env_source,
+ )
+
if not genbank_metadata:
logger.warning("No GenBank metadata provided for filtering")
return []
-
+
# Convert single string filters to lists for uniform processing
if isinstance(genotype, str):
genotype = [genotype]
@@ -7904,86 +8332,108 @@ def filter_genbank_metadata(
has_proteins = [has_proteins]
if isinstance(env_source, str):
env_source = [env_source]
-
+
# Environmental source keywords to search for
- env_keywords = ['sewage', 'wastewater', 'ocean', 'sea', 'river', 'lake',
- 'pond', 'water', 'soil', 'environment', 'feces', 'fecal',
- 'stool', 'manure', 'avian', 'bird', 'poultry', 'swine', 'pig']
-
+ env_keywords = [
+ "sewage",
+ "wastewater",
+ "ocean",
+ "sea",
+ "river",
+ "lake",
+ "pond",
+ "water",
+ "soil",
+ "environment",
+ "feces",
+ "fecal",
+ "stool",
+ "manure",
+ "avian",
+ "bird",
+ "poultry",
+ "swine",
+ "pig",
+ ]
+
# Filter statistics
filter_stats = {
- 'gene_count': 0,
- 'mature_peptide_count': 0,
- 'provirus': 0,
- 'genotype': 0,
- 'has_proteins': 0,
- 'mol_type': 0,
- 'env_source': 0,
- 'no_genbank_data': 0,
+ "gene_count": 0,
+ "mature_peptide_count": 0,
+ "provirus": 0,
+ "genotype": 0,
+ "has_proteins": 0,
+ "mol_type": 0,
+ "env_source": 0,
+ "no_genbank_data": 0,
}
-
+
filtered_accessions = []
total_sequences = len(genbank_metadata)
-
+
for accession, metadata in genbank_metadata.items():
# Get genbank_data sub-dictionary
- gb_data = metadata.get('genbank_data', {})
+ gb_data = metadata.get("genbank_data", {})
if not gb_data:
logger.debug("Skipping %s: no genbank_data", accession)
- filter_stats['no_genbank_data'] += 1
+ filter_stats["no_genbank_data"] += 1
continue
-
+
# FILTER 1: Gene count
if min_gene_count is not None or max_gene_count is not None:
- gene_count = gb_data.get('gene_count', 0)
-
+ gene_count = gb_data.get("gene_count", 0)
+
if min_gene_count is not None and gene_count < min_gene_count:
logger.debug("Skipping %s: gene count %d < min %d", accession, gene_count, min_gene_count)
- filter_stats['gene_count'] += 1
+ filter_stats["gene_count"] += 1
continue
-
+
if max_gene_count is not None and gene_count > max_gene_count:
logger.debug("Skipping %s: gene count %d > max %d", accession, gene_count, max_gene_count)
- filter_stats['gene_count'] += 1
+ filter_stats["gene_count"] += 1
continue
-
+
# FILTER 2: Mature peptide count
if min_mature_peptide_count is not None or max_mature_peptide_count is not None:
- mat_count = gb_data.get('mature_peptide_count', 0)
-
+ mat_count = gb_data.get("mature_peptide_count", 0)
+
if min_mature_peptide_count is not None and mat_count < min_mature_peptide_count:
- logger.debug("Skipping %s: mature peptide count %d < min %d", accession, mat_count, min_mature_peptide_count)
- filter_stats['mature_peptide_count'] += 1
+ logger.debug(
+ "Skipping %s: mature peptide count %d < min %d", accession, mat_count, min_mature_peptide_count
+ )
+ filter_stats["mature_peptide_count"] += 1
continue
-
+
if max_mature_peptide_count is not None and mat_count > max_mature_peptide_count:
- logger.debug("Skipping %s: mature peptide count %d > max %d", accession, mat_count, max_mature_peptide_count)
- filter_stats['mature_peptide_count'] += 1
+ logger.debug(
+ "Skipping %s: mature peptide count %d > max %d", accession, mat_count, max_mature_peptide_count
+ )
+ filter_stats["mature_peptide_count"] += 1
continue
-
+
# FILTER 3: Proviral status
if provirus is not None:
- is_proviral = gb_data.get('proviral', False)
-
+ is_proviral = gb_data.get("proviral", False)
+
if provirus and not is_proviral:
logger.debug("Skipping %s: not proviral (required)", accession)
- filter_stats['provirus'] += 1
+ filter_stats["provirus"] += 1
continue
-
+
if provirus is False and is_proviral:
logger.debug("Skipping %s: is proviral (excluded)", accession)
- filter_stats['provirus'] += 1
+ filter_stats["provirus"] += 1
continue
-
+
# FILTER 4: Genotype (from serotype or note field)
if genotype is not None:
- record_genotype = (gb_data.get('genotype', '') or '').lower().strip()
-
+ record_genotype = (gb_data.get("genotype", "") or "").lower().strip()
+
if not record_genotype:
logger.debug("Skipping %s: missing genotype", accession)
- filter_stats['genotype'] += 1
+ filter_stats["genotype"] += 1
continue
-
+
# Check if any requested genotype matches (case-insensitive, partial match)
genotype_match = False
for g in genotype:
@@ -7991,17 +8441,17 @@ def filter_genbank_metadata(
if g_lower in record_genotype or record_genotype in g_lower:
genotype_match = True
break
-
+
if not genotype_match:
logger.debug("Skipping %s: genotype '%s' not in required %s", accession, record_genotype, genotype)
- filter_stats['genotype'] += 1
+ filter_stats["genotype"] += 1
continue
-
+
# FILTER 5: Has proteins (check product names)
if has_proteins is not None:
- products = gb_data.get('products', [])
+ products = gb_data.get("products", [])
products_lower = [p.lower() for p in products]
-
+
# Check that AT LEAST ONE required protein is present
any_protein_found = False
for protein in has_proteins:
@@ -8012,40 +8462,41 @@ def filter_genbank_metadata(
break
if any_protein_found:
break
-
+
if not any_protein_found:
- logger.debug("Skipping %s: none of required proteins %s found in %s",
- accession, has_proteins, products[:5])
- filter_stats['has_proteins'] += 1
+ logger.debug(
+ "Skipping %s: none of required proteins %s found in %s", accession, has_proteins, products[:5]
+ )
+ filter_stats["has_proteins"] += 1
continue
-
+
# FILTER 6: Molecule type (gen_mol_type)
if gen_mol_type is not None:
- mol_type = (gb_data.get('mol_type', '') or '').lower().strip()
-
+ mol_type = (gb_data.get("mol_type", "") or "").lower().strip()
+
if not mol_type:
logger.debug("Skipping %s: missing mol_type", accession)
- filter_stats['mol_type'] += 1
+ filter_stats["mol_type"] += 1
continue
-
+
gen_mol_type_lower = gen_mol_type.lower().strip()
-
+
# Define molecule type mappings for common aliases
# dsDNA/ssRNA etc are structural classifications while GenBank uses "genomic DNA/RNA"
mol_type_mappings = {
- 'dsdna': ['genomic dna', 'dna'],
- 'ssdna': ['genomic dna', 'dna'],
- 'dsrna': ['genomic rna', 'rna'],
- 'ssrna': ['genomic rna', 'rna', 'mrna', 'viral crna'],
- 'dna': ['genomic dna', 'dna'],
- 'rna': ['genomic rna', 'rna', 'mrna', 'viral crna'],
- 'genomic dna': ['genomic dna', 'dna'],
- 'genomic rna': ['genomic rna', 'rna'],
+ "dsdna": ["genomic dna", "dna"],
+ "ssdna": ["genomic dna", "dna"],
+ "dsrna": ["genomic rna", "rna"],
+ "ssrna": ["genomic rna", "rna", "mrna", "viral crna"],
+ "dna": ["genomic dna", "dna"],
+ "rna": ["genomic rna", "rna", "mrna", "viral crna"],
+ "genomic dna": ["genomic dna", "dna"],
+ "genomic rna": ["genomic rna", "rna"],
}
-
+
# Check if the filter matches directly or via mapping
mol_type_match = False
-
+
# First check direct/partial match
if gen_mol_type_lower in mol_type or mol_type in gen_mol_type_lower:
mol_type_match = True
@@ -8056,33 +8507,33 @@ def filter_genbank_metadata(
if mapped_val in mol_type:
mol_type_match = True
break
-
+
if not mol_type_match:
logger.debug("Skipping %s: mol_type '%s' != required '%s'", accession, mol_type, gen_mol_type)
- filter_stats['mol_type'] += 1
+ filter_stats["mol_type"] += 1
continue
-
+
# FILTER 7: Environmental source (env_source)
if env_source is not None:
# Get isolation_source, note, and check host
- isolation_source = (gb_data.get('isolation_source', '') or '').lower()
- host = (gb_data.get('host', '') or '').lower()
- comment = (gb_data.get('comment', '') or '').lower()
- all_features = gb_data.get('all_features', {})
-
+ isolation_source = (gb_data.get("isolation_source", "") or "").lower()
+ host = (gb_data.get("host", "") or "").lower()
+ comment = (gb_data.get("comment", "") or "").lower()
+ all_features = gb_data.get("all_features", {})
+
# Get note from source feature
- source_feature = all_features.get('source', {})
+ source_feature = all_features.get("source", {})
if isinstance(source_feature, list):
source_feature = source_feature[0]
- note = (source_feature.get('note', '') or '').lower()
-
+ note = (source_feature.get("note", "") or "").lower()
+
# Combine all text to search
search_text = f"{isolation_source} {note} {comment}"
-
+
# If host is specified and is human, skip env_source filtering
# (env_source is for non-human/environmental samples)
- is_human_host = 'human' in host or 'homo sapiens' in host
-
+ is_human_host = "human" in host or "homo sapiens" in host
+
env_match = False
for env_term in env_source:
env_term_lower = env_term.lower().strip()
@@ -8096,21 +8547,20 @@ def filter_genbank_metadata(
break
if env_match:
break
-
+
# Only filter out if explicitly looking for environmental and not found
if not env_match and not is_human_host:
- logger.debug("Skipping %s: env_source '%s' not found in isolation_source/note",
- accession, env_source)
- filter_stats['env_source'] += 1
+ logger.debug("Skipping %s: env_source '%s' not found in isolation_source/note", accession, env_source)
+ filter_stats["env_source"] += 1
continue
-
+
# If we reach here, the record passed all filters
filtered_accessions.append(accession)
logger.debug("GenBank metadata %s passed all filters", accession)
-
+
# Log filtering summary
num_filtered = len(filtered_accessions)
-
+
if num_filtered == 0:
logger.info("=================================")
logger.info("GenBank filtering complete: 0 of %d records passed filters", total_sequences)
@@ -8126,14 +8576,14 @@ def filter_genbank_metadata(
logger.info("GenBank filtering complete:")
logger.info(" Total GenBank records: %d", total_sequences)
logger.info(" Records passing filters: %d", num_filtered)
-
+
total_excluded = sum(filter_stats.values())
if total_excluded > 0:
logger.info("Filter statistics (records excluded):")
for filter_name, count in filter_stats.items():
if count > 0:
logger.info(" %s: %d records", filter_name, count)
-
+
return filtered_accessions, filter_stats
@@ -8178,18 +8628,17 @@ def virus(
isolate=None,
genotype=None,
isolation_source=None,
- env_source=None,
+ env_source=None,
submitter_name=None,
submitter_institution=None,
- gen_mol_type=None,
+ gen_mol_type=None,
# assembly_completeness=None,
api_key=None,
baseline_metadata=None,
merge_results=True,
verbose=True,
- ):
- """
- Download a virus genome dataset from the NCBI Virus database (https://www.ncbi.nlm.nih.gov/labs/virus/).
+):
+ """Download a virus genome dataset from the NCBI Virus database (https://www.ncbi.nlm.nih.gov/labs/virus/).
This is the main function that orchestrates the entire virus data retrieval process,
now optimized to download sequences only after all metadata-based filtering:
@@ -8211,29 +8660,30 @@ def virus(
if api_key:
logger.info("Using NCBI API key for higher rate limits (10 req/sec)")
else:
- logger.info("No NCBI API key provided. Using default rate limit (3 req/sec). "
- "Set NCBI_API_KEY env var or pass --api_key for faster requests.")
+ logger.info(
+ "No NCBI API key provided. Using default rate limit (3 req/sec). "
+ "Set NCBI_API_KEY env var or pass --api_key for faster requests."
+ )
# Save the original logger level and set it based on verbose parameter
original_logger_level = logger.level
if not verbose:
logger.setLevel(logging.CRITICAL)
-
+
logger.info("Starting virus data retrieval process...")
-
+
# Capture the command line for summary
command_line = " ".join(sys.argv) if len(sys.argv) > 0 else "virus (called programmatically)"
-
+
# Track wall-clock runtime
_virus_start_time = time.time()
-
+
# Initialize variables for tracking results
total_api_records = 0
total_after_metadata_filter = 0
total_final_sequences = 0
output_files_dict = {}
final_metadata_for_summary = []
- filtered_sequences = []
refseq_only = False
# Initialize filter stats for each stage (populated by filter functions)
@@ -8242,156 +8692,177 @@ def virus(
sequence_filter_stats = {}
total_after_genbank_filter = None
total_after_sequence_filter = None
-
+
# Initialize failed commands tracker for tracking all types of failures
failed_commands = {
- 'api_timeout': None,
- 'empty_response': None,
- 'sequence_batches': [],
- 'genbank_batches': [],
- 'api_batches': [],
- 'pagination_timeouts': [],
- 'pagination_errors': [],
- 'sequence_fetch': [],
+ "api_timeout": None,
+ "empty_response": None,
+ "sequence_batches": [],
+ "genbank_batches": [],
+ "api_batches": [],
+ "pagination_timeouts": [],
+ "pagination_errors": [],
+ "sequence_fetch": [],
}
-
+
# Track if GenBank metadata was successfully retrieved
genbank_success = False
genbank_error_msg = None
if download_all_accessions:
logger.info("ATTENTION: Download all accessions mode is active.")
- logger.info("This will download ALL virus accessions from NCBI, which can be a very large dataset and take a long time.")
- virus = NCBI_ALL_VIRUSES_TAXID # NCBI taxonomy ID for all Viruses
+ logger.info(
+ "This will download ALL virus accessions from NCBI, which can be a very large dataset and take a long time."
+ )
+ virus = NCBI_ALL_VIRUSES_TAXID # NCBI taxonomy ID for all Viruses
is_accession = False
- logger.info("Overriding virus query and accession tag to fetch all viruses using taxon ID: %s. Filters remain unchanged.", virus)
+ logger.info(
+ "Overriding virus query and accession tag to fetch all viruses using taxon ID: %s. Filters remain unchanged.",
+ virus,
+ )
- logger.info("Query parameters: virus='%s', is_accession=%s, outfolder='%s'",
- virus, is_accession, outfolder)
- logger.debug("Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, source_db(%s), keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, submitter_name=%s, submitter_institution=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, segment=%s, vaccine_strain=%s, lineage=%s, provirus=%s, isolate=%s, genotype=%s, isolation_source=%s, env_source=%s, gen_mol_type=%s, genbank_metadata=%s, genbank_batch_size=%s",
- host, min_seq_length, max_seq_length, min_gene_count, max_gene_count, nuc_completeness, annotated, source_database, keep_temp, lab_passaged, geographic_location, submitter_country, submitter_name, submitter_institution, min_collection_date, max_collection_date, min_release_date, max_release_date, min_protein_count, max_protein_count, min_mature_peptide_count, max_mature_peptide_count, max_ambiguous_chars, has_proteins, proteins_complete, segment, vaccine_strain, lineage, provirus, isolate, genotype, isolation_source, env_source, gen_mol_type, genbank_metadata, genbank_batch_size)
+ logger.info("Query parameters: virus='%s', is_accession=%s, outfolder='%s'", virus, is_accession, outfolder)
+ logger.debug(
+ "Applied filters: host=%s, seq_length=(%s-%s), gene_count=(%s-%s), completeness=%s, annotated=%s, source_db(%s), keep_temp=%s, lab_passaged=%s, geographic_location=%s, submitter_country=%s, submitter_name=%s, submitter_institution=%s, collection_date=(%s-%s), release_date=(%s-%s), protein_count=(%s-%s), mature_peptide_count=(%s-%s), max_ambiguous=%s, has_proteins=%s, proteins_complete=%s, segment=%s, vaccine_strain=%s, lineage=%s, provirus=%s, isolate=%s, genotype=%s, isolation_source=%s, env_source=%s, gen_mol_type=%s, genbank_metadata=%s, genbank_batch_size=%s",
+ host,
+ min_seq_length,
+ max_seq_length,
+ min_gene_count,
+ max_gene_count,
+ nuc_completeness,
+ annotated,
+ source_database,
+ keep_temp,
+ lab_passaged,
+ geographic_location,
+ submitter_country,
+ submitter_name,
+ submitter_institution,
+ min_collection_date,
+ max_collection_date,
+ min_release_date,
+ max_release_date,
+ min_protein_count,
+ max_protein_count,
+ min_mature_peptide_count,
+ max_mature_peptide_count,
+ max_ambiguous_chars,
+ has_proteins,
+ proteins_complete,
+ segment,
+ vaccine_strain,
+ lineage,
+ provirus,
+ isolate,
+ genotype,
+ isolation_source,
+ env_source,
+ gen_mol_type,
+ genbank_metadata,
+ genbank_batch_size,
+ )
# SECTION 1: INPUT VALIDATION AND OUTPUT DIRECTORY SETUP
# Validate and normalize input arguments before proceeding
logger.info("=" * 60)
logger.info("STEP 1: VALIDATING INPUT ARGUMENTS AND OUTPUT DIRECTORY SETUP...")
logger.info("=" * 60)
-
+
# Validate virus parameter
if virus is None or (isinstance(virus, str) and virus.strip() == ""):
- raise ValueError(
- "Argument 'virus' must be a non-empty string (virus name, taxon ID, or accession number)."
- )
-
+ raise ValueError("Argument 'virus' must be a non-empty string (virus name, taxon ID, or accession number).")
+
# Validate that both host and env_source filters are not used together, as they are mutually exclusive
if host is not None and env_source is not None:
- raise ValueError("Both 'host' and 'env_source' filters are specified. If there is a host, there is no environmental source. Use only one of these filters for results.")
-
+ raise ValueError(
+ "Both 'host' and 'env_source' filters are specified. If there is a host, there is no environmental source. Use only one of these filters for results."
+ )
+
# Normalize host parameter: convert "human" to "Homo sapiens" for NCBI API compatibility
if host is not None and host.strip().lower() == "human":
logger.debug("Normalizing host 'human' to 'Homo sapiens' for NCBI API compatibility")
host = "Homo sapiens"
-
+
# Validate nucleotide completeness argument
if nuc_completeness is not None:
nuc_completeness = nuc_completeness.lower() # Normalize to lowercase
if nuc_completeness not in ["partial", "complete"]:
- raise ValueError(
- "Argument 'nuc_completeness' must be 'partial', 'complete', or None."
- )
+ raise ValueError("Argument 'nuc_completeness' must be 'partial', 'complete', or None.")
logger.debug("Nucleotide completeness filter set to: %s", nuc_completeness)
# Validate source database argument
if source_database is not None:
source_database = source_database.lower() # Normalize to lowercase
if source_database not in ["refseq", "genbank"]:
- raise ValueError(
- "Argument 'source_database' must be 'refseq', 'genbank', or None."
- )
+ raise ValueError("Argument 'source_database' must be 'refseq', 'genbank', or None.")
elif source_database == "refseq":
refseq_only = True
logger.debug("Source database filter set to RefSeq only")
logger.debug("Source database filter set to: %s", source_database)
-
+
# Validate boolean arguments with proper type checking
if annotated is not None and not isinstance(annotated, bool):
- raise TypeError(
- "Argument 'annotated' must be a boolean (True or False) or None."
- )
-
+ raise TypeError("Argument 'annotated' must be a boolean (True or False) or None.")
+
if lab_passaged is not None and not isinstance(lab_passaged, bool):
- raise TypeError(
- "Argument 'lab_passaged' must be a boolean (True or False) or None."
- )
-
+ raise TypeError("Argument 'lab_passaged' must be a boolean (True or False) or None.")
+
if proteins_complete is not None and not isinstance(proteins_complete, bool):
- raise TypeError(
- "Argument 'proteins_complete' must be a boolean (True or False)."
- )
+ raise TypeError("Argument 'proteins_complete' must be a boolean (True or False).")
# if refseq_only is not None and not isinstance(refseq_only, bool):
# raise TypeError(
# "Argument 'refseq_only' must be a boolean (True or False)."
- # )
-
+ # )
+
if keep_temp is not None and not isinstance(keep_temp, bool):
- raise TypeError(
- "Argument 'keep_temp' must be a boolean (True or False)."
- )
+ raise TypeError("Argument 'keep_temp' must be a boolean (True or False).")
if is_accession is not None and not isinstance(is_accession, bool):
- raise TypeError(
- "Argument 'is_accession' must be a boolean (True or False)."
- )
-
+ raise TypeError("Argument 'is_accession' must be a boolean (True or False).")
+
if genbank_metadata is not None and not isinstance(genbank_metadata, bool):
- raise TypeError(
- "Argument 'genbank_metadata' must be a boolean (True or False)."
- )
+ raise TypeError("Argument 'genbank_metadata' must be a boolean (True or False).")
if vaccine_strain is not None and not isinstance(vaccine_strain, bool):
- raise TypeError(
- "Argument 'vaccine_strain' must be a boolean (True or False) or None."
- )
+ raise TypeError("Argument 'vaccine_strain' must be a boolean (True or False) or None.")
if provirus is not None and not isinstance(provirus, bool):
- raise TypeError(
- "Argument 'provirus' must be a boolean (True or False) or None."
- )
-
+ raise TypeError("Argument 'provirus' must be a boolean (True or False) or None.")
+
if genbank_batch_size is not None:
if not isinstance(genbank_batch_size, int) or genbank_batch_size <= 0:
- raise ValueError(
- "Argument 'genbank_batch_size' must be a positive integer."
- )
+ raise ValueError("Argument 'genbank_batch_size' must be a positive integer.")
if genbank_batch_size > GENBANK_MAX_BATCH_SIZE_WARNING:
- logger.warning("Large genbank_batch_size (%d) may cause API timeouts. Consider using smaller batches.", genbank_batch_size)
-
+ logger.warning(
+ "Large genbank_batch_size (%d) may cause API timeouts. Consider using smaller batches.",
+ genbank_batch_size,
+ )
+
if genbank_metadata:
logger.info("GenBank metadata retrieval enabled (batch_size=%d)", genbank_batch_size)
else:
# Check if any GenBank-dependent filters are specified
genbank_dependent_filters = {
- 'provirus': provirus,
- 'genotype': genotype,
- 'has_proteins': has_proteins,
- 'gen_mol_type': gen_mol_type,
- 'env_source': env_source,
- 'min_gene_count': min_gene_count,
- 'max_gene_count': max_gene_count,
- 'min_mature_peptide_count': min_mature_peptide_count,
- 'max_mature_peptide_count': max_mature_peptide_count,
+ "provirus": provirus,
+ "genotype": genotype,
+ "has_proteins": has_proteins,
+ "gen_mol_type": gen_mol_type,
+ "env_source": env_source,
+ "min_gene_count": min_gene_count,
+ "max_gene_count": max_gene_count,
+ "min_mature_peptide_count": min_mature_peptide_count,
+ "max_mature_peptide_count": max_mature_peptide_count,
}
active_genbank_filters = [k for k, v in genbank_dependent_filters.items() if v is not None]
-
+
if active_genbank_filters:
- logger.info("GenBank-dependent filters detected: %s", ', '.join(active_genbank_filters))
+ logger.info("GenBank-dependent filters detected: %s", ", ".join(active_genbank_filters))
logger.info("Automatically enabling GenBank metadata retrieval (-g flag)")
genbank_metadata = True
else:
logger.debug("GenBank metadata retrieval disabled")
-
# Convert integer virus identifiers to strings for API compatibility
if isinstance(virus, int):
virus = str(virus)
@@ -8440,20 +8911,23 @@ def virus(
##############
# Prepare output directory and all used file paths
- virus_clean = virus.replace(' ', '_').replace('/', '_').replace('-', '_')
+ virus_clean = virus.replace(" ", "_").replace("/", "_").replace("-", "_")
# Create and prepare output directory structure
if outfolder is None:
currentfolder = os.getcwd()
- outfolder = os.path.join(currentfolder, "gget_virus_output" , f"{virus_clean}_{timestamp}")
- logger.info("No output folder specified, creating a subdirectory in current directory named 'gget_virus_output' and placing results in a folder named: %s", outfolder)
+ outfolder = os.path.join(currentfolder, "gget_virus_output", f"{virus_clean}_{timestamp}")
+ logger.info(
+ "No output folder specified, creating a subdirectory in current directory named 'gget_virus_output' and placing results in a folder named: %s",
+ outfolder,
+ )
else:
logger.info("Using specified output folder: %s", outfolder)
-
+
# Ensure output folder exists
os.makedirs(outfolder, exist_ok=True)
logger.debug("Output folder ready: %s", outfolder)
-
+
# Create temporary directory for intermediate processing
# This will be cleaned up at the end regardless of success or failure
temp_dir = os.path.join(outfolder, f"tmp_{timestamp}_{random_suffix}")
@@ -8465,13 +8939,13 @@ def virus(
genbank_full_xml_path = os.path.join(outfolder, f"{virus_clean}_genbank_metadata_full.xml")
genbank_full_csv_path = os.path.join(outfolder, f"{virus_clean}_genbank_metadata_full.csv")
output_api_metadata_jsonl = os.path.join(outfolder, f"{virus_clean}_api_metadata.jsonl")
-
+
# SECTION 1b: BASELINE METADATA LOADING FOR DEDUPLICATION
# If a baseline file is provided, load accessions for deduplication
baseline_accessions = None
baseline_skipped_count = 0
partial_metadata_file = None # Will be set if API fails and partial metadata is saved
-
+
if baseline_metadata is not None:
logger.info("=" * 60)
logger.info("STEP 1b: LOADING BASELINE METADATA FOR DEDUPLICATION")
@@ -8494,7 +8968,7 @@ def virus(
# SECTION 2: CHECKING FOR CACHED DATA PROCESSING
logger.info("=" * 60)
- logger.info("STEP 2: CHECKING FOR SARS-CoV-2 AND INFLUENZA A QUERIES TO APPLY OPTIMIZED CACHED PATHWAY")
+ logger.info("STEP 2: CHECKING FOR SARS-CoV-2 AND INFLUENZA A QUERIES TO APPLY OPTIMIZED CACHED PATHWAY")
logger.info("=" * 60)
# Initialize variables to track cached download results
cached_fasta_file = None # Path to cached FASTA file (sequences streamed on-demand)
@@ -8506,22 +8980,22 @@ def virus(
# For SARS-CoV-2 queries, use cached data packages with hierarchical fallback
if _skip_cache:
logger.info("⏭️ SKIPPING CACHED PATHWAY - Using API method directly (via _skip_cache flag)")
- elif (is_sars_cov2 or is_sars_cov2_query(virus, is_accession)):
+ elif is_sars_cov2 or is_sars_cov2_query(virus, is_accession):
logger.info("DETECTED SARS-CoV-2 QUERY - USING CACHED DATA PACKAGE PATHWAY")
logger.info("SARS-CoV-2 queries will use NCBI's optimized cached data packages")
logger.info("with hierarchical fallback from specific to general cached files.")
-
+
# Use the download_sars_cov2_optimized function which handles fallback strategies internally
params = {
- 'host': host,
- 'complete_only': (nuc_completeness == "complete"),
- 'annotated': annotated,
- 'outdir': outfolder,
- 'lineage': lineage,
- 'accession': virus,
- 'use_accession': is_accession
+ "host": host,
+ "complete_only": (nuc_completeness == "complete"),
+ "annotated": annotated,
+ "outdir": outfolder,
+ "lineage": lineage,
+ "accession": virus,
+ "use_accession": is_accession,
}
-
+
try:
download_result = download_sars_cov2_optimized(**params)
# Unpack tuple: (zip_path, applied_filters, missing_filters)
@@ -8530,20 +9004,22 @@ def virus(
missing_filters = download_result[2]
cached_zip_file = zip_file # Track for cleanup
datasets_version = _get_datasets_version()
-
+
cached_fasta_file, cached_metadata_dict, used_cached_download = process_cached_download(
zip_file, virus_type="SARS-CoV-2"
)
if used_cached_download:
- logger.info("Cached download completed. Server-side filters (host, complete_only, annotated, lineage) applied.")
+ logger.info(
+ "Cached download completed. Server-side filters (host, complete_only, annotated, lineage) applied."
+ )
logger.info("All other filters will be applied in the unified filtering pipeline.")
logger.debug("Applied filters: %s", applied_filters)
logger.debug("Missing filters (to apply in Step 3b): %s", missing_filters)
- except Exception as cache_error:
+ except Exception as cache_error: # noqa: BLE001
logger.warning("SARS-CoV-2 cached download failed after all strategies: %s", cache_error)
logger.info("🔄 Retrying with normal API download method (_skip_cache=True)...")
# Retry the entire virus() call with _skip_cache=True to use the normal API pathway
- _virus_func = globals()['virus']
+ _virus_func = globals()["virus"]
return _virus_func(
virus=virus,
is_accession=is_accession,
@@ -8593,26 +9069,25 @@ def virus(
else:
logger.info("No SARS-CoV-2 query detected.")
-
# SECTION 2b: ALPHAINFLUENZA CACHED DATA PROCESSING
# For Alphainfluenza queries, use cached data packages with hierarchical fallback
if _skip_cache:
logger.info("⏭️ SKIPPING CACHED PATHWAY - Using API method directly (via _skip_cache flag)")
- elif (is_alphainfluenza or is_alphainfluenza_query(virus, is_accession)):
+ elif is_alphainfluenza or is_alphainfluenza_query(virus, is_accession):
logger.info("DETECTED ALPHAINFLUENZA QUERY - USING CACHED DATA PACKAGES")
logger.info("Alphainfluenza queries will use NCBI's optimized cached data packages")
logger.info("with hierarchical fallback from specific to general cached files.")
-
+
# Use the download_alphainfluenza_optimized function which handles fallback strategies internally
params = {
- 'host': host,
- 'complete_only': (nuc_completeness == "complete"),
- 'annotated': annotated,
- 'outdir': outfolder,
- 'accession': virus,
- 'use_accession': is_accession
+ "host": host,
+ "complete_only": (nuc_completeness == "complete"),
+ "annotated": annotated,
+ "outdir": outfolder,
+ "accession": virus,
+ "use_accession": is_accession,
}
-
+
try:
download_result = download_alphainfluenza_optimized(**params)
# Unpack tuple: (zip_path, applied_filters, missing_filters)
@@ -8621,7 +9096,7 @@ def virus(
missing_filters = download_result[2]
cached_zip_file = zip_file # Track for cleanup
datasets_version = _get_datasets_version()
-
+
cached_fasta_file, cached_metadata_dict, used_cached_download = process_cached_download(
zip_file, virus_type="Alphainfluenza"
)
@@ -8630,13 +9105,13 @@ def virus(
logger.info("All other filters will be applied in the unified filtering pipeline.")
logger.debug("Applied filters: %s", applied_filters)
logger.debug("Missing filters (to apply in Step 3b): %s", missing_filters)
- except Exception as cache_error:
+ except Exception as cache_error: # noqa: BLE001
logger.warning("Alphainfluenza cached download failed after all strategies: %s", cache_error)
logger.info("🔄 Retrying with normal API download method (_skip_cache=True)...")
# Retry the entire virus() call with _skip_cache=True to use the normal API pathway
# Note: We use globals()['virus'] because the local parameter 'virus' (a string)
# shadows the function name in this scope.
- _virus_func = globals()['virus']
+ _virus_func = globals()["virus"]
return _virus_func(
virus=virus,
is_accession=is_accession,
@@ -8685,24 +9160,24 @@ def virus(
)
else:
logger.info("No Alphainfluenza query detected.")
-
+
# Initialize deferred_filters for tracking filters that couldn't be applied server-side
deferred_filters = None
try:
- # SECTION 3: METADATA RETRIEVAL AND FILTERING
+ # SECTION 3: METADATA RETRIEVAL AND FILTERING
# Check if we're using cached download data
if used_cached_download and cached_metadata_dict:
logger.info("=" * 60)
logger.info("STEP 3: Applying metadata filters for cached download")
logger.info("=" * 60)
logger.info("Using metadata from cached download (skipping API metadata fetch)")
-
+
# cached_metadata_dict is now a file path (not a dict)
# Use streaming filter to load only records passing filters into memory
if isinstance(cached_metadata_dict, str) and os.path.isfile(cached_metadata_dict):
logger.info("Memory-efficient path: streaming cached metadata JSONL with on-the-fly filtering")
-
+
metadata_dict, total_api_records, cache_filter_stats = _stream_filter_cached_metadata_from_jsonl(
cached_metadata_dict,
host=host,
@@ -8714,30 +9189,31 @@ def virus(
min_release_date=min_release_date,
applied_strategy_filters=applied_filters,
)
- logger.info("Loaded %d records passing filters (from %d total in cache)",
- len(metadata_dict), total_api_records)
-
+ logger.info(
+ "Loaded %d records passing filters (from %d total in cache)", len(metadata_dict), total_api_records
+ )
+
# Copy the cached JSONL to the output API metadata location for consistency
try:
shutil.copy(cached_metadata_dict, output_api_metadata_jsonl)
logger.info("✅ Saved cached metadata JSONL: %s", output_api_metadata_jsonl)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to copy cached metadata JSONL: %s", e)
else:
# Fallback: cached_metadata_dict is already a dict (legacy path)
metadata_dict = cached_metadata_dict
total_api_records = len(metadata_dict)
logger.info("Loaded %d metadata records from cached download", total_api_records)
-
+
# Save metadata JSONL for consistency
try:
with open(output_api_metadata_jsonl, "w", encoding="utf-8") as f:
for md in metadata_dict.values():
f.write(json.dumps(md) + "\n")
logger.info("✅ Saved cached metadata JSONL: %s", output_api_metadata_jsonl)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to save cached metadata JSONL: %s", e)
-
+
# Apply post-cached-download filters (legacy in-memory path)
logger.debug("Using applied_filters from cached strategy: %s", applied_filters)
filtered_accessions_step3, filtered_metadata_step3 = filter_cached_metadata_for_unused_filters(
@@ -8751,44 +9227,58 @@ def virus(
min_release_date=min_release_date,
applied_strategy_filters=applied_filters,
)
- metadata_dict = {acc: md for acc, md in zip(filtered_accessions_step3, filtered_metadata_step3)}
-
+ metadata_dict = dict(zip(filtered_accessions_step3, filtered_metadata_step3, strict=False))
+
logger.info("After post-cached-download filtering: %d records remain", len(metadata_dict))
-
+
# Baseline deduplication for cached path
baseline_skipped_count = 0
if baseline_accessions is not None and metadata_dict:
- logger.info("Deduplicating cached metadata against baseline (%d accessions)...", len(baseline_accessions))
+ logger.info(
+ "Deduplicating cached metadata against baseline (%d accessions)...", len(baseline_accessions)
+ )
metadata_dict, baseline_skipped_count = _deduplicate_metadata_against_baseline(
metadata_dict, baseline_accessions
)
- logger.info("After baseline deduplication: %d new records (skipped %d)",
- len(metadata_dict), baseline_skipped_count)
-
+ logger.info(
+ "After baseline deduplication: %d new records (skipped %d)",
+ len(metadata_dict),
+ baseline_skipped_count,
+ )
+
else:
# Regular API metadata fetch
logger.info("=" * 60)
logger.info("STEP 3: Fetching virus metadata from NCBI API")
logger.info("=" * 60)
api_annotated_filter = annotated if annotated is True else None
- api_complete_filter = True if nuc_completeness=="complete" else False
+ api_complete_filter = True if nuc_completeness == "complete" else False
+
+ logger.debug(
+ "Applying server-side filters: host=%s, geo_location=%s, annotated=%s, complete_only=%s, min_release_date=%s, refseq_only=%s",
+ host,
+ geographic_location,
+ annotated,
+ api_complete_filter,
+ min_release_date,
+ refseq_only,
+ )
- logger.debug("Applying server-side filters: host=%s, geo_location=%s, annotated=%s, complete_only=%s, min_release_date=%s, refseq_only=%s", host, geographic_location, annotated, api_complete_filter, min_release_date, refseq_only)
-
# Track deferred filters that couldn't be applied server-side
deferred_filters = None
-
+
try:
# Check if this is a multi-accession query (list or file)
use_batched_fetch = False
if is_accession:
parsed_accessions = _parse_accession_input(virus)
- if parsed_accessions['type'] in ('list', 'file'):
+ if parsed_accessions["type"] in ("list", "file"):
use_batched_fetch = True
- accession_list = parsed_accessions['accessions']
- logger.info("Detected %d accessions from %s input",
- len(accession_list), parsed_accessions['type'])
-
+ accession_list = parsed_accessions["accessions"]
+ logger.info(
+ "Detected %d accessions from %s input", len(accession_list), parsed_accessions["type"]
+ )
+
if use_batched_fetch:
# Multiple accessions - use batched fetching
api_result = _fetch_metadata_for_accession_list(
@@ -8849,7 +9339,7 @@ def virus(
api_reports, deferred_filters = api_result
else:
api_reports = api_result
-
+
# Log deferred filters if any
if deferred_filters:
logger.info("=" * 60)
@@ -8863,39 +9353,42 @@ def virus(
# Ensure output folder exists for summary file
os.makedirs(outfolder, exist_ok=True)
logger.debug("Ensured output folder exists for error summary: %s", outfolder)
-
+
# Save partial metadata if any was collected before the failure
# Check for streaming temp file from fetch_virus_metadata
partial_metadata_dict = {}
temp_metadata_glob = os.path.join(temp_dir, "gget_metadata_*.jsonl")
import glob as _glob
+
temp_metadata_files = _glob.glob(temp_metadata_glob)
for tmf in temp_metadata_files:
try:
- with open(tmf, 'r', encoding='utf-8') as f:
+ with open(tmf, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line:
try:
record = json.loads(line)
- acc = record.get('accession', {})
+ acc = record.get("accession", {})
if isinstance(acc, dict):
- acc = acc.get('accession', '')
+ acc = acc.get("accession", "")
if acc:
partial_metadata_dict[str(acc)] = record
except json.JSONDecodeError:
continue
- except Exception:
+ except Exception: # noqa: BLE001
continue
-
+
if partial_metadata_dict:
# Convert raw API reports to internal format
try:
partial_internal = load_metadata_from_api_reports(list(partial_metadata_dict.values()))
- except Exception:
- partial_internal = {acc: {'accession': acc} for acc in partial_metadata_dict}
-
- partial_metadata_file = _save_partial_metadata(partial_internal, outfolder, virus_clean, reason="api_failure")
+ except Exception: # noqa: BLE001
+ partial_internal = {acc: {"accession": acc} for acc in partial_metadata_dict}
+
+ partial_metadata_file = _save_partial_metadata(
+ partial_internal, outfolder, virus_clean, reason="api_failure"
+ )
if partial_metadata_file:
logger.info("=" * 60)
logger.info("💾 PARTIAL METADATA SAVED FOR RECOVERY")
@@ -8903,10 +9396,14 @@ def virus(
logger.info(" Records: %d", len(partial_internal))
logger.info("")
logger.info(" Recovery command:")
- logger.info(" gget virus %s --baseline %s --merge-results -o %s",
- virus, partial_metadata_file, outfolder)
+ logger.info(
+ " gget virus %s --baseline %s --merge-results -o %s",
+ virus,
+ partial_metadata_file,
+ outfolder,
+ )
logger.info("=" * 60)
-
+
# Save a summary file documenting the failure, then exit gracefully
logger.error("Failed to fetch virus metadata from NCBI API")
save_command_summary(
@@ -8922,7 +9419,9 @@ def virus(
error_message=str(e),
failed_commands=failed_commands,
partial_metadata_file=partial_metadata_file if partial_metadata_dict else None,
- recovery_command=f"gget virus {virus} --baseline {partial_metadata_file} --merge-results -o {outfolder}" if partial_metadata_dict and partial_metadata_file else None,
+ recovery_command=f"gget virus {virus} --baseline {partial_metadata_file} --merge-results -o {outfolder}"
+ if partial_metadata_dict and partial_metadata_file
+ else None,
)
return None
@@ -8941,7 +9440,7 @@ def virus(
datasets_version=datasets_version,
success=True,
error_message="No virus records found matching the specified criteria (API returned 0 records)",
- failed_commands=failed_commands
+ failed_commands=failed_commands,
)
return
@@ -8951,7 +9450,7 @@ def virus(
# - A list of report dicts for smaller/accession-based queries
_log_memory_usage("after API fetch")
logger.debug("Converting API metadata to internal format...")
-
+
if isinstance(api_reports, str) and os.path.isfile(api_reports):
# Stream from temp JSONL file - avoids loading raw API reports into RAM
logger.info("Loading metadata from streamed temp file (memory-efficient path)...")
@@ -8972,7 +9471,7 @@ def virus(
datasets_version=datasets_version,
success=True,
error_message="No virus records found matching the specified criteria (API returned 0 records)",
- failed_commands=failed_commands
+ failed_commands=failed_commands,
)
return
else:
@@ -8981,7 +9480,7 @@ def virus(
metadata_dict = load_metadata_from_api_reports(api_reports)
# Delete api_reports after conversion - no longer needed
del api_reports
-
+
_force_garbage_collection("after api_reports conversion")
_log_memory_usage("after api_reports cleanup")
@@ -8992,40 +9491,44 @@ def virus(
for md in metadata_dict.values():
f.write(json.dumps(md) + "\n")
logger.info("✅ Saved API metadata JSONL: %s", output_api_metadata_jsonl)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to save API metadata JSONL: %s", e)
- # SECTION 3b: BASELINE DEDUPLICATION
+ # SECTION 3b: BASELINE DEDUPLICATION
# If baseline accessions were loaded, remove them from the metadata
baseline_skipped_count = 0
if baseline_accessions is not None and metadata_dict:
logger.info("=" * 60)
logger.info("STEP 3b: DEDUPLICATING AGAINST BASELINE")
logger.info("=" * 60)
- logger.info("API returned %d records. Comparing against %d baseline accessions...",
- len(metadata_dict), len(baseline_accessions))
-
+ logger.info(
+ "API returned %d records. Comparing against %d baseline accessions...",
+ len(metadata_dict),
+ len(baseline_accessions),
+ )
+
metadata_dict, baseline_skipped_count = _deduplicate_metadata_against_baseline(
metadata_dict, baseline_accessions
)
-
+
logger.info("Deduplication complete:")
logger.info(" - Total from API: %d", total_api_records)
logger.info(" - Already in baseline (skipped): %d", baseline_skipped_count)
logger.info(" - New accessions to process: %d", len(metadata_dict))
-
+
if not metadata_dict:
logger.warning("All API records already exist in the baseline file.")
logger.info("No new sequences to download.")
-
+
if merge_results and baseline_metadata:
# Copy baseline to output as the merged result
merged_output = os.path.join(outfolder, f"{virus_clean}_merged.csv")
import shutil as _shutil
+
_shutil.copy2(baseline_metadata, merged_output)
logger.info("✅ Baseline copied as merged output: %s", merged_output)
- output_files_dict['Merged Metadata'] = merged_output
-
+ output_files_dict["Merged Metadata"] = merged_output
+
save_command_summary(
outfolder=outfolder,
command_line=command_line,
@@ -9044,7 +9547,7 @@ def virus(
)
return
- # SECTION 4: METADATA-ONLY FILTERING
+ # SECTION 4: METADATA-ONLY FILTERING
logger.info("=" * 60)
logger.info("STEP 4: Applying metadata-only filters")
logger.info("=" * 60)
@@ -9054,12 +9557,14 @@ def virus(
"max_seq_length": max_seq_length,
# "min_gene_count": min_gene_count,
# "max_gene_count": max_gene_count,
- "nuc_completeness": nuc_completeness if nuc_completeness and nuc_completeness.lower() == 'partial' else None, #only for partial cases
+ "nuc_completeness": nuc_completeness
+ if nuc_completeness and nuc_completeness.lower() == "partial"
+ else None, # only for partial cases
"lab_passaged": lab_passaged,
"submitter_country": submitter_country,
"min_collection_date": min_collection_date,
"max_collection_date": max_collection_date,
- "source_database": source_database if source_database and source_database.lower() == 'genbank' else None,
+ "source_database": source_database if source_database and source_database.lower() == "genbank" else None,
"max_release_date": max_release_date,
# "min_mature_peptide_count": min_mature_peptide_count,
# "max_mature_peptide_count": max_mature_peptide_count,
@@ -9074,8 +9579,8 @@ def virus(
"isolate": isolate,
"isolation_source": isolation_source,
# Add deferred filters if server-side filter failed
- "geographic_location": deferred_filters.get('geographic_location') if deferred_filters else None,
- "host": deferred_filters.get('host') if deferred_filters else None,
+ "geographic_location": deferred_filters.get("geographic_location") if deferred_filters else None,
+ "host": deferred_filters.get("host") if deferred_filters else None,
}
all_metadata_filters_none = all(v is None for k, v in filters.items())
@@ -9091,7 +9596,9 @@ def virus(
filtered_metadata = list(metadata_dict.values())
logger.info("All %d sequences will proceed to sequence download", len(filtered_accessions))
else:
- filtered_accessions, filtered_metadata, metadata_filter_stats = filter_metadata_only(metadata_dict, **filters)
+ filtered_accessions, filtered_metadata, metadata_filter_stats = filter_metadata_only(
+ metadata_dict, **filters
+ )
if not filtered_accessions:
pass # No sequences passed metadata filters
total_after_metadata_filter = 0
@@ -9114,7 +9621,7 @@ def virus(
metadata_filter_stats=metadata_filter_stats,
)
return
-
+
total_after_metadata_filter = len(filtered_accessions)
_log_memory_usage("after metadata filtering")
@@ -9125,29 +9632,31 @@ def virus(
for md in filtered_metadata:
f.write(json.dumps(md) + "\n")
logger.info("✅ Saved filtered metadata JSONL: %s", output_metadata_jsonl)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to save filtered metadata JSONL: %s", e)
- # SECTION 4.5: EARLY GENBANK METADATA FETCHING AND FILTERING
- # This step fetches GenBank metadata and applies GenBank-dependent filters BEFORE downloading sequences, dramatically reducing the number of sequences to download.
-
+ # SECTION 4.5: EARLY GENBANK METADATA FETCHING AND FILTERING
+ # This step fetches GenBank metadata and applies GenBank-dependent filters BEFORE downloading sequences, dramatically reducing the number of sequences to download.
+
# Track GenBank data for later steps (saving to CSV)
genbank_data_prefetch = None
genbank_prefetch_done = False
-
+
# Check if any GenBank-dependent filters are specified
- genbank_dependent_filters_active = any([
- provirus is not None,
- genotype is not None,
- has_proteins is not None,
- gen_mol_type is not None,
- env_source is not None,
- min_gene_count is not None,
- max_gene_count is not None,
- min_mature_peptide_count is not None,
- max_mature_peptide_count is not None,
- ])
-
+ genbank_dependent_filters_active = any(
+ [
+ provirus is not None,
+ genotype is not None,
+ has_proteins is not None,
+ gen_mol_type is not None,
+ env_source is not None,
+ min_gene_count is not None,
+ max_gene_count is not None,
+ min_mature_peptide_count is not None,
+ max_mature_peptide_count is not None,
+ ]
+ )
+
if genbank_dependent_filters_active and filtered_accessions:
logger.info("=" * 60)
logger.info("STEP 4.5: Early GenBank metadata fetch and filtering (OPTIMIZATION)")
@@ -9156,36 +9665,36 @@ def virus(
logger.info("sequence download to reduce the number of sequences to download.")
logger.info("This can dramatically speed up processing for large datasets.")
_log_memory_usage("before early GenBank fetch")
-
+
try:
# Create temp paths for GenBank data
genbank_prefetch_xml = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.xml")
genbank_prefetch_csv = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.csv")
-
+
# ESearch pre-filtering
accessions_to_fetch = list(set(filtered_accessions))
-
+
# Get virus taxid from metadata for ESearch query
virus_taxid_for_esearch = None
if filtered_metadata:
- virus_taxid_for_esearch = filtered_metadata[0].get('virusTaxId')
+ virus_taxid_for_esearch = filtered_metadata[0].get("virusTaxId")
if not virus_taxid_for_esearch:
# Try to get from nested virus dict
- virus_dict = filtered_metadata[0].get('virus', {})
+ virus_dict = filtered_metadata[0].get("virus", {})
if isinstance(virus_dict, dict):
- virus_taxid_for_esearch = virus_dict.get('tax_id')
-
+ virus_taxid_for_esearch = virus_dict.get("tax_id")
+
# Only attempt ESearch pre-filtering if we have enough accessions
# to make it worthwhile (>1000) and we have GenBank-dependent filters
esearch_prefilter_threshold = 1000
if len(accessions_to_fetch) > esearch_prefilter_threshold and virus_taxid_for_esearch:
logger.info("=" * 60)
- logger.info("ESearch PRE-FILTERING: Narrowing %d accessions to likely candidates",
- len(accessions_to_fetch))
- logger.info("This avoids fetching full GenBank XML for all %d accessions",
- len(accessions_to_fetch))
+ logger.info(
+ "ESearch PRE-FILTERING: Narrowing %d accessions to likely candidates", len(accessions_to_fetch)
+ )
+ logger.info("This avoids fetching full GenBank XML for all %d accessions", len(accessions_to_fetch))
logger.info("=" * 60)
-
+
esearch_candidates = _esearch_prefilter_genbank(
virus_taxid=virus_taxid_for_esearch,
metadata_filtered_accessions=accessions_to_fetch,
@@ -9197,7 +9706,7 @@ def virus(
max_seq_length=max_seq_length,
api_key=api_key,
)
-
+
if esearch_candidates is not None:
if len(esearch_candidates) == 0:
# ESearch says NO accessions match
@@ -9206,7 +9715,7 @@ def virus(
filtered_accessions = []
filtered_metadata = []
genbank_data_prefetch = {}
-
+
save_command_summary(
outfolder=outfolder,
command_line=command_line,
@@ -9228,19 +9737,22 @@ def virus(
if not keep_temp and os.path.isdir(temp_dir):
try:
shutil.rmtree(temp_dir)
- except Exception:
+ except Exception: # noqa: BLE001
pass
return
else:
# Narrow to only the candidates
accessions_to_fetch = list(esearch_candidates)
- logger.info("✅ ESearch pre-filter successful: narrowed from %d to %d accessions",
- len(filtered_accessions), len(accessions_to_fetch))
+ logger.info(
+ "✅ ESearch pre-filter successful: narrowed from %d to %d accessions",
+ len(filtered_accessions),
+ len(accessions_to_fetch),
+ )
else:
logger.info("ESearch pre-filter: could not pre-filter, using full accession list")
-
+
logger.info("Fetching GenBank metadata for %d accessions...", len(accessions_to_fetch))
-
+
# Fetch GenBank metadata
genbank_data_prefetch, genbank_failed_log = fetch_genbank_metadata(
accessions=accessions_to_fetch,
@@ -9248,14 +9760,14 @@ def virus(
genbank_full_csv_path=genbank_prefetch_csv,
batch_size=genbank_batch_size,
delay=GENBANK_INTER_BATCH_DELAY,
- api_key=api_key
+ api_key=api_key,
)
-
+
if genbank_data_prefetch:
genbank_prefetch_done = True
logger.info("Successfully retrieved GenBank metadata for %d accessions", len(genbank_data_prefetch))
_log_memory_usage("after early GenBank fetch")
-
+
# Apply GenBank-dependent filters
logger.info("Applying GenBank-dependent filters...")
filters_genbank_early = {
@@ -9269,38 +9781,50 @@ def virus(
"min_mature_peptide_count": min_mature_peptide_count,
"max_mature_peptide_count": max_mature_peptide_count,
}
-
+
genbank_filtered_accessions_early, genbank_filter_stats = filter_genbank_metadata(
genbank_metadata=genbank_data_prefetch,
**filters_genbank_early,
)
-
+
if genbank_filtered_accessions_early:
# Calculate reduction
before_count = len(filtered_accessions)
after_count = len(genbank_filtered_accessions_early)
reduction_pct = ((before_count - after_count) / before_count) * 100 if before_count > 0 else 0
-
+
logger.info("=" * 60)
logger.info("🎯 EARLY GENBANK FILTERING RESULTS:")
logger.info(" Before GenBank filters: %d accessions", before_count)
logger.info(" After GenBank filters: %d accessions", after_count)
- logger.info(" Reduction: %.1f%% (%d accessions filtered out)", reduction_pct, before_count - after_count)
- logger.info(" This means we'll download %d sequences instead of %d!", after_count, before_count)
+ logger.info(
+ " Reduction: %.1f%% (%d accessions filtered out)",
+ reduction_pct,
+ before_count - after_count,
+ )
+ logger.info(
+ " This means we'll download %d sequences instead of %d!", after_count, before_count
+ )
logger.info("=" * 60)
-
+
# Update filtered_accessions and filtered_metadata
genbank_filtered_set_early = set(genbank_filtered_accessions_early)
filtered_accessions = [acc for acc in filtered_accessions if acc in genbank_filtered_set_early]
- filtered_metadata = [md for md in filtered_metadata if md['accession'] in genbank_filtered_set_early]
-
+ filtered_metadata = [
+ md for md in filtered_metadata if md["accession"] in genbank_filtered_set_early
+ ]
+
# Also filter genbank_data_prefetch to only include passing accessions
- genbank_data_prefetch = {acc: genbank_data_prefetch[acc] for acc in genbank_filtered_accessions_early if acc in genbank_data_prefetch}
-
+ genbank_data_prefetch = {
+ acc: genbank_data_prefetch[acc]
+ for acc in genbank_filtered_accessions_early
+ if acc in genbank_data_prefetch
+ }
+
# Update total_after_metadata_filter to reflect GenBank filtering
# Note: This is now total after BOTH metadata + GenBank filtering
total_after_genbank_filter = len(filtered_accessions)
-
+
_force_garbage_collection("after early GenBank filtering")
_log_memory_usage("after early GenBank filtering cleanup")
else:
@@ -9309,7 +9833,7 @@ def virus(
filtered_accessions = []
filtered_metadata = []
genbank_data_prefetch = {}
-
+
# Save command summary and return early
save_command_summary(
outfolder=outfolder,
@@ -9333,21 +9857,23 @@ def virus(
if not keep_temp and os.path.isdir(temp_dir):
try:
shutil.rmtree(temp_dir)
- except Exception:
+ except Exception: # noqa: BLE001
pass
return
else:
logger.warning("Failed to retrieve GenBank metadata. Proceeding without early filtering.")
logger.warning("GenBank-dependent filters will be applied after sequence download (slower).")
genbank_prefetch_done = False
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.warning("Early GenBank fetch failed: %s", e)
- logger.warning("Proceeding without early GenBank filtering. Filters will be applied after sequence download.")
+ logger.warning(
+ "Proceeding without early GenBank filtering. Filters will be applied after sequence download."
+ )
genbank_prefetch_done = False
genbank_data_prefetch = None
- # SECTION 5: DOWNLOAD SEQUENCES FOR FILTERED ACCESSIONS ONLY
+ # SECTION 5: DOWNLOAD SEQUENCES FOR FILTERED ACCESSIONS ONLY
logger.info("=" * 60)
logger.info("STEP 5: Downloading sequences for filtered accessions")
logger.info("=" * 60)
@@ -9356,24 +9882,24 @@ def virus(
if used_cached_download and cached_fasta_file:
logger.info("Using sequences from cached download (skipping sequence download)")
logger.info("Streaming and filtering cached FASTA file on-demand...")
-
+
# Create filtered accessions set for faster lookup
filtered_acc_set = set(filtered_accessions)
-
+
# Stream through cached FASTA and write only filtered sequences
# This avoids loading the entire file into RAM
fna_file = os.path.join(temp_dir, f"{virus_clean}_cached_sequences.fasta")
filtered_count = 0
-
+
try:
# Generator expression: yields only sequences in filtered_accessions
filtered_records = (r for r in FastaIO.parse(cached_fasta_file, "fasta") if r.id in filtered_acc_set)
FastaIO.write(filtered_records, fna_file, "fasta")
-
+
# Count the written sequences
for _ in FastaIO.parse(fna_file, "fasta"):
filtered_count += 1
-
+
logger.info("✅ Streamed and wrote %d filtered sequences from cached FASTA", filtered_count)
logger.info(" Output: %s", fna_file)
except Exception as e:
@@ -9381,17 +9907,19 @@ def virus(
raise RuntimeError(f"Failed to process cached FASTA file: {e}") from e
else:
# Regular sequence download
- fna_file = download_sequences_by_accessions(filtered_accessions, outdir=temp_dir, failed_commands=failed_commands, api_key=api_key)
+ fna_file = download_sequences_by_accessions(
+ filtered_accessions, outdir=temp_dir, failed_commands=failed_commands, api_key=api_key
+ )
if not os.path.exists(fna_file):
raise RuntimeError(f"❌ Download failed: FASTA file not found at {fna_file}")
logger.info("Downloaded FASTA file: %s (%.2f MB)", fna_file, os.path.getsize(fna_file) / 1024 / 1024)
- # SECTION 6: SEQUENCE-DEPENDENT FILTERING
+ # SECTION 6: SEQUENCE-DEPENDENT FILTERING
logger.info("=" * 60)
logger.info("STEP 6: Applying sequence-dependent filters and saving results")
logger.info("=" * 60)
- filters_seq={
+ filters_seq = {
"max_ambiguous_chars": max_ambiguous_chars,
# "has_proteins": has_proteins,
"proteins_complete": proteins_complete,
@@ -9420,7 +9948,7 @@ def virus(
)
# Clean up filtered_metadata_dict after use
del filtered_metadata_dict
-
+
# metadata_dict is no longer needed after this point
# filtered_metadata_final and filtered_accessions contain all we need
try:
@@ -9430,7 +9958,7 @@ def virus(
_force_garbage_collection("after sequence filtering")
_log_memory_usage("after sequence filtering cleanup")
- # SECTION 7: SAVING FINAL OUTPUT FILES
+ # SECTION 7: SAVING FINAL OUTPUT FILES
logger.info("=" * 60)
logger.info("STEP 7: Saving final output files")
logger.info("=" * 60)
@@ -9439,11 +9967,15 @@ def virus(
# FASTA was already written during Section 6 (streaming to output_fasta_file)
if os.path.exists(output_fasta_file):
- logger.info("✅ FASTA file saved: %s (%.2f MB)", output_fasta_file, os.path.getsize(output_fasta_file) / 1024 / 1024)
- output_files_dict['FASTA Sequences'] = output_fasta_file
+ logger.info(
+ "✅ FASTA file saved: %s (%.2f MB)",
+ output_fasta_file,
+ os.path.getsize(output_fasta_file) / 1024 / 1024,
+ )
+ output_files_dict["FASTA Sequences"] = output_fasta_file
else:
logger.error("❌ FASTA file not found at expected location: %s", output_fasta_file)
-
+
# Track final metadata for summary
final_metadata_for_summary = filtered_metadata_final
@@ -9452,8 +9984,12 @@ def virus(
with open(output_metadata_jsonl, "w", encoding="utf-8") as file:
for metadata in filtered_metadata_final:
file.write(json.dumps(metadata) + "\n")
- logger.info("✅ JSONL metadata file saved: %s (%.2f MB)", output_metadata_jsonl, os.path.getsize(output_metadata_jsonl) / 1024 / 1024)
- output_files_dict['JSONL Metadata'] = output_metadata_jsonl
+ logger.info(
+ "✅ JSONL metadata file saved: %s (%.2f MB)",
+ output_metadata_jsonl,
+ os.path.getsize(output_metadata_jsonl) / 1024 / 1024,
+ )
+ output_files_dict["JSONL Metadata"] = output_metadata_jsonl
except Exception as e:
logger.error("❌ Failed to save JSONL metadata file: %s", e)
raise
@@ -9462,20 +9998,24 @@ def virus(
try:
save_metadata_to_csv(filtered_metadata_final, protein_headers, output_metadata_csv)
if os.path.exists(output_metadata_csv):
- logger.info("✅ CSV metadata file saved: %s (%.2f MB)", output_metadata_csv, os.path.getsize(output_metadata_csv) / 1024 / 1024)
- output_files_dict['CSV Metadata'] = output_metadata_csv
+ logger.info(
+ "✅ CSV metadata file saved: %s (%.2f MB)",
+ output_metadata_csv,
+ os.path.getsize(output_metadata_csv) / 1024 / 1024,
+ )
+ output_files_dict["CSV Metadata"] = output_metadata_csv
else:
logger.error("❌ Failed to create CSV file: %s", output_metadata_csv)
except Exception as e:
logger.error("❌ Failed to save CSV metadata file: %s", e)
raise
-
+
# SECTION 7b: BASELINE MERGE/NO-MERGE OUTPUT
if baseline_accessions is not None and baseline_metadata:
logger.info("=" * 60)
logger.info("STEP 7b: Baseline merge/split output")
logger.info("=" * 60)
-
+
if merge_results:
# Merge new results with baseline into a single file
merged_csv_path = os.path.join(outfolder, f"{virus_clean}_merged.csv")
@@ -9483,7 +10023,7 @@ def virus(
baseline_metadata, filtered_metadata_final, merged_csv_path
)
if merge_success:
- output_files_dict['Merged Metadata (CSV)'] = merged_csv_path
+ output_files_dict["Merged Metadata (CSV)"] = merged_csv_path
logger.info("✅ Merged CSV: %s", merged_csv_path)
else:
logger.warning("⚠️ Merge failed. New-only output is available at: %s", output_metadata_csv)
@@ -9491,71 +10031,74 @@ def virus(
# No-merge mode: label the new-only output clearly
new_csv_path = os.path.join(outfolder, f"{virus_clean}_new.csv")
baseline_ref_path = os.path.join(outfolder, f"{virus_clean}_baseline_provided.csv")
-
+
# Rename existing CSV to _new
if os.path.exists(output_metadata_csv):
shutil.copy2(output_metadata_csv, new_csv_path)
- output_files_dict['New Metadata (CSV)'] = new_csv_path
+ output_files_dict["New Metadata (CSV)"] = new_csv_path
logger.info("✅ New-only CSV: %s (%d sequences)", new_csv_path, len(filtered_metadata_final))
-
+
# Copy baseline as reference
shutil.copy2(baseline_metadata, baseline_ref_path)
- output_files_dict['Baseline Provided (CSV)'] = baseline_ref_path
+ output_files_dict["Baseline Provided (CSV)"] = baseline_ref_path
logger.info("✅ Baseline reference: %s", baseline_ref_path)
else:
logger.info("Skipping this step since no sequences passed all filters")
-
+
# Clean up before GenBank fetch
# filtered_metadata_final contains all we need - clear other references
# Note: filtered_metadata may still reference same objects, but that's okay
_force_garbage_collection("before GenBank fetch")
- # SECTION 8: GENBANK METADATA RETRIEVAL (OPTIONAL)
+ # SECTION 8: GENBANK METADATA RETRIEVAL (OPTIONAL)
logger.info("=" * 60)
logger.info("STEP 8: Fetching detailed GenBank metadata")
logger.info("=" * 60)
_log_memory_usage("STEP 8 start")
if genbank_metadata and total_final_sequences > 0:
logger.info("GenBank metadata retrieval requested...")
-
+
# Check if we already have GenBank data from early pre-fetch (Step 4.5)
if genbank_prefetch_done and genbank_data_prefetch:
logger.info("Using pre-fetched GenBank data from Step 4.5 (no re-fetch needed)")
genbank_data = genbank_data_prefetch
-
+
# Save GenBank metadata to final output location
try:
# Copy temp XML to final location if it exists
genbank_prefetch_xml = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.xml")
genbank_prefetch_csv = os.path.join(temp_dir, f"{virus_clean}_genbank_prefetch.csv")
-
+
if os.path.exists(genbank_prefetch_xml):
shutil.copy(genbank_prefetch_xml, genbank_full_xml_path)
if os.path.exists(genbank_prefetch_csv):
shutil.copy(genbank_prefetch_csv, genbank_full_csv_path)
-
+
# Save GenBank metadata to CSV
save_genbank_metadata_to_csv(
genbank_metadata=genbank_data,
output_file=genbank_csv_path,
- virus_metadata=filtered_metadata_final
+ virus_metadata=filtered_metadata_final,
+ )
+ logger.info(
+ "✅ GenBank metadata CSV saved: %s (%.2f MB)",
+ genbank_csv_path,
+ os.path.getsize(genbank_csv_path) / 1024 / 1024,
)
- logger.info("✅ GenBank metadata CSV saved: %s (%.2f MB)",
- genbank_csv_path, os.path.getsize(genbank_csv_path) / 1024 / 1024)
-
+
# Merge with standard metadata CSV if it exists
if os.path.exists(output_metadata_csv):
merge_metadata_csvs(genbank_csv_path, output_metadata_csv)
-
- output_files_dict['GenBank CSV Metadata'] = genbank_csv_path
+
+ output_files_dict["GenBank CSV Metadata"] = genbank_csv_path
if os.path.exists(genbank_full_xml_path):
- output_files_dict['GenBank Full XML'] = genbank_full_xml_path
+ output_files_dict["GenBank Full XML"] = genbank_full_xml_path
if os.path.exists(genbank_full_csv_path):
- output_files_dict['GenBank Full CSV'] = genbank_full_csv_path
+ output_files_dict["GenBank Full CSV"] = genbank_full_csv_path
genbank_success = True
-
- except Exception as e:
+
+ except Exception as e: # noqa: BLE001
logger.error("❌ Failed to save pre-fetched GenBank data: %s", e)
genbank_error_msg = str(e)
else:
@@ -9565,64 +10108,69 @@ def virus(
# Use filtered_accessions (from metadata) instead of iterating
# in-memory sequences - avoids loading all sequences into RAM
final_accessions = list(filtered_accessions)
-
+
if final_accessions:
logger.info("Fetching GenBank metadata for %d sequences...", len(final_accessions))
# Fetch GenBank metadata
genbank_data, genbank_failed_log = fetch_genbank_metadata(
accessions=list(set(final_accessions)), # Remove duplicates
- genbank_full_xml_path=genbank_full_xml_path, genbank_full_csv_path=genbank_full_csv_path,
+ genbank_full_xml_path=genbank_full_xml_path,
+ genbank_full_csv_path=genbank_full_csv_path,
batch_size=genbank_batch_size,
delay=GENBANK_INTER_BATCH_DELAY,
- api_key=api_key
+ api_key=api_key,
)
-
+
# Parse GenBank failed batches log if it exists
if genbank_failed_log and os.path.exists(genbank_failed_log):
try:
- with open(genbank_failed_log, 'r') as flog:
+ with open(genbank_failed_log) as flog:
content = flog.read()
# Parse the log file to extract failed batch information
- batch_pattern = r'FAILED_BATCH: \[([^\]]+)\][\s\S]*?URL: ([^\n]+)'
+ batch_pattern = r"FAILED_BATCH: \[([^\]]+)\][\s\S]*?URL: ([^\n]+)"
matches = re.findall(batch_pattern, content)
for accessions_str, url in matches:
# Clean up accessions string
- batch_accessions = [acc.strip().strip("'").strip('"') for acc in accessions_str.split(',')]
- failed_commands['genbank_batches'].append({
- 'accessions': batch_accessions,
- 'retry_url': url.strip()
- })
- except Exception as parse_error:
+ batch_accessions = [
+ acc.strip().strip("'").strip('"') for acc in accessions_str.split(",")
+ ]
+ failed_commands["genbank_batches"].append(
+ {"accessions": batch_accessions, "retry_url": url.strip()}
+ )
+ except Exception as parse_error: # noqa: BLE001
logger.debug("Could not parse GenBank failed batches log: %s", parse_error)
-
+
if genbank_data and not genbank_dependent_filters_active:
# No GenBank filters - just save the data
save_genbank_metadata_to_csv(
genbank_metadata=genbank_data,
output_file=genbank_csv_path,
- virus_metadata=filtered_metadata_final
+ virus_metadata=filtered_metadata_final,
)
- logger.info("✅ GenBank metadata CSV saved: %s (%.2f MB)",
- genbank_csv_path, os.path.getsize(genbank_csv_path) / 1024 / 1024)
-
+ logger.info(
+ "✅ GenBank metadata CSV saved: %s (%.2f MB)",
+ genbank_csv_path,
+ os.path.getsize(genbank_csv_path) / 1024 / 1024,
+ )
+
# Merge with standard metadata CSV if it exists
if os.path.exists(output_metadata_csv):
merge_metadata_csvs(genbank_csv_path, output_metadata_csv)
-
- output_files_dict['GenBank CSV Metadata'] = genbank_csv_path
+
+ output_files_dict["GenBank CSV Metadata"] = genbank_csv_path
if os.path.exists(genbank_full_xml_path):
- output_files_dict['GenBank Full XML'] = genbank_full_xml_path
+ output_files_dict["GenBank Full XML"] = genbank_full_xml_path
if os.path.exists(genbank_full_csv_path):
- output_files_dict['GenBank Full CSV'] = genbank_full_csv_path
+ output_files_dict["GenBank Full CSV"] = genbank_full_csv_path
genbank_success = True
elif genbank_data and genbank_dependent_filters_active:
# GenBank filters needed - this is the fallback path when pre-fetch failed
logger.info("GenBank metadata retrieved. Applying filters (fallback path)...")
_log_memory_usage("before fallback GenBank filtering")
_force_garbage_collection("before fallback filtering")
-
- filters_genbank={
+
+ filters_genbank = {
"provirus": provirus,
"has_proteins": has_proteins,
"genotype": genotype,
@@ -9641,54 +10189,68 @@ def virus(
)
if genbank_filtered_accessions:
- logger.info("After applying GenBank-based filters, %d sequences remain", len(genbank_filtered_accessions))
-
+ logger.info(
+ "After applying GenBank-based filters, %d sequences remain",
+ len(genbank_filtered_accessions),
+ )
+
genbank_filtered_set = set(genbank_filtered_accessions)
total_after_genbank_filter = len(genbank_filtered_accessions)
# Re-filter FASTA by streaming from output
# file through accession filter, instead of holding all sequences in RAM
temp_refiltered_fasta = output_fasta_file + ".tmp"
- refiltered_count = _stream_copy_fasta(output_fasta_file, temp_refiltered_fasta, genbank_filtered_set)
+ refiltered_count = _stream_copy_fasta(
+ output_fasta_file, temp_refiltered_fasta, genbank_filtered_set
+ )
shutil.move(temp_refiltered_fasta, output_fasta_file)
total_final_sequences = refiltered_count
-
- filtered_metadata_final = [md for md in filtered_metadata_final if md['accession'] in genbank_filtered_set]
- genbank_data_filtered = {acc: genbank_data[acc] for acc in genbank_filtered_accessions if acc in genbank_data}
-
+
+ filtered_metadata_final = [
+ md for md in filtered_metadata_final if md["accession"] in genbank_filtered_set
+ ]
+ genbank_data_filtered = {
+ acc: genbank_data[acc]
+ for acc in genbank_filtered_accessions
+ if acc in genbank_data
+ }
+
del genbank_data
_force_garbage_collection("after fallback GenBank filtering")
save_metadata_to_csv(filtered_metadata_final, protein_headers, output_metadata_csv)
-
+
try:
with open(output_metadata_jsonl, "w", encoding="utf-8") as f:
for md in filtered_metadata_final:
f.write(json.dumps(md) + "\n")
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("❌ Failed to update JSONL metadata file: %s", e)
-
+
save_genbank_metadata_to_csv(
genbank_metadata=genbank_data_filtered,
output_file=genbank_csv_path,
- virus_metadata=filtered_metadata_final
+ virus_metadata=filtered_metadata_final,
)
- logger.info("✅ GenBank metadata CSV saved: %s (%.2f MB)",
- genbank_csv_path, os.path.getsize(genbank_csv_path) / 1024 / 1024)
-
+ logger.info(
+ "✅ GenBank metadata CSV saved: %s (%.2f MB)",
+ genbank_csv_path,
+ os.path.getsize(genbank_csv_path) / 1024 / 1024,
+ )
+
if os.path.exists(output_metadata_csv):
merge_metadata_csvs(genbank_csv_path, output_metadata_csv)
-
- output_files_dict['FASTA Sequences'] = output_fasta_file
- output_files_dict['CSV Metadata'] = output_metadata_csv
- output_files_dict['GenBank CSV Metadata'] = genbank_csv_path
+
+ output_files_dict["FASTA Sequences"] = output_fasta_file
+ output_files_dict["CSV Metadata"] = output_metadata_csv
+ output_files_dict["GenBank CSV Metadata"] = genbank_csv_path
if os.path.exists(genbank_full_xml_path):
- output_files_dict['GenBank Full XML'] = genbank_full_xml_path
+ output_files_dict["GenBank Full XML"] = genbank_full_xml_path
if os.path.exists(genbank_full_csv_path):
- output_files_dict['GenBank Full CSV'] = genbank_full_csv_path
+ output_files_dict["GenBank Full CSV"] = genbank_full_csv_path
final_metadata_for_summary = filtered_metadata_final
genbank_success = True
-
+
del genbank_data_filtered
del genbank_filtered_accessions
del genbank_filtered_set
@@ -9703,28 +10265,30 @@ def virus(
os.remove(output_metadata_csv)
if os.path.exists(output_metadata_jsonl):
os.remove(output_metadata_jsonl)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.error("❌ Failed to apply GenBank-based filters: %s", e)
- logger.warning("Continuing with previously filtered results without GenBank-based filtering.")
+ logger.warning(
+ "Continuing with previously filtered results without GenBank-based filtering."
+ )
else:
logger.warning("No GenBank metadata was retrieved")
genbank_error_msg = "No GenBank metadata was retrieved"
else:
logger.warning("No accession numbers found for GenBank metadata lookup")
genbank_error_msg = "No accession numbers found for GenBank metadata lookup"
-
- except Exception as genbank_error:
+
+ except Exception as genbank_error: # noqa: BLE001
logger.error("❌ GenBank metadata retrieval failed: %s", genbank_error)
logger.warning("Continuing without GenBank metadata - standard output files are still available")
genbank_error_msg = str(genbank_error)
-
+
_log_memory_usage("GenBank processing complete")
_force_garbage_collection("after GenBank processing")
logger.info("GenBank metadata processing completed")
else:
logger.info("Skipping this step since GenBank metadata retrieval was not requested.")
- # SECTION 9: FINAL SUMMARY
+ # SECTION 9: FINAL SUMMARY
# Provide comprehensive summary of the results
if total_final_sequences > 0:
logger.info("=" * 60)
@@ -9744,11 +10308,16 @@ def virus(
# Check if GenBank metadata CSV was created
if genbank_metadata:
if genbank_success and os.path.exists(genbank_csv_path):
- logger.info(" 📊 Metadata (including Genbank information): %s", os.path.basename(genbank_csv_path))
+ logger.info(
+ " 📊 Metadata (including Genbank information): %s", os.path.basename(genbank_csv_path)
+ )
if os.path.exists(genbank_full_xml_path):
logger.info(" 🧬 GenBank-only full XML: %s", os.path.basename(genbank_full_xml_path))
if os.path.exists(genbank_full_csv_path):
- logger.info(" 🧬 GenBank-only full CSV (readable XML format): %s", os.path.basename(genbank_full_csv_path))
+ logger.info(
+ " 🧬 GenBank-only full CSV (readable XML format): %s",
+ os.path.basename(genbank_full_csv_path),
+ )
else:
logger.warning("")
logger.warning("⚠️ GenBank metadata was requested but NOT saved due to errors:")
@@ -9756,7 +10325,7 @@ def virus(
logger.warning(" Standard metadata files are still available.")
logger.info("=" * 60)
-
+
# Save command summary
save_command_summary(
outfolder=outfolder,
@@ -9793,7 +10362,7 @@ def virus(
logger.warning(" - Trying a broader virus query term")
logger.warning(" - Removing some of the more restrictive filters")
logger.warning("=" * 60)
-
+
# Save command summary even when no sequences pass
save_command_summary(
outfolder=outfolder,
@@ -9817,9 +10386,11 @@ def virus(
except Exception as e:
# Handle any unexpected errors during processing
error_msg = str(e)
-
+
# Check if this is a server-side issue that we can provide guidance for
- if any(indicator in error_msg.lower() for indicator in ['timeout', '500 server error', 'internal server error']):
+ if any(
+ indicator in error_msg.lower() for indicator in ["timeout", "500 server error", "internal server error"]
+ ):
logger.error("=" * 80)
logger.error("SERVER-SIDE ERROR DETECTED")
logger.error("=" * 80)
@@ -9828,17 +10399,17 @@ def virus(
logger.error("")
logger.error("Error details: %s", e)
logger.error("")
-
+
# Provide alternative commands based on the problematic parameters
if geographic_location:
logger.error("🔧 SUGGESTED SOLUTION:")
logger.error("The geographic location filter may be causing server issues.")
logger.error("Try running without the geographic filter and filter manually afterward:")
logger.error("")
-
+
# Build alternative command
cmd_parts = [f"gget.virus('{virus}'"]
-
+
# Add all non-problematic filters
if host:
cmd_parts.append(f"host='{host}'")
@@ -9873,25 +10444,25 @@ def virus(
cmd_parts.append(f"has_proteins={has_proteins}")
else:
cmd_parts.append(f"has_proteins='{has_proteins}'")
-
+
cmd_parts.append(f"outfolder='{virus_clean}_data'")
-
+
alternative_cmd = ", ".join(cmd_parts) + ")"
logger.error("📋 ALTERNATIVE COMMAND:")
logger.error(" %s", alternative_cmd)
logger.error("")
logger.error("After download completes, filter the output CSV file by the")
logger.error("'Geographic Location' column to get sequences from '%s'.", geographic_location)
-
- elif any(x in virus.lower() for x in ['sars-cov-2', 'covid', 'influenza']) and not host:
+
+ elif any(x in virus.lower() for x in ["sars-cov-2", "covid", "influenza"]) and not host:
logger.error("🔧 SUGGESTED SOLUTION:")
logger.error("Large datasets like '%s' may cause server timeouts.", virus)
logger.error("Try adding a host filter to reduce the dataset size:")
logger.error("")
-
+
# Build alternative command with host filter
cmd_parts = [f"gget.virus('{virus}'", "host='human'"]
-
+
# Add existing filters
if min_seq_length:
cmd_parts.append(f"min_seq_length={min_seq_length}")
@@ -9901,20 +10472,20 @@ def virus(
cmd_parts.append(f"nuc_completeness='{nuc_completeness}'")
if annotated is not None:
cmd_parts.append(f"annotated={annotated}")
-
+
cmd_parts.append(f"outfolder='{virus_clean}_data'")
-
+
alternative_cmd = ", ".join(cmd_parts) + ")"
logger.error("📋 ALTERNATIVE COMMAND:")
logger.error(" %s", alternative_cmd)
-
+
else:
logger.error("🔧 SUGGESTED SOLUTIONS:")
logger.error("1. Wait a few minutes and try again (server issues are often temporary)")
logger.error("2. Try using more specific filters to reduce dataset size")
logger.error("3. Use host='human' filter if studying human pathogens")
logger.error("4. Add date range filters to limit the time period")
-
+
logger.error("=" * 80)
else:
# For non-server errors, show the original error message
@@ -9922,7 +10493,7 @@ def virus(
logger.error("Error type: %s", type(e).__name__)
if logger.getEffectiveLevel() <= logging.DEBUG:
logger.debug("Full traceback:\n%s", traceback.format_exc())
-
+
# Save command summary with error information
save_command_summary(
outfolder=outfolder if outfolder else os.getcwd(),
@@ -9942,9 +10513,9 @@ def virus(
genbank_filter_stats=genbank_filter_stats,
sequence_filter_stats=sequence_filter_stats,
)
-
+
raise
-
+
# SECTION 10: CLEANUP
finally:
# Always clean up temporary files, regardless of success or failure
@@ -9958,9 +10529,9 @@ def virus(
if genbank_metadata and genbank_success and os.path.exists(output_metadata_csv):
os.remove(output_metadata_csv)
logger.debug("✅ Cleaned up temporary directory: %s", temp_dir)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("Failed to clean up temporary directory %s: %s", temp_dir, e)
-
+
# Clean up cached download files (zip file and extracted directory)
# This ensures the folder structure is identical whether using cached or API-based downloads
logger.debug("Checking for cached download files to clean up...")
@@ -9977,11 +10548,11 @@ def virus(
if os.path.exists(cached_extract_dir) and os.path.isdir(cached_extract_dir):
shutil.rmtree(cached_extract_dir)
logger.debug("✅ Cleaned up cached extracted directory: %s", cached_extract_dir)
- except Exception as e:
+ except Exception as e: # noqa: BLE001
logger.warning("Failed to clean up cached download files: %s", e)
elif cached_zip_file and keep_temp:
logger.debug("Preserving cached download files as per user request: %s", cached_zip_file)
-
+
if keep_temp and os.path.exists(output_api_metadata_jsonl):
logger.debug("Preserving temporary directory as per user request: %s", temp_dir)
shutil.move(output_api_metadata_jsonl, os.path.join(temp_dir, os.path.basename(output_api_metadata_jsonl)))
@@ -9991,15 +10562,16 @@ def virus(
if total_final_sequences == 0 and os.path.exists(output_api_metadata_jsonl):
try:
os.remove(output_api_metadata_jsonl)
- logger.debug("Removed filtered metadata JSONL due to no passing sequences: %s", output_api_metadata_jsonl)
- except Exception as e:
- logger.warning("Failed to remove filtered metadata JSONL even though no sequence passed all filters: %s", e)
+ logger.debug(
+ "Removed filtered metadata JSONL due to no passing sequences: %s", output_api_metadata_jsonl
+ )
+ except Exception as e: # noqa: BLE001
+ logger.warning(
+ "Failed to remove filtered metadata JSONL even though no sequence passed all filters: %s", e
+ )
-
-
-
logger.info("NCBI virus data retrieval process completed.")
-
+
# Restore the original logger level
logger.setLevel(original_logger_level)
diff --git a/gget/main.py b/gget/main.py
index b51adc92e..a7a57c6c5 100644
--- a/gget/main.py
+++ b/gget/main.py
@@ -1,51 +1,51 @@
import argparse
import sys
from datetime import datetime
-from typing import Optional
import pandas as pd
# Get current date and time for alphafold default foldername
dt_string = datetime.now().strftime("%Y_%m_%d-%H_%M")
-import os
-import json
-import subprocess
+import json # noqa: E402
+import os # noqa: E402
+import subprocess # noqa: E402
-from .utils import set_up_logger
+from .utils import set_up_logger # noqa: E402
logger = set_up_logger()
-from .__init__ import __version__
+from .__init__ import __version__ # noqa: E402
+from .gget_alphafold import alphafold # noqa: E402
+from .gget_archs4 import archs4 # noqa: E402
+from .gget_bgee import bgee # noqa: E402
+from .gget_blast import blast # noqa: E402
+from .gget_blat import blat # noqa: E402
+from .gget_cbio import cbio_plot, cbio_search # noqa: E402
+from .gget_cellxgene import cellxgene # noqa: E402
+from .gget_cosmic import cosmic # noqa: E402
+from .gget_diamond import diamond # noqa: E402
+from .gget_elm import elm # noqa: E402
+from .gget_enrichr import enrichr # noqa: E402
+from .gget_gpt import gpt # noqa: E402
+from .gget_info import info # noqa: E402
+from .gget_muscle import muscle # noqa: E402
+from .gget_mutate import mutate # noqa: E402
+from .gget_opentargets import OPENTARGETS_RESOURCES, opentargets # noqa: E402
+from .gget_pdb import pdb # noqa: E402
# Module functions
-from .gget_ref import ref
-from .gget_search import search
-from .gget_info import info
-from .gget_seq import seq
-from .gget_muscle import muscle
-from .gget_blast import blast
-from .gget_blat import blat
-from .gget_enrichr import enrichr
-from .gget_archs4 import archs4
-from .gget_alphafold import alphafold
-from .gget_setup import setup
-from .gget_pdb import pdb
-from .gget_gpt import gpt
-from .gget_cellxgene import cellxgene
-from .gget_elm import elm
-from .gget_diamond import diamond
-from .gget_cosmic import cosmic
-from .gget_mutate import mutate
-from .gget_opentargets import opentargets, OPENTARGETS_RESOURCES
-from .gget_cbio import cbio_plot, cbio_search
-from .gget_bgee import bgee
-from .gget_8cube import specificity, psi_block, gene_expression
-from .gget_virus import virus
+from .gget_ref import ref # noqa: E402
+from .gget_search import search # noqa: E402
+from .gget_seq import seq # noqa: E402
+from .gget_setup import setup # noqa: E402
+from .gget_virus import virus # noqa: E402
# Custom formatter for help messages that preserved the text formatting and adds the default value to the end of the help message
class CustomHelpFormatter(argparse.RawTextHelpFormatter):
+ """Help formatter that preserves text formatting and appends default values to help messages."""
+
def _get_help_string(self, action):
help_str = action.help if action.help else ""
if (
@@ -63,28 +63,33 @@ def _get_help_string(self, action):
def convert_to_list(*args):
+ """Return the given arguments as a list."""
args_list = list(args)
return args_list
def int_or_str(value):
+ """Return value as an int if possible, otherwise return it unchanged."""
try:
return int(value)
except ValueError:
return value
-
+
+
def str_to_bool_or_none(value):
- if value is None or value.lower() in ('none', 'null', ''):
+ """Convert a string to None, True, False, or return it unchanged."""
+ if value is None or value.lower() in ("none", "null", ""):
return None
- if value.lower() in ('true', 'yes', 't', '1'):
+ if value.lower() in ("true", "yes", "t", "1"):
return True
- if value.lower() in ('false', 'no', 'f', '0'):
+ if value.lower() in ("false", "no", "f", "0"):
return False
# If it's not a clear boolean/None, treat as a string or raise error
- return value
+ return value
def parse_opentargets_filter(filter_arg):
+ """Parse a COLUMN=VALUE OpenTargets filter argument into a (key, value) tuple."""
if "=" not in filter_arg:
raise argparse.ArgumentTypeError(
"OpenTargets filters must be passed as COLUMN=VALUE, e.g. 'disease.id=EFO_0000274'."
@@ -95,34 +100,24 @@ def parse_opentargets_filter(filter_arg):
filter_value = filter_value.strip()
if not filter_key:
- raise argparse.ArgumentTypeError(
- "OpenTargets filter column name cannot be empty."
- )
+ raise argparse.ArgumentTypeError("OpenTargets filter column name cannot be empty.")
return filter_key, int_or_str(str_to_bool_or_none(filter_value))
def main():
- """
- Function containing argparse parsers and arguments to allow the use of gget from the terminal.
- """
+ """Function containing argparse parsers and arguments to allow the use of gget from the terminal."""
# Define parent parser
- parent_parser = argparse.ArgumentParser(
- description=f"gget v{__version__}", add_help=False
- )
+ parent_parser = argparse.ArgumentParser(description=f"gget v{__version__}", add_help=False)
# Initiate subparsers
parent_subparsers = parent_parser.add_subparsers(dest="command")
# Define parent (not sure why I need both parent parser and parent, but otherwise it does not work)
parent = argparse.ArgumentParser(add_help=False)
# Add custom help argument to parent parser
- parent_parser.add_argument(
- "-h", "--help", action="store_true", help="Print manual."
- )
+ parent_parser.add_argument("-h", "--help", action="store_true", help="Print manual.")
# Add custom version argument to parent parser
- parent_parser.add_argument(
- "-v", "--version", action="store_true", help="Print version."
- )
+ parent_parser.add_argument("-v", "--version", action="store_true", help="Print version.")
## gget ref subparser
ref_desc = "Fetch FTPs for reference genomes and annotations by species."
@@ -248,9 +243,7 @@ def main():
)
## gget search subparser
- search_desc = (
- "Fetch gene and transcript IDs from Ensembl using free-form search terms."
- )
+ search_desc = "Fetch gene and transcript IDs from Ensembl using free-form search terms."
parser_gget = parent_subparsers.add_parser(
"search",
parents=[parent],
@@ -460,10 +453,7 @@ def main():
"--out",
type=str,
required=False,
- help=(
- "Path to folder to save results in, e.g. path/to/directory.\n"
- "Default: Standard out."
- ),
+ help=("Path to folder to save results in, e.g. path/to/directory.\nDefault: Standard out."),
)
# gget diamond parser
diamond_desc = "Align multiple protein or translated DNA sequences using DIAMOND."
@@ -636,8 +626,7 @@ def main():
type=str,
required=False,
help=(
- "Path to file the results will be saved as, e.g. path/to/directory/results.json.\n"
- "Default: Standard out."
+ "Path to file the results will be saved as, e.g. path/to/directory/results.json.\nDefault: Standard out."
),
)
parser_info.add_argument(
@@ -697,9 +686,7 @@ def main():
default=False,
action="store_true",
required=False,
- help=(
- "Returns amino acid sequences from UniProt. (Otherwise returns nucleotide sequences from Ensembl.)"
- ),
+ help=("Returns amino acid sequences from UniProt. (Otherwise returns nucleotide sequences from Ensembl.)"),
)
parser_seq.add_argument(
"-iso",
@@ -752,7 +739,9 @@ def main():
)
## gget muscle subparser
- muscle_desc = "Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm)."
+ muscle_desc = (
+ "Align multiple nucleotide or amino acid sequences against each other (using the Muscle v5 algorithm)."
+ )
parser_muscle = parent_subparsers.add_parser(
"muscle",
parents=[parent],
@@ -929,9 +918,7 @@ def main():
)
## gget blat subparser
- blat_desc = (
- "BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly."
- )
+ blat_desc = "BLAT a nucleotide or amino acid sequence against any BLAT UCSC assembly."
parser_blat = parent_subparsers.add_parser(
"blat",
parents=[parent],
@@ -1190,10 +1177,7 @@ def main():
default=100,
type=int,
required=False,
- help=(
- "Number of correlated genes to return (default: 100).\n"
- "(Only for gene correlation.)"
- ),
+ help=("Number of correlated genes to return (default: 100).\n(Only for gene correlation.)"),
)
parser_archs4.add_argument(
"-s",
@@ -1527,9 +1511,7 @@ def main():
)
# cellxgene parser arguments
- cellxgene_desc = (
- "Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/)."
- )
+ cellxgene_desc = "Query data from CZ CELLxGENE Discover (https://cellxgene.cziscience.com/)."
parser_cellxgene = parent_subparsers.add_parser(
"cellxgene",
parents=[parent],
@@ -2156,9 +2138,7 @@ def main():
add_help=True,
formatter_class=CustomHelpFormatter,
)
- parser_cbio_subparsers = parser_cbio.add_subparsers(
- dest="subcommand", help="Subcommand to execute."
- )
+ parser_cbio_subparsers = parser_cbio.add_subparsers(dest="subcommand", help="Subcommand to execute.")
parser_cbio_search = parser_cbio_subparsers.add_parser(
"search",
description="Search for genes in cBioPortal.",
@@ -2367,9 +2347,7 @@ def main():
formatter_class=CustomHelpFormatter,
)
- parser_cube_spec.add_argument(
- "genes", nargs="+", help="Gene symbols or Ensembl IDs."
- )
+ parser_cube_spec.add_argument("genes", nargs="+", help="Gene symbols or Ensembl IDs.")
parser_cube_spec.add_argument(
"-csv",
@@ -2398,9 +2376,7 @@ def main():
formatter_class=CustomHelpFormatter,
)
- parser_cube_psib.add_argument(
- "genes", nargs="+", help="Gene symbols or Ensembl IDs."
- )
+ parser_cube_psib.add_argument("genes", nargs="+", help="Gene symbols or Ensembl IDs.")
parser_cube_psib.add_argument(
"-al",
@@ -2443,13 +2419,9 @@ def main():
formatter_class=CustomHelpFormatter,
)
- parser_cube_expr.add_argument(
- "genes", nargs="+", help="Gene symbols or Ensembl IDs."
- )
+ parser_cube_expr.add_argument("genes", nargs="+", help="Gene symbols or Ensembl IDs.")
- parser_cube_expr.add_argument(
- "-al", "--analysis_level", required=True, help="Analysis level, e.g. 'Kidney'."
- )
+ parser_cube_expr.add_argument("-al", "--analysis_level", required=True, help="Analysis level, e.g. 'Kidney'.")
parser_cube_expr.add_argument(
"-at",
@@ -2475,7 +2447,7 @@ def main():
action="store_false",
help="Does not print progress information.",
)
-
+
## gget virus subparser
virus_desc = "Download virus genome datasets and associated GenBank metadata from the NCBI Virus database."
parser_virus = parent_subparsers.add_parser(
@@ -2493,11 +2465,11 @@ def main():
nargs="?",
default=None,
help="Virus taxon name/ID to query, e.g. 'SARS-CoV-2', 'zika virus', or taxon ID '1335626'.\n"
- "When using --is_accession flag, can also be:\n"
- " - Single accession: 'NC_038294.1'\n"
- " - Space-separated accessions: 'NC_038294.1 NC_045512.2'\n"
- " - Path to text file: 'accessions.txt' (one accession per line)\n"
- "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.",
+ "When using --is_accession flag, can also be:\n"
+ " - Single accession: 'NC_038294.1'\n"
+ " - Space-separated accessions: 'NC_038294.1 NC_045512.2'\n"
+ " - Path to text file: 'accessions.txt' (one accession per line)\n"
+ "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.",
)
parser_virus.add_argument(
"-a",
@@ -2506,8 +2478,8 @@ def main():
action="store_true",
required=False,
help="Treat the virus argument as an accession number (single, space-separated list, or text file path with one accession per line).\n"
- "Single: 'NC_038294.1' | List: 'NC_038294.1 NC_045512.2' | File: 'accessions.txt'\n"
- "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.",
+ "Single: 'NC_038294.1' | List: 'NC_038294.1 NC_045512.2' | File: 'accessions.txt'\n"
+ "For SARS-CoV-2 and Alphainfluenza cached downloads, multiple accessions are fully supported.",
)
parser_virus.add_argument(
"-o",
@@ -2577,7 +2549,7 @@ def main():
parser_virus.add_argument(
"--annotated",
type=str_to_bool_or_none,
- nargs='?',
+ nargs="?",
const=True,
required=False,
default=None,
@@ -2685,7 +2657,7 @@ def main():
"--vaccine_strain",
default=None,
type=str_to_bool_or_none,
- nargs='?',
+ nargs="?",
const=True,
required=False,
help="Vaccine strain filter: 'true' or 'false' or None. True will only keep sequences marked as vaccine strains. False filters out vaccine strains. and None (Default) will not filter based on vaccine strain status.",
@@ -2693,7 +2665,7 @@ def main():
parser_virus.add_argument(
"--lab_passaged",
type=str_to_bool_or_none,
- nargs='?',
+ nargs="?",
const=True,
required=False,
default=None,
@@ -2702,7 +2674,7 @@ def main():
parser_virus.add_argument(
"--provirus",
type=str_to_bool_or_none,
- nargs='?',
+ nargs="?",
const=True,
required=False,
default=None,
@@ -2797,9 +2769,9 @@ def main():
default=None,
dest="baseline_metadata",
help="Path to a baseline metadata file (CSV/JSONL/JSON/text) containing accessions to skip.\n"
- "Only new accessions (not in baseline) will be downloaded.\n"
- "Useful for incremental updates or resuming after API failures.\n"
- "CSV files must have an 'accession' column. Text files: one accession per line.",
+ "Only new accessions (not in baseline) will be downloaded.\n"
+ "Useful for incremental updates or resuming after API failures.\n"
+ "CSV files must have an 'accession' column. Text files: one accession per line.",
)
parser_virus.add_argument(
"--merge-results",
@@ -2816,7 +2788,7 @@ def main():
dest="no_merge",
required=False,
help="When using --baseline, output new results separately from baseline.\n"
- "Creates {virus}_new.csv (new sequences only) and {virus}_baseline_provided.csv (reference).",
+ "Creates {virus}_new.csv (new sequences only) and {virus}_baseline_provided.csv (reference).",
)
parser_virus.add_argument(
"--api_key",
@@ -2824,9 +2796,9 @@ def main():
required=False,
default=None,
help="NCBI API key for higher E-utilities rate limits (10 requests/sec vs 3/sec without).\n"
- "Obtain a free key from https://www.ncbi.nlm.nih.gov/account/settings/\n"
- "Can also be set via the NCBI_API_KEY environment variable.\n"
- "If not provided, requests continue at the lower default rate limit.",
+ "Obtain a free key from https://www.ncbi.nlm.nih.gov/account/settings/\n"
+ "Can also be set via the NCBI_API_KEY environment variable.\n"
+ "If not provided, requests continue at the lower default rate limit.",
)
parser_virus.add_argument(
"-q",
@@ -2844,14 +2816,12 @@ def main():
if args.help:
# Retrieve all subparsers from the parent parser
subparsers_actions = [
- action
- for action in parent_parser._actions
- if isinstance(action, argparse._SubParsersAction)
+ action for action in parent_parser._actions if isinstance(action, argparse._SubParsersAction)
]
for subparsers_action in subparsers_actions:
# Get all subparsers and print help
for choice, subparser in subparsers_action.choices.items():
- print("Subparser '{}'".format(choice))
+ print(f"Subparser '{choice}'")
print(subparser.format_help())
sys.exit(0)
@@ -3139,14 +3109,10 @@ def main():
if args.command == "archs4":
# Handle deprecated flags for backwards compatibility
if args.gene_deprecated and args.gene:
- logger.warning(
- "The [-g][--gene] argument is deprecated, using positional argument [gene] instead."
- )
+ logger.warning("The [-g][--gene] argument is deprecated, using positional argument [gene] instead.")
if args.gene_deprecated and not args.gene:
args.gene = args.gene_deprecated
- logger.warning(
- "The [-g][--gene] argument is deprecated, please use positional argument [gene] instead."
- )
+ logger.warning("The [-g][--gene] argument is deprecated, please use positional argument [gene] instead.")
if not args.gene_deprecated and not args.gene:
parser_archs4.error("the following arguments are required: gene")
@@ -3191,14 +3157,10 @@ def main():
if args.command == "muscle":
# Handle deprecated flags for backwards compatibility
if args.fasta_deprecated and args.fasta:
- logger.warning(
- "The [-fa][--fasta] argument is deprecated, using positional argument [fasta] instead."
- )
+ logger.warning("The [-fa][--fasta] argument is deprecated, using positional argument [fasta] instead.")
if args.fasta_deprecated and not args.fasta:
args.fasta = args.fasta_deprecated
- logger.warning(
- "The [-fa][--fasta] argument is deprecated, please use positional argument [fasta] instead."
- )
+ logger.warning("The [-fa][--fasta] argument is deprecated, please use positional argument [fasta] instead.")
if not args.fasta_deprecated and not args.fasta:
parser_muscle.error("the following arguments are required: fasta")
@@ -3250,9 +3212,7 @@ def main():
if args.command == "ref":
# Return all vertebrate available species
if args.list_species:
- species_list = ref(
- species=None, release=args.release, list_species=args.list_species
- )
+ species_list = ref(species=None, release=args.release, list_species=args.list_species)
# Save in specified directory if -o specified
if args.out:
directory = "/".join(args.out.split("/")[:-1])
@@ -3266,9 +3226,7 @@ def main():
# Return all invertebrate available species
elif args.list_iv_species:
- species_list = ref(
- species=None, release=args.release, list_iv_species=args.list_iv_species
- )
+ species_list = ref(species=None, release=args.release, list_iv_species=args.list_iv_species)
# Save in specified directory if -o specified
if args.out:
directory = "/".join(args.out.split("/")[:-1])
@@ -3282,9 +3240,7 @@ def main():
# Handle deprecated flags for backwards compatibility
if args.species_deprecated and args.species:
- logger.warning(
- "The [-s][--species] argument is deprecated, using positional argument [species] instead."
- )
+ logger.warning("The [-s][--species] argument is deprecated, using positional argument [species] instead.")
if args.species_deprecated and not args.species:
args.species = args.species_deprecated
logger.warning(
@@ -3292,11 +3248,7 @@ def main():
)
# Raise error if neither species nor list flag passed
- if (
- args.species is None
- and args.list_species is False
- and args.list_iv_species is False
- ):
+ if args.species is None and args.list_species is False and args.list_iv_species is False:
parser_ref.error(
"the following arguments are required: species \n"
"'gget ref --list_species' -> lists out all available vertebrate species. \n"
@@ -3441,14 +3393,10 @@ def main():
if args.command == "enrichr":
# Handle deprecated flags for backwards compatibility
if args.genes_deprecated and args.genes:
- logger.warning(
- "The [-g][--genes] argument is deprecated, using positional argument [genes] instead."
- )
+ logger.warning("The [-g][--genes] argument is deprecated, using positional argument [genes] instead.")
if args.genes_deprecated and not args.genes:
args.genes = args.genes_deprecated
- logger.warning(
- "The [-g][--genes] argument is deprecated, please use positional argument [genes] instead."
- )
+ logger.warning("The [-g][--genes] argument is deprecated, please use positional argument [genes] instead.")
if not args.genes_deprecated and not args.genes:
parser_enrichr.error("the following arguments are required: genes")
@@ -3471,9 +3419,7 @@ def main():
for gene in args.background_list:
bkg_genes_clean.append(gene.split(","))
# Flatten bkg_genes_clean
- bkg_genes_clean_final = [
- item for sublist in bkg_genes_clean for item in sublist
- ]
+ bkg_genes_clean_final = [item for sublist in bkg_genes_clean for item in sublist]
# Remove empty strings resulting from split
while "" in genes_clean_final:
bkg_genes_clean_final.remove("")
@@ -3523,14 +3469,10 @@ def main():
if args.command == "info":
# Handle deprecated flags for backwards compatibility
if args.id_deprecated and args.ens_ids:
- logger.warning(
- "The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead."
- )
+ logger.warning("The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead.")
if args.id_deprecated and not args.ens_ids:
args.ens_ids = args.id_deprecated
- logger.warning(
- "The [-id][--genes] argument is deprecated, please use arguments [ens_ids] instead."
- )
+ logger.warning("The [-id][--genes] argument is deprecated, please use arguments [ens_ids] instead.")
if args.ensembl_only:
logger.warning(
"The [-eo][--ensembl_only] argument is deprecated, please use arguments [ncbi] and [uniprot] instead."
@@ -3591,9 +3533,7 @@ def main():
if args.command == "seq":
# Handle deprecated flags for backwards compatibility
if args.id_deprecated and args.ens_ids:
- logger.warning(
- "The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead."
- )
+ logger.warning("The [-id][--ens_ids] argument is deprecated, using positional argument [ens_ids] instead.")
if args.id_deprecated and not args.ens_ids:
args.ens_ids = args.id_deprecated
logger.warning(
@@ -3624,7 +3564,7 @@ def main():
)
# Save in specified directory if -o specified
- if args.out and seq_results != None:
+ if args.out and seq_results is not None:
directory = "/".join(args.out.split("/")[:-1])
if directory != "":
os.makedirs(directory, exist_ok=True)
@@ -3635,7 +3575,7 @@ def main():
# Print results if no directory specified
else:
- if seq_results != None:
+ if seq_results is not None:
for seq_res in seq_results:
print(seq_res)
@@ -3722,18 +3662,12 @@ def main():
if args.csv:
opentargets_results.to_csv(f, index=False)
else:
- opentargets_results.to_json(
- f, orient="records", force_ascii=False, indent=4
- )
+ opentargets_results.to_json(f, orient="records", force_ascii=False, indent=4)
else:
if args.csv:
opentargets_results.to_csv(sys.stdout, index=False)
else:
- print(
- opentargets_results.to_json(
- orient="records", force_ascii=False, indent=4
- )
- )
+ print(opentargets_results.to_json(orient="records", force_ascii=False, indent=4))
## cbio return
if args.command == "cbio":
@@ -3776,25 +3710,19 @@ def main():
if args.csv:
bgee_results.to_csv(f, index=False)
else:
- bgee_results.to_json(
- f, orient="records", force_ascii=False, indent=4
- )
+ bgee_results.to_json(f, orient="records", force_ascii=False, indent=4)
else:
if args.csv:
bgee_results.to_csv(sys.stdout, index=False)
else:
- print(
- bgee_results.to_json(orient="records", force_ascii=False, indent=4)
- )
+ print(bgee_results.to_json(orient="records", force_ascii=False, indent=4))
## 8cube return
if args.command == "8cube":
- from .gget_8cube import specificity, psi_block, gene_expression
+ from .gget_8cube import gene_expression, psi_block, specificity
if args.cube_command is None:
- parser_8cube.error(
- "Please specify a subcommand: specificity, psi_block, or expression"
- )
+ parser_8cube.error("Please specify a subcommand: specificity, psi_block, or expression")
# SPECIFICITY
if args.cube_command == "specificity":
@@ -3811,7 +3739,7 @@ def main():
if directory:
os.makedirs(directory, exist_ok=True)
- if not args.csv: # args.csv stores False
+ if not args.csv: # args.csv stores False
pd.DataFrame(results).to_csv(args.out, index=False)
else:
with open(args.out, "w", encoding="utf-8") as f:
@@ -3819,7 +3747,7 @@ def main():
return
# Print to STDOUT
- if not args.csv: # args.csv stores False
+ if not args.csv: # args.csv stores False
pd.DataFrame(results).to_csv(sys.stdout, index=False)
else:
print(json.dumps(results, ensure_ascii=False, indent=4))
@@ -3841,14 +3769,14 @@ def main():
if directory:
os.makedirs(directory, exist_ok=True)
- if not args.csv: # args.csv stores False
+ if not args.csv: # args.csv stores False
pd.DataFrame(results).to_csv(args.out, index=False)
else:
with open(args.out, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=4)
return
- if not args.csv: # args.csv stores False
+ if not args.csv: # args.csv stores False
pd.DataFrame(results).to_csv(sys.stdout, index=False)
else:
print(json.dumps(results, ensure_ascii=False, indent=4))
@@ -3870,14 +3798,14 @@ def main():
if directory:
os.makedirs(directory, exist_ok=True)
- if not args.csv: # args.csv stores False
+ if not args.csv: # args.csv stores False
pd.DataFrame(results).to_csv(args.out, index=False)
else:
with open(args.out, "w", encoding="utf-8") as f:
json.dump(results, f, ensure_ascii=False, indent=4)
return
- if not args.csv: # args.csv stores False
+ if not args.csv: # args.csv stores False
pd.DataFrame(results).to_csv(sys.stdout, index=False)
else:
print(json.dumps(results, ensure_ascii=False, indent=4))
@@ -3887,48 +3815,48 @@ def main():
if args.command == "virus":
# Parse has_proteins argument - convert comma-separated string to list
has_proteins_arg = args.has_proteins
- if has_proteins_arg and ',' in has_proteins_arg:
- has_proteins_arg = [p.strip() for p in has_proteins_arg.split(',')]
-
+ if has_proteins_arg and "," in has_proteins_arg:
+ has_proteins_arg = [p.strip() for p in has_proteins_arg.split(",")]
+
segment_arg = args.segment
- if segment_arg and ',' in segment_arg:
- segment_arg = [s.strip() for s in segment_arg.split(',')]
+ if segment_arg and "," in segment_arg:
+ segment_arg = [s.strip() for s in segment_arg.split(",")]
isolate_arg = args.isolate
- if isolate_arg and ',' in isolate_arg:
- isolate_arg = [i.strip() for i in isolate_arg.split(',')]
+ if isolate_arg and "," in isolate_arg:
+ isolate_arg = [i.strip() for i in isolate_arg.split(",")]
submitter_name_arg = args.submitter_name
- if submitter_name_arg and ',' in submitter_name_arg:
- submitter_name_arg = [a.strip() for a in submitter_name_arg.split(',')]
+ if submitter_name_arg and "," in submitter_name_arg:
+ submitter_name_arg = [a.strip() for a in submitter_name_arg.split(",")]
submitter_institution_arg = args.submitter_institution
- if submitter_institution_arg and ',' in submitter_institution_arg:
- submitter_institution_arg = [i.strip() for i in submitter_institution_arg.split(',')]
+ if submitter_institution_arg and "," in submitter_institution_arg:
+ submitter_institution_arg = [i.strip() for i in submitter_institution_arg.split(",")]
submitter_country_arg = args.submitter_country
- if submitter_country_arg and ',' in submitter_country_arg:
- submitter_country_arg = [c.strip() for c in submitter_country_arg.split(',')]
-
+ if submitter_country_arg and "," in submitter_country_arg:
+ submitter_country_arg = [c.strip() for c in submitter_country_arg.split(",")]
+
env_source_arg = args.env_source
- if env_source_arg and ',' in env_source_arg:
- env_source_arg = [e.strip() for e in env_source_arg.split(',')]
+ if env_source_arg and "," in env_source_arg:
+ env_source_arg = [e.strip() for e in env_source_arg.split(",")]
lineage_arg = args.lineage
- if lineage_arg and ',' in lineage_arg:
- lineage_arg = [l.strip() for l in lineage_arg.split(',')]
+ if lineage_arg and "," in lineage_arg:
+ lineage_arg = [l.strip() for l in lineage_arg.split(",")]
genotype_arg = args.genotype
- if genotype_arg and ',' in genotype_arg:
- genotype_arg = [g.strip() for g in genotype_arg.split(',')]
+ if genotype_arg and "," in genotype_arg:
+ genotype_arg = [g.strip() for g in genotype_arg.split(",")]
isolation_source_arg = args.isolation_source
- if isolation_source_arg and ',' in isolation_source_arg:
- isolation_source_arg = [i.strip() for i in isolation_source_arg.split(',')]
+ if isolation_source_arg and "," in isolation_source_arg:
+ isolation_source_arg = [i.strip() for i in isolation_source_arg.split(",")]
gen_mol_type_arg = args.gen_mol_type
- if gen_mol_type_arg and ',' in gen_mol_type_arg:
- gen_mol_type_arg = [g.strip() for g in gen_mol_type_arg.split(',')]
+ if gen_mol_type_arg and "," in gen_mol_type_arg:
+ gen_mol_type_arg = [g.strip() for g in gen_mol_type_arg.split(",")]
# Determine merge_results: --no-merge overrides --merge-results
merge_results_arg = True # default
diff --git a/gget/utils.py b/gget/utils.py
index 399194473..461de07cf 100644
--- a/gget/utils.py
+++ b/gget/utils.py
@@ -1,44 +1,42 @@
-from bs4 import BeautifulSoup
-import requests
-
import concurrent.futures
+import functools
import json
-import re
+import logging
import os
import time
import uuid
-import functools
-import pandas as pd
+
import numpy as np
-from IPython.display import display, HTML
-import logging
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from IPython.display import HTML, display
# from datetime import datetime
# Mute numexpr threads info
logging.getLogger("numexpr").setLevel(logging.WARNING)
-from .constants import (
- ENSEMBL_FTP_URL,
- ENSEMBL_FTP_URL_NV,
- ENS_TO_PDB_API,
+from .constants import ( # noqa: E402
COSMIC_RELEASE_URL,
DEFAULT_REQUESTS_TIMEOUT,
+ ENS_TO_PDB_API,
+ ENSEMBL_FTP_URL,
+ ENSEMBL_FTP_URL_NV,
)
def set_up_logger():
+ """Configure and return the module logger using the GGET_LOGLEVEL environment variable."""
logging_level_name = os.getenv("GGET_LOGLEVEL", "INFO")
logging_level = logging.getLevelName(logging_level_name)
- if type(logging_level) != int: # unknown log level
+ if not isinstance(logging_level, int): # unknown log level
logging_level = logging.INFO
logger = logging.getLogger(__name__)
logger.setLevel(logging_level)
if not logger.hasHandlers():
- formatter = logging.Formatter(
- "%(asctime)s - %(levelname)s - %(message)s", "%H:%M:%S"
- )
+ formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", "%H:%M:%S")
console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
@@ -63,15 +61,13 @@ def set_up_logger():
def flatten(xss):
- """
- Function to flatten a list of lists.
- """
+ """Function to flatten a list of lists."""
return [x for xs in xss for x in xs]
def parallel_map(fn, items, *, max_workers=None):
- """
- Apply `fn` to each item using a thread pool and return the results
+ """Apply `fn` to each item using a thread pool and return the results
+
in input order. Designed for I/O-bound work — typically per-ID HTTP
calls — where the per-call latency is dominated by network RTT.
@@ -105,8 +101,8 @@ def http_json(
backoff=1.0,
**kwargs,
):
- """
- Issue an HTTP request and return the parsed JSON body, raising a
+ """Issue an HTTP request and return the parsed JSON body, raising a
+
RuntimeError with consistent context if the request fails or the body
is not valid JSON.
@@ -142,15 +138,13 @@ def http_json(
# to the caller without retry.
if response.status_code < 500:
body = response.text[:200] if response.text else ""
- raise RuntimeError(
- f"{label} returned HTTP {response.status_code}. Body: {body}"
- )
+ raise RuntimeError(f"{label} returned HTTP {response.status_code}. Body: {body}")
last_exc = None
last_status = response.status_code
last_body = response.text[:200] if response.text else ""
if attempt < attempts - 1:
- delay = backoff * (2 ** attempt)
+ delay = backoff * (2**attempt)
logger.warning(
f"{label}: transient failure (%s); retrying in %.1fs (attempt %d/%d).",
last_exc or f"HTTP {last_status}",
@@ -161,17 +155,13 @@ def http_json(
time.sleep(delay)
if last_exc is not None:
- raise RuntimeError(
- f"{label} request failed after {attempts} attempts: {last_exc}"
- ) from last_exc
- raise RuntimeError(
- f"{label} returned HTTP {last_status} after {attempts} attempts. Body: {last_body}"
- )
+ raise RuntimeError(f"{label} request failed after {attempts} attempts: {last_exc}") from last_exc
+ raise RuntimeError(f"{label} returned HTTP {last_status} after {attempts} attempts. Body: {last_body}")
def dig(obj, *path, context=""):
- """
- Walk a nested key path through `obj` and return the resulting value.
+ """Walk a nested key path through `obj` and return the resulting value.
+
Raises RuntimeError with `context` if any intermediate key is missing
or any intermediate value is not a dict. Use to make
`response["data"]["target"]`-style access fail with a clear message
@@ -182,26 +172,24 @@ def dig(obj, *path, context=""):
if not isinstance(cur, dict) or key not in cur:
traversed = ".".join(path[:i]) or ""
label = f"{context}: " if context else ""
- raise RuntimeError(
- f"{label}expected key '{key}' under {traversed} in response."
- )
+ raise RuntimeError(f"{label}expected key '{key}' under {traversed} in response.")
cur = cur[key]
return cur
def get_latest_cosmic():
+ """Fetch and return the latest COSMIC release version number."""
html = requests.get(COSMIC_RELEASE_URL)
if html.status_code != 200:
- raise RuntimeError(
- f"The COSMIC server returned error status code {html.status_code}. Please try again."
- )
+ raise RuntimeError(f"The COSMIC server returned error status code {html.status_code}. Please try again.")
soup = BeautifulSoup(html.text, "html.parser")
return int(soup.find("div", class_="news").get("id").split("v")[-1])
def check_file_for_error_message(filepath, filename, download_path):
- with open(filepath, "r", encoding="utf-8") as file:
+ """Raise a ValueError if the downloaded file contains a known server error message."""
+ with open(filepath, encoding="utf-8") as file:
content = file.read().strip()
# Define common error indicators
@@ -217,7 +205,7 @@ def check_file_for_error_message(filepath, filename, download_path):
if any(keyword in content for keyword in error_keywords):
raise ValueError(
f"""
- The {filename} downloaded from {download_path}
+ The {filename} downloaded from {download_path}
contains an error message instead of valid data.\n
Error message:\n{content}\n
Please try again. If the problem persists, please report it here: https://github.com/pachterlab/gget/issues/new?template=issue_report.yml
@@ -226,11 +214,10 @@ def check_file_for_error_message(filepath, filename, download_path):
def read_fasta(fasta):
- """
+ """Return titles and seqs from a fasta file as two list objects.
+
Args:
- fasta (str) Path to fasta file.
-
- Returns titles and seqs from fasta file as two list objects.
"""
titles = []
seqs = []
@@ -269,11 +256,10 @@ def read_fasta(fasta):
def n_colors(nucleotide):
- """
- Returns a string format to print the nucleotide
+ """Returns a string format to print the nucleotide
+
with its appropriate background color according to the Clustal Colour Scheme.
"""
-
# Raw python background colors
# References:
# https://stackabuse.com/how-to-print-colored-text-in-python/
@@ -305,7 +291,7 @@ def n_colors(nucleotide):
# If the nucleotide does not fall into the defined color categories,
# make it white (e.g. "-")
- if bkg_color == None:
+ if bkg_color is None:
bkg_color = raw_colors["white"]
if letter_color is not None and letter_color in ["blue", "red"]:
@@ -319,12 +305,11 @@ def n_colors(nucleotide):
def aa_colors(amino_acid):
- """
- Returns a string format to print the amino acid
+ """Returns a string format to print the amino acid
+
with its appropriate background color according to the Clustal Colour Scheme:
- http://www.jalview.org/help/html/colourSchemes/clustal.html
+ http://www.jalview.org/help/html/colourSchemes/clustal.html.
"""
-
# Raw python background colors
# References:
# https://stackabuse.com/how-to-print-colored-text-in-python/
@@ -364,7 +349,7 @@ def aa_colors(amino_acid):
# If the amino acid does not fall into the defined color categories,
# make it white (e.g. "-")
- if bkg_color == None:
+ if bkg_color is None:
bkg_color = raw_colors["white"]
if letter_color is not None and letter_color in [
@@ -399,9 +384,7 @@ def _fetch_uniprot_for_id(server, id_):
)
payload = r.json()
if len(payload["results"]) > 0:
- logger.warning(
- f"No reviewed UniProt results were found for ID {id_}. Returning all unreviewed results."
- )
+ logger.warning(f"No reviewed UniProt results were found for ID {id_}. Returning all unreviewed results.")
if not len(payload["results"]) > 0:
logger.warning(f"No UniProt sequences were found for ID {id_}.")
@@ -435,8 +418,7 @@ def _fetch_uniprot_for_id(server, id_):
def get_uniprot_seqs(server, ensembl_ids):
- """
- Retrieve UniProt sequences based on Ensemsbl, WormBase or FlyBase identifiers.
+ """Retrieve UniProt sequences based on Ensembl, WormBase or FlyBase identifiers.
Args:
- server Link to UniProt REST API server.
@@ -444,17 +426,14 @@ def get_uniprot_seqs(server, ensembl_ids):
Returns data frame with UniProt ID, gene name, organism, sequence, sequence length, and query ID.
"""
-
# If a single UniProt ID is passed as string, convert to list
- if type(ensembl_ids) == str:
+ if isinstance(ensembl_ids, str):
ensembl_ids = [ensembl_ids]
# Fan out per-ID requests across a thread pool. Each call is independent
# and entirely I/O-bound, so the wall-clock saving on a list of IDs is
# roughly the pool size. Override with GGET_MAX_WORKERS env var.
- results = parallel_map(
- lambda id_: _fetch_uniprot_for_id(server, id_), ensembl_ids
- )
+ results = parallel_map(lambda id_: _fetch_uniprot_for_id(server, id_), ensembl_ids)
per_id_dfs = [df for df in results if df is not None]
if per_id_dfs:
return pd.concat(per_id_dfs, ignore_index=True)
@@ -462,8 +441,7 @@ def get_uniprot_seqs(server, ensembl_ids):
def get_uniprot_info(server, ensembl_id, verbose=True):
- """
- Retrieve UniProt synonyms and description based on Ensemsbl identifiers.
+ """Retrieve UniProt synonyms and description based on Ensembl identifiers.
Args:
- server Link to UniProt REST API server.
@@ -520,7 +498,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
for i in np.arange(len(json["results"])):
try:
gene_names.append(json["results"][i]["genes"][0]["geneName"]["value"])
- except:
+ except Exception: # noqa: BLE001
gene_names.append(np.nan)
df["primary_gene_name"] = gene_names
@@ -531,7 +509,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
try:
for syn in json["results"][i]["genes"][0]["synonyms"]:
uni_syn_temp.append(syn["value"])
- except:
+ except Exception: # noqa: BLE001
uni_syn_temp.append(np.nan)
uni_synonyms.append(uni_syn_temp)
df["uni_synonyms"] = uni_synonyms
@@ -540,12 +518,8 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
protein_names = []
for i in np.arange(len(json["results"])):
try:
- protein_names.append(
- json["results"][i]["proteinDescription"]["recommendedName"][
- "fullName"
- ]["value"]
- )
- except:
+ protein_names.append(json["results"][i]["proteinDescription"]["recommendedName"]["fullName"]["value"])
+ except Exception: # noqa: BLE001
protein_names.append(np.nan)
df["protein_names"] = protein_names
@@ -561,7 +535,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
des_temp = np.unique(np.array(des_temp))
# Append all descriptions to a single string object
des_temp = " ".join(des_temp)
- except:
+ except Exception: # noqa: BLE001
des_temp.append(np.nan)
descriptions.append(des_temp)
@@ -577,7 +551,7 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
if comment_json["commentType"] == "SUBCELLULAR LOCATION":
for location_dict in comment_json["subcellularLocations"]:
subcel_locs.append(location_dict["location"]["value"])
- except:
+ except Exception: # noqa: BLE001
pass
subcel_locs_final.append(subcel_locs)
@@ -601,16 +575,16 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
syn_lists = df[column].values
try:
flat_list = [item for sublist in syn_lists for item in sublist]
- final_df[column] = [list({value: "" for value in flat_list})]
+ final_df[column] = [list(dict.fromkeys(flat_list, ""))]
- except:
+ except Exception: # noqa: BLE001
final_df[column] = [syn_lists]
else:
val_list = df[column].values
try:
- final_df[column] = [list({value: "" for value in val_list})]
- except:
+ final_df[column] = [list(dict.fromkeys(val_list, ""))]
+ except Exception: # noqa: BLE001
final_df[column] = [val_list]
# Try to clean up the entries (so they are not a bunch of lists of one item)
@@ -618,8 +592,8 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
if len(final_df[column]) == 1 and column != "uni_synonyms":
try:
final_df[column] = final_df[column][0]
- except:
- None
+ except Exception: # noqa: BLE001
+ pass
return final_df
@@ -724,14 +698,13 @@ def get_uniprot_info(server, ensembl_id, verbose=True):
def get_pdb_ids(ens_id):
- """
- Function to fetch all PDB IDs linked to an Ensembl ID.
- using the PDBe API https://wwwdev.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/[ens_id]
+ """Function to fetch all PDB IDs linked to an Ensembl ID.
+
+ Uses the PDBe API: https://wwwdev.ebi.ac.uk/pdbe/aggregated-api/mappings/ensembl_to_pdb/[ens_id]
API documentation:
https://www.ebi.ac.uk/pdbe/aggregated-api/#/SIFTS/get_ensembl_to_pdb_mappings_api_mappings_ensembl_to_pdb__gene_id__get
"""
-
res = requests.get(ENS_TO_PDB_API + ens_id)
if not res.ok:
@@ -748,12 +721,12 @@ def get_pdb_ids(ens_id):
for entry in pdb_dict:
pdb_ids.append(entry["pdb_id"])
- return sorted(list(set(pdb_ids)))
+ return sorted(set(pdb_ids))
def wrap_cols_func(df, cols):
- """
- Function to wrap columns cols of a
+ """Function to wrap columns cols of a
+
data frame df for easier reading.
"""
for col in cols:
@@ -763,8 +736,7 @@ def wrap_cols_func(df, cols):
def rest_query(server, query, content_type):
- """
- Function to perform a REST API query.
+ """Function to perform a REST API query.
Args:
- server Server to query.
@@ -773,13 +745,11 @@ def rest_query(server, query, content_type):
Returns server output.
"""
-
r = requests.get(server + query, headers={"Content-Type": content_type})
if not r.ok:
raise RuntimeError(
- f"{server} returned error status code {r.status_code}. "
- "Please double-check arguments and try again.\n"
+ f"{server} returned error status code {r.status_code}. Please double-check arguments and try again.\n"
)
if content_type == "application/json":
@@ -789,8 +759,7 @@ def rest_query(server, query, content_type):
def post_query(server, endpoint, query):
- """
- Function to perform a POST API query.
+ """Function to perform a POST API query.
:param server: Server to query .
:param endpoint: Server endpoint
@@ -798,23 +767,18 @@ def post_query(server, endpoint, query):
:return: server output
"""
-
- r = requests.post(
- server + endpoint, json=query, headers={"Content-Type": "application/json"}
- )
+ r = requests.post(server + endpoint, json=query, headers={"Content-Type": "application/json"})
if not r.ok:
raise RuntimeError(
- f"{server} returned error status code {r.status_code}. "
- "Please double-check arguments and try again.\n"
+ f"{server} returned error status code {r.status_code}. Please double-check arguments and try again.\n"
)
return r.json()
def graphql_query(server, query, variables):
- """
- Function to perform a GraphQL API query.
+ """Function to perform a GraphQL API query.
Args:
- server Server to query.
@@ -823,25 +787,20 @@ def graphql_query(server, query, variables):
Returns server output.
"""
-
r = requests.post(server, json={"query": query, "variables": variables})
if not r.ok:
- logger.debug(
- f"Server: {server}, Query: {query}, Variables: {variables}, Response: {r.text}"
- )
+ logger.debug(f"Server: {server}, Query: {query}, Variables: {variables}, Response: {r.text}")
raise RuntimeError(
- f"{server} returned error status code {r.status_code}. "
- "Please double-check arguments and try again.\n"
+ f"{server} returned error status code {r.status_code}. Please double-check arguments and try again.\n"
)
return r.json()
-@functools.lru_cache(maxsize=None)
+@functools.cache
def find_latest_ens_rel(database=ENSEMBL_FTP_URL):
- """
- Returns the latest Ensembl release number.
+ """Returns the latest Ensembl release number.
Args:
- database Link to Ensembl database.
@@ -870,18 +829,15 @@ def find_latest_ens_rel(database=ENSEMBL_FTP_URL):
html = requests.get(database + "VERSION")
if html.status_code != 200:
- raise RuntimeError(
- f"The Ensembl FTP server returned error status code {html.status_code}. Please try again."
- )
+ raise RuntimeError(f"The Ensembl FTP server returned error status code {html.status_code}. Please try again.")
ENS_rel = int(html.text)
return ENS_rel
-@functools.lru_cache(maxsize=None)
+@functools.cache
def search_species_options(database=ENSEMBL_FTP_URL, release=None):
- """
- Function to find all available species core databases for gget search.
+ """Function to find all available species core databases for gget search.
Args:
- release Ensembl release for which the databases are fetched.
@@ -894,11 +850,9 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None):
ENS_rel = find_latest_ens_rel(database)
# If release != None, use user-defined Ensembl release
- if release != None:
+ if release is not None:
if release > ENS_rel:
- logger.warning(
- f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel})."
- )
+ logger.warning(f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel}).")
ENS_rel = release
## Find all available databases
@@ -929,9 +883,7 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None):
# Raise error if status code not "OK" Response
if html.status_code != 200:
- raise RuntimeError(
- f"The Ensembl server returned error status code {html.status_code}. Please try again."
- )
+ raise RuntimeError(f"The Ensembl server returned error status code {html.status_code}. Please try again.")
soup = BeautifulSoup(html.text, "html.parser")
@@ -944,8 +896,9 @@ def search_species_options(database=ENSEMBL_FTP_URL, release=None):
return databases
-@functools.lru_cache(maxsize=None)
+@functools.cache
def find_nv_kingdom(species, release):
+ """Return the Ensembl non-vertebrate kingdom that contains the given species for a release."""
kds = ["plants", "protists", "metazoa", "fungi"]
for kingdom in kds:
url = ENSEMBL_FTP_URL_NV + f"release-{release}/{kingdom}/fasta/"
@@ -953,9 +906,7 @@ def find_nv_kingdom(species, release):
# Raise error if status code not "OK" Response
if html.status_code != 200:
- raise RuntimeError(
- f"The Ensembl server returned error status code {html.status_code}. Please try again."
- )
+ raise RuntimeError(f"The Ensembl server returned error status code {html.status_code}. Please try again.")
# Parse the html and generate a clean list of the available genomes
soup = BeautifulSoup(html.text, "html.parser")
@@ -969,10 +920,9 @@ def find_nv_kingdom(species, release):
return kingdom
-@functools.lru_cache(maxsize=None)
+@functools.cache
def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
- """
- Function to find all available species for gget ref.
+ """Function to find all available species for gget ref.
Args:
- which Which type of file to check for.
@@ -987,12 +937,10 @@ def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
ENS_rel = find_latest_ens_rel(database)
# If release != None, use user-defined Ensembl release
- if release != None:
+ if release is not None:
# Warn user if user-defined release is higher than the latest release
if release > ENS_rel:
- logger.warning(
- f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel})."
- )
+ logger.warning(f"Provided Ensembl release number {release} is greater than the latest release ({ENS_rel}).")
ENS_rel = release
# Handle structure of non-vertebrate database
@@ -1034,9 +982,7 @@ def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
# Raise error if status code not "OK" Response
if html.status_code != 200:
- raise RuntimeError(
- f"The Ensembl server returned error status code {html.status_code}. Please try again."
- )
+ raise RuntimeError(f"The Ensembl server returned error status code {html.status_code}. Please try again.")
# Parse the html and generate a clean list of the available genomes
soup = BeautifulSoup(html.text, "html.parser")
@@ -1052,8 +998,8 @@ def ref_species_options(which, database=ENSEMBL_FTP_URL, release=None):
def parse_blast_ref_page(handle):
- """
- Extract RID and RTOE from the NCBI 'please wait' page (handle).
+ """Extract RID and RTOE from the NCBI 'please wait' page (handle).
+
RTOE = 'Estimated time fo completion.'
RID = 'Request ID'.
@@ -1064,7 +1010,6 @@ def parse_blast_ref_page(handle):
Biopython License Agreement and BSD 3-Clause License
https://github.com/biopython/biopython/blob/171697883aca6894f8367f8f20f1463ce7784d0c/LICENSE.rst
"""
-
# Decode handle
string = handle.read().decode()
@@ -1107,9 +1052,7 @@ def parse_blast_ref_page(handle):
msg = string[i:].split("<", 1)[0].split("\n", 1)[0].strip()
raise ValueError(f"Error message from NCBI: {msg}")
# Raise general error, if the error layout was not recognized
- raise ValueError(
- "No request ID and no estimated time to completion were found in the NCBI 'please wait' page."
- )
+ raise ValueError("No request ID and no estimated time to completion were found in the NCBI 'please wait' page.")
# Raise error if RTOE was found but RID was not
elif not rid:
raise ValueError(
@@ -1126,12 +1069,11 @@ def parse_blast_ref_page(handle):
except ValueError:
raise ValueError(
f"A non-integer estimated time to completion was found in the NCBI 'please wait' page: '{rtoe}'."
- )
+ ) from None
def tsv_to_df(tsv_file, headers=None, skiprows=None):
- """
- Convert tsv file to dataframe format.
+ """Convert tsv file to dataframe format.
Args:
- tsv_file File to be converted
@@ -1143,12 +1085,11 @@ def tsv_to_df(tsv_file, headers=None, skiprows=None):
return df
except pd.errors.EmptyDataError:
- raise RuntimeError(f"tsv to data frame reformatting failed.")
+ raise RuntimeError("tsv to data frame reformatting failed.") from None
def create_tmp_fasta(sequences):
- """
- Create temporary FASTA file from str or list of sequences.
+ """Create temporary FASTA file from str or list of sequences.
Args:
- sequences List of user input amino acid sequences
@@ -1158,7 +1099,7 @@ def create_tmp_fasta(sequences):
# Generate random ID
random_id = str(uuid.uuid4())
- if type(sequences) == str:
+ if isinstance(sequences, str):
sequences = [sequences]
with open(f"tmp_{random_id}.fa", "w") as f:
@@ -1169,8 +1110,7 @@ def create_tmp_fasta(sequences):
def remove_temp_files(files_to_delete):
- """
- Delete temporary files.
+ """Delete temporary files.
Args:
- files_to_delete List of paths to files to delete.
@@ -1181,8 +1121,7 @@ def remove_temp_files(files_to_delete):
def json_list_to_df(json_list, columns) -> pd.DataFrame:
- """
- Convert list of JSON objects to data frame.
+ """Convert list of JSON objects to data frame.
Args:
@@ -1192,7 +1131,6 @@ def json_list_to_df(json_list, columns) -> pd.DataFrame:
Returns data frame with columns as specified in keys.
"""
-
tmp_columns = [[] for _ in range(len(columns))]
for json_obj in json_list:
@@ -1206,7 +1144,7 @@ def json_list_to_df(json_list, columns) -> pd.DataFrame:
value = value[k]
tmp_columns[i].append(value)
- return pd.DataFrame({key[0]: value for key, value in zip(columns, tmp_columns)})
+ return pd.DataFrame({key[0]: value for key, value in zip(columns, tmp_columns, strict=False)})
# FASTA parsing functionality
@@ -1219,8 +1157,10 @@ def json_list_to_df(json_list, columns) -> pd.DataFrame:
# functionality specifically for FASTA files, maintaining compatibility with
# the original BioPython API while removing the external dependency.
+
class FastaRecord:
- """Simple FASTA record class compatible with BioPython SeqIO.SeqRecord"""
+ """Simple FASTA record class compatible with BioPython SeqIO.SeqRecord."""
+
def __init__(self, seq, id, description=""):
self.seq = seq
self.id = id
@@ -1228,35 +1168,35 @@ def __init__(self, seq, id, description=""):
class FastaIO:
- """Simple FASTA parser and writer, compatible with BioPython SeqIO interface"""
-
+ """Simple FASTA parser and writer, compatible with BioPython SeqIO interface."""
+
@staticmethod
def parse(filename, format=None):
- """Parse FASTA file and yield records. Compatible with SeqIO.parse()"""
+ """Parse FASTA file and yield records. Compatible with SeqIO.parse()."""
if format and format.lower() != "fasta":
raise ValueError(f"Unsupported format: {format}")
-
- with open(filename, 'r', encoding='utf-8') as handle:
+
+ with open(filename, encoding="utf-8") as handle:
current_id = None
current_description = ""
current_seq = []
-
+
for line in handle:
line = line.strip()
if not line:
continue
-
- if line.startswith('>'):
+
+ if line.startswith(">"):
# Yield previous record if exists
if current_id is not None:
- seq_str = ''.join(current_seq)
+ seq_str = "".join(current_seq)
yield FastaRecord(seq_str, current_id, current_description)
-
+
# Start new record
header = line[1:] # Remove '>'
- if ' ' in header:
- current_id = header.split(' ', 1)[0]
- current_description = header.split(' ', 1)[1]
+ if " " in header:
+ current_id = header.split(" ", 1)[0]
+ current_description = header.split(" ", 1)[1]
else:
current_id = header
current_description = ""
@@ -1264,27 +1204,27 @@ def parse(filename, format=None):
else:
# Accumulate sequence
current_seq.append(line)
-
+
# Yield final record if exists
if current_id is not None:
- seq_str = ''.join(current_seq)
+ seq_str = "".join(current_seq)
yield FastaRecord(seq_str, current_id, current_description)
-
+
@staticmethod
def write(records, filename, format=None):
- """Write records to FASTA file. Compatible with SeqIO.write()"""
+ """Write records to FASTA file. Compatible with SeqIO.write()."""
if format and format.lower() != "fasta":
raise ValueError(f"Unsupported format: {format}")
-
- with open(filename, 'w', encoding='utf-8') as handle:
+
+ with open(filename, "w", encoding="utf-8") as handle:
for record in records:
# Write header
- if hasattr(record, 'description') and record.description:
+ if hasattr(record, "description") and record.description:
handle.write(f">{record.id} {record.description}\n")
else:
handle.write(f">{record.id}\n")
-
+
# Write sequence (wrap at 70 characters)
seq_str = str(record.seq)
for i in range(0, len(seq_str), 70):
- handle.write(seq_str[i:i+70] + '\n')
+ handle.write(seq_str[i : i + 70] + "\n")
diff --git a/pyproject.toml b/pyproject.toml
index 8fe2f47af..0947f81fb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,3 +1,128 @@
[build-system]
-requires = ["setuptools>=42", "wheel"]
-build-backend = "setuptools.build_meta"
+build-backend = "hatchling.build"
+requires = [ "hatchling" ]
+
+[project]
+name = "gget"
+version = "0.30.7"
+description = "Efficient querying of genomic databases."
+readme = "README.md"
+keywords = [ "gget" ]
+license = "BSD-2-Clause"
+license-files = [ "LICENSE" ]
+maintainers = [
+ { name = "Laura Luebbert", email = "lauralubbert@gmail.com" },
+]
+authors = [
+ { name = "Laura Luebbert", email = "lauralubbert@gmail.com" },
+]
+requires-python = ">=3.12"
+classifiers = [
+ "Environment :: Console",
+ "Framework :: Jupyter",
+ "Intended Audience :: Science/Research",
+ "Operating System :: OS Independent",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.12",
+ "Programming Language :: Python :: 3.13",
+ "Programming Language :: Python :: 3.14",
+ "Topic :: Scientific/Engineering :: Bio-Informatics",
+ "Topic :: Utilities",
+]
+dependencies = [
+ "beautifulsoup4>=4.10",
+ "ipython",
+ "ipywidgets",
+ "lxml",
+ "matplotlib",
+ "mysql-connector-python>=8.0.32",
+ "numpy>=1.17.2",
+ "pandas>=1",
+ "requests>=2.22",
+ "tqdm",
+]
+# Optional feature dependency for `gget cellxgene` (install: pip install gget[cellxgene]).
+# No wheels for the newest Python versions yet (e.g. 3.14 via tiledbsoma).
+optional-dependencies.cellxgene = [ "cellxgene-census" ]
+# https://docs.pypi.org/project_metadata/#project-urls
+urls.Documentation = "https://pachterlab.github.io/gget"
+urls.Homepage = "https://github.com/pachterlab/gget"
+urls.Source = "https://github.com/pachterlab/gget"
+scripts.gget = "gget.main:main"
+
+[dependency-groups]
+dev = [ "pre-commit" ]
+test = [
+ "bravado==11.0.3",
+ "coverage>=7",
+ "openai<=0.28.1",
+ "parameterized==0.9",
+ "pytest>=7",
+ "pytest-cov>=6.2.1",
+]
+
+[tool.hatch]
+build.targets.wheel.packages = [ "gget" ]
+envs.default.installer = "uv"
+envs.hatch-test.matrix = [
+ { python = [ "3.12", "3.13", "3.14" ] },
+]
+# cellxgene-census (the `cellxgene` extra) has no wheels for the newest Python
+# versions yet (e.g. 3.14 via tiledbsoma), so install it only where available;
+# the gget cellxgene test skips itself when the dependency is absent.
+envs.hatch-test.overrides.matrix.python.features = [
+ { value = "cellxgene", if = [ "3.12", "3.13" ] },
+]
+# pyproject.toml is the single source of truth for the environments CI tests:
+# the workflow reads this matrix via `hatch env show --json`.
+envs.hatch-test.default-args = [ "tests" ]
+envs.hatch-test.dependency-groups = [ "test" ]
+
+[tool.ruff]
+line-length = 120
+src = [ "gget" ]
+extend-include = [ "*.ipynb" ]
+format.docstring-code-format = true
+lint.select = [
+ "B", # flake8-bugbear
+ "BLE", # flake8-blind-except
+ "C4", # flake8-comprehensions
+ "D", # pydocstyle
+ "E", # Error detected by Pycodestyle
+ "F", # Errors detected by Pyflakes
+ "I", # isort
+ "RUF100", # Report unused noqa directives
+ "TID", # flake8-tidy-imports
+ "UP", # pyupgrade
+ "W", # Warning detected by Pycodestyle
+]
+lint.ignore = [
+ "B008", # Errors from function calls in argument defaults. These are fine when the result is immutable.
+ "D100", # Missing docstring in public module
+ "D104", # Missing docstring in public package
+ "D105", # __magic__ methods are often self-explanatory, allow missing docstrings
+ "D107", # Missing docstring in __init__
+ # Disable one in each pair of mutually incompatible rules
+ "D203", # We don’t want a blank line before a class docstring
+ "D213", # <> We want docstrings to start immediately after the opening triple quote
+ "D400", # first line should end with a period [Bug: doesn’t work with single-line docstrings]
+ "D401", # First line should be in imperative mood; try rephrasing
+ "E501", # line too long -> we accept long comment lines; formatter gets rid of long code lines
+ "E731", # Do not assign a lambda expression, use a def -> lambda expression assignments are convenient
+ "E741", # allow I, O, l as variable names -> I is the identity matrix
+]
+lint.per-file-ignores."*/__init__.py" = [ "F401" ]
+lint.per-file-ignores."docs/*" = [ "I" ]
+lint.per-file-ignores."tests/*" = [ "D" ]
+lint.pydocstyle.convention = "numpy"
+
+[tool.pytest]
+ini_options.testpaths = [ "tests" ]
+ini_options.addopts = [ "-ra" ]
+
+[tool.coverage]
+run.omit = [
+ "**/test_*.py",
+ "gget/main.py",
+]
+run.source = [ "gget" ]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index d9b0ef104..000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-# Also add new dependencies to setup.cfg
-numpy>=1.17.2
-pandas>=1.0.0
-requests>=2.22.0
-ipython
-matplotlib
-mysql-connector-python>=8.0.32
-beautifulsoup4>=4.10.0
-ipywidgets
-tqdm
-lxml
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index fffdbc011..000000000
--- a/setup.cfg
+++ /dev/null
@@ -1,43 +0,0 @@
-[metadata]
-name = gget
-version = 0.30.7
-author = Laura Luebbert
-author_email = lauralubbert@gmail.com
-maintainer = Laura Luebbert
-maintainer_email = lauralubbert@gmail.com
-description = Efficient querying of genomic databases.
-long_description = file: README.md
-long_description_content_type = text/markdown
-license = BSD-2
-url = https://github.com/scverse/gget
-keywords = gget
-classifiers =
- Environment :: Console
- Framework :: Jupyter
- Intended Audience :: Science/Research
- License :: OSI Approved :: BSD License
- Operating System :: OS Independent
- Programming Language :: Python :: 3.8
- Programming Language :: Python :: 3.9
- Programming Language :: Python :: 3.10
- Programming Language :: Python :: 3.11
- Programming Language :: Python :: 3.12
- Topic :: Scientific/Engineering :: Bio-Informatics
- Topic :: Utilities
-
-[options]
-python_requires = >=3.8
-packages = find:
-include_package_data = True
-zip_safe = False
-install_requires =
- numpy>=1.17.2
- pandas>=1.0.0
- requests>=2.22.0
- ipython
- matplotlib
- mysql-connector-python>=8.0.32
- beautifulsoup4>=4.10.0
- ipywidgets
- tqdm
- lxml
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 0ff30dc7d..000000000
--- a/setup.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from setuptools import setup, find_packages
-
-setup(
- packages=find_packages(include=["gget", "gget.*"]),
- include_package_data=True,
- entry_points={
- "console_scripts": ["gget=gget.main:main"],
- },
-)
diff --git a/tests/from_json.py b/tests/from_json.py
index 2cc562a9d..1b41cbcf6 100644
--- a/tests/from_json.py
+++ b/tests/from_json.py
@@ -1,14 +1,16 @@
from __future__ import annotations
import unittest
+
unittest.TestCase.maxDiff = 10_000
# from typing import Callable, Any, Optional, Union
-import logging
-import pandas as pd
-import sys
-import json
-import hashlib
+import hashlib # noqa: E402
+import json # noqa: E402
+import logging # noqa: E402
+import sys # noqa: E402
+
+import pandas as pd # noqa: E402
# Here's a question: how many errors does Copilot know? Answer: see below.
_KNOWN_ERRORS = {
@@ -66,16 +68,17 @@ def assert_equal(self: unittest.TestCase):
result_to_test = do_call(func, td[test]["args"])
if test == "test_cosmic_defaults": # special case for cosmic
import numpy as np
+
expected_result = pd.DataFrame(expected_result[0])
expected_result = expected_result.replace({None: np.nan})
# result_to_test.equals(expected_result)
pd.testing.assert_frame_equal(result_to_test, expected_result, check_dtype=False)
return
-
+
# If result is a DataFrame, convert to list
if isinstance(result_to_test, pd.DataFrame):
result_to_test = result_to_test.dropna(axis=1).values.tolist()
-
+
self.assertEqual(result_to_test, expected_result)
return assert_equal
@@ -117,10 +120,7 @@ def assert_none(self: unittest.TestCase):
self.assertIn(
expected_log,
joined,
- msg=(
- f"Expected log substring {expected_log!r} not found. "
- f"Captured: {joined}"
- ),
+ msg=(f"Expected log substring {expected_log!r} not found. Captured: {joined}"),
)
else:
result_to_test = do_call(func, td[test]["args"])
@@ -154,9 +154,7 @@ def assert_equal_nested(self: unittest.TestCase):
result_to_test = do_call(func, td[test]["args"])
# If result is a DataFrame, convert to json (nested dataframes prevent easy listification)
if isinstance(result_to_test, pd.DataFrame):
- result_to_test = json.loads(
- result_to_test.to_json(orient="records", force_ascii=False)
- )
+ result_to_test = json.loads(result_to_test.to_json(orient="records", force_ascii=False))
self.assertEqual(result_to_test, expected_result)
@@ -170,9 +168,7 @@ def assert_equal_json_hash_nested(self: unittest.TestCase):
result_to_test = do_call(func, td[test]["args"])
# If result is a DataFrame, convert to json (nested dataframes prevent easy listification)
if isinstance(result_to_test, pd.DataFrame):
- result_to_test = json.loads(
- result_to_test.to_json(orient="records", force_ascii=False)
- )
+ result_to_test = json.loads(result_to_test.to_json(orient="records", force_ascii=False))
result_to_test = json.dumps(result_to_test)
result_to_test = hashlib.md5(result_to_test.encode()).hexdigest()
@@ -181,6 +177,7 @@ def assert_equal_json_hash_nested(self: unittest.TestCase):
return assert_equal_json_hash_nested
+
def _assert_equal_json_with_keys(name, td, func):
def assert_equal_json_with_keys(self: unittest.TestCase):
def normalize(x):
@@ -194,10 +191,10 @@ def normalize(x):
if all(isinstance(i, list) and len(i) == 2 for i in x):
try:
x = {i[0]: i[1] for i in x}
- except Exception:
+ except Exception: # noqa: BLE001
try:
x = {i[1]: i[0] for i in x}
- except Exception:
+ except Exception: # noqa: BLE001
pass
# Collapse singleton wrappers such as:
@@ -218,16 +215,14 @@ def normalize(x):
return x
return x
-
+
test = name
-
+
expected_result = td[test]["expected_result"]
result_to_test = do_call(func, td[test]["args"])
if isinstance(result_to_test, pd.DataFrame):
result_to_test = json.loads(
- result_to_test.dropna(axis=1, how="all").to_json(
- orient="records", force_ascii=False
- )
+ result_to_test.dropna(axis=1, how="all").to_json(orient="records", force_ascii=False)
)
result_to_test = normalize(result_to_test)
@@ -240,19 +235,12 @@ def normalize(x):
keys = list(expected_result[0].keys())
# Convert list-of-lists → list-of-dicts
- if (
- isinstance(result_to_test, list)
- and len(result_to_test) > 0
- and isinstance(result_to_test[0], list)
- ):
- result_to_test = [dict(zip(keys, row)) for row in result_to_test]
+ if isinstance(result_to_test, list) and len(result_to_test) > 0 and isinstance(result_to_test[0], list):
+ result_to_test = [dict(zip(keys, row, strict=False)) for row in result_to_test]
# Optional: float rounding
def round_dict(d, ndigits=10):
- return {
- k: round(v, ndigits) if isinstance(v, float) else v
- for k, v in d.items()
- }
+ return {k: round(v, ndigits) if isinstance(v, float) else v for k, v in d.items()}
result_to_test = [round_dict(r) for r in result_to_test]
expected_result = [round_dict(r) for r in expected_result]
@@ -261,12 +249,13 @@ def round_dict(d, ndigits=10):
return assert_equal_json_with_keys
+
def _error(name, td, func):
try:
# noinspection PyPep8Naming
Error = td[name]["expected_result"]
except KeyError:
- raise ValueError("Error test must have an 'expected_result' key.")
+ raise ValueError("Error test must have an 'expected_result' key.") from None
if Error not in _KNOWN_ERRORS:
raise ValueError(f"Unknown error type: {Error}")
@@ -275,9 +264,7 @@ def _error(name, td, func):
Error = _KNOWN_ERRORS[Error]
if "expected_msg" not in td[name]:
- print(
- f"^ Warning: 'error' test should have an 'expected_msg' key, but test '{name}' lacks one."
- )
+ print(f"^ Warning: 'error' test should have an 'expected_msg' key, but test '{name}' lacks one.")
def error(self: unittest.TestCase):
test = name
@@ -286,9 +273,7 @@ def error(self: unittest.TestCase):
the_exception = cm.exception
if "expected_msg" in td[test]:
- self.assertEqual(
- td[test]["expected_msg"], str(the_exception), f"Error message mismatch"
- )
+ self.assertEqual(td[test]["expected_msg"], str(the_exception), "Error message mismatch")
return error
@@ -329,9 +314,7 @@ def __new__(cls, name, bases, dct):
type_ = v["type"]
if type_ == "code_defined":
if k not in dct:
- raise ValueError(
- f"Test {k} is not defined in code, despite being of type 'code_defined'."
- )
+ raise ValueError(f"Test {k} is not defined in code, despite being of type 'code_defined'.")
continue
if type_ in local_types:
if not k.startswith("test_"):
@@ -360,9 +343,7 @@ def inner(*args, **kwargs):
print(f"Loaded test {k} of type {type_} from json.")
else:
if k not in dct:
- raise ValueError(
- f"Unknown test type: {type_} and no test method defined."
- )
+ raise ValueError(f"Unknown test type: {type_} and no test method defined.")
print(f"Unknown test type: {type_}", file=sys.stderr)
return super().__new__(cls, name, bases, dct)
diff --git a/tests/test_8cube.py b/tests/test_8cube.py
index a26bfdafc..0a3d14a51 100644
--- a/tests/test_8cube.py
+++ b/tests/test_8cube.py
@@ -1,8 +1,9 @@
-import unittest
import json
import os
+import unittest
+
+from gget.gget_8cube import gene_expression, psi_block, specificity
-from gget.gget_8cube import specificity, psi_block, gene_expression
from .from_json import from_json
# Load JSON fixture
@@ -19,9 +20,7 @@
gene_expression_tests = {k: v for k, v in fixture.items() if "gene_expression" in k}
-class TestSpecificity(
- unittest.TestCase, metaclass=from_json(specificity_tests, specificity)
-):
+class TestSpecificity(unittest.TestCase, metaclass=from_json(specificity_tests, specificity)):
"""Tests for specificity()"""
pass
@@ -33,9 +32,7 @@ class TestPsiBlock(unittest.TestCase, metaclass=from_json(psi_block_tests, psi_b
pass
-class TestGeneExpression(
- unittest.TestCase, metaclass=from_json(gene_expression_tests, gene_expression)
-):
+class TestGeneExpression(unittest.TestCase, metaclass=from_json(gene_expression_tests, gene_expression)):
"""Tests for gene_expression()"""
pass
diff --git a/tests/test_archs4.py b/tests/test_archs4.py
index c9847cadc..c6336bc91 100644
--- a/tests/test_archs4.py
+++ b/tests/test_archs4.py
@@ -1,6 +1,8 @@
-import unittest
import json
+import unittest
+
from gget.gget_archs4 import archs4
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_bgee.py b/tests/test_bgee.py
index 58af3cb49..08903199a 100644
--- a/tests/test_bgee.py
+++ b/tests/test_bgee.py
@@ -1,6 +1,8 @@
-import unittest
import json
+import unittest
+
from gget.gget_bgee import bgee
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_blast.py b/tests/test_blast.py
index 9a668c34f..8fea295e6 100644
--- a/tests/test_blast.py
+++ b/tests/test_blast.py
@@ -1,12 +1,14 @@
-import unittest
import json
+import unittest
+
from gget.gget_blast import blast
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
with open("./tests/fixtures/test_blast.json") as json_file:
blast_dict = json.load(json_file)
+
class TestBlast(unittest.TestCase, metaclass=from_json(blast_dict, blast)):
pass # all tests are loaded from json
-
\ No newline at end of file
diff --git a/tests/test_blat.py b/tests/test_blat.py
index d871c27bb..adcd114b4 100644
--- a/tests/test_blat.py
+++ b/tests/test_blat.py
@@ -1,6 +1,8 @@
-import unittest
import json
+import unittest
+
from gget.gget_blat import blat
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_cbio.py b/tests/test_cbio.py
index 9e9629691..66040f191 100644
--- a/tests/test_cbio.py
+++ b/tests/test_cbio.py
@@ -1,9 +1,10 @@
-import hashlib
+import json
import os
import unittest
-import json
-from gget.gget_cbio import download_cbioportal_data, cbio_search
-from .from_json import from_json, do_call
+
+from gget.gget_cbio import cbio_search, download_cbioportal_data
+
+from .from_json import do_call, from_json
# Load dictionary containing arguments and expected results
with open("./tests/fixtures/test_cbio_search.json") as json_file:
@@ -13,9 +14,7 @@
cb_dict = json.load(json_file)
-class TestCbioSearch(
- unittest.TestCase, metaclass=from_json(cb_search_dict, cbio_search)
-):
+class TestCbioSearch(unittest.TestCase, metaclass=from_json(cb_search_dict, cbio_search)):
pass # all tests are loaded from json
@@ -24,9 +23,7 @@ def cbio_download(self: unittest.TestCase):
test = name
expected_result = td[test]["expected_result"]
- if not isinstance(expected_result, dict) and not isinstance(
- expected_result, bool
- ):
+ if not isinstance(expected_result, dict) and not isinstance(expected_result, bool):
raise ValueError("Expected result must be a dictionary or a boolean")
result = do_call(func, td[test]["args"])
@@ -35,7 +32,7 @@ def cbio_download(self: unittest.TestCase):
# # check that all files downloaded
# self.assertTrue(result)
- for file_name, expected_hash in expected_result.items():
+ for file_name, _expected_hash in expected_result.items():
if os.path.exists(file_name):
# # check non-empty
if os.path.getsize(file_name) == 0:
@@ -57,8 +54,6 @@ def cbio_download(self: unittest.TestCase):
class TestCbio(
unittest.TestCase,
- metaclass=from_json(
- cb_dict, download_cbioportal_data, {"cbio_download": _cbio_download}
- ),
+ metaclass=from_json(cb_dict, download_cbioportal_data, {"cbio_download": _cbio_download}),
):
pass # all tests are loaded from json
diff --git a/tests/test_cellxgene.py b/tests/test_cellxgene.py
index 07acf7301..29b77e67d 100644
--- a/tests/test_cellxgene.py
+++ b/tests/test_cellxgene.py
@@ -1,7 +1,14 @@
-import unittest
-import pandas as pd
+import importlib.util
import json
-from gget.gget_cellxgene import cellxgene, SUPPORTED_SPECIES
+import unittest
+
+from gget.gget_cellxgene import SUPPORTED_SPECIES, cellxgene
+
+# cellxgene-census has no wheels for some newer Python versions (e.g. 3.14, via
+# its tiledbsoma dependency). The live integration tests below need it, so they
+# skip when it is unavailable; the validation tests do not need it (the species
+# allowlist check raises before the optional dependency is imported) and always run.
+_HAS_CELLXGENE_CENSUS = importlib.util.find_spec("cellxgene_census") is not None
# Load dictionary containing arguments and expected results
with open("./tests/fixtures/test_cellxgene.json") as json_file:
@@ -9,9 +16,7 @@
def repr_dict(adata):
- """
- Function to convert the items/structure of an AnnData object to a dictionary.
- """
+ """Convert the items/structure of an AnnData object to a dictionary."""
d = {}
for attr in (
"n_obs",
@@ -35,6 +40,7 @@ def repr_dict(adata):
return d
+@unittest.skipUnless(_HAS_CELLXGENE_CENSUS, "cellxgene-census is not installed")
class TestCellxgene(unittest.TestCase):
def test_cellxgene_adata(self):
test = "test_cellxgene_adata"
diff --git a/tests/test_compile.py b/tests/test_compile.py
index 619fef4e7..ed0a71c9a 100644
--- a/tests/test_compile.py
+++ b/tests/test_compile.py
@@ -1,12 +1,9 @@
+import contextlib
+import os
import unittest
# Used here to mock different operating systems
-from unittest.mock import patch
-from unittest.mock import MagicMock
-
-import os
-import shutil
-import contextlib
+from unittest.mock import MagicMock, patch
from gget.compile import compile_muscle
@@ -23,6 +20,7 @@ def test_compiler_windows(self):
with contextlib.redirect_stdout(open(os.devnull, "w")):
compile_muscle()
+
## The make command requires different programs for each OS, so these tests do not work universally
# class TestCompilerLinux(unittest.TestCase):
# def test_compiler_linux(self):
diff --git a/tests/test_cosmic.py b/tests/test_cosmic.py
index 5a5b33561..9a7b3fe93 100644
--- a/tests/test_cosmic.py
+++ b/tests/test_cosmic.py
@@ -1,11 +1,11 @@
-import unittest
+import json
import os
import pathlib as pl
-import pandas as pd
-import json
import time
+import unittest
from gget.gget_cosmic import cosmic
+
# from gget.utils import get_latest_cosmic
from .from_json import from_json
@@ -28,10 +28,12 @@
class TestCaseBase(unittest.TestCase):
def assertIsFile(self, path):
if not pl.Path(path).resolve().is_file():
- raise AssertionError("File does not exist: %s" % str(path))
+ raise AssertionError(f"File does not exist: {path}")
-class TestCosmicWorkflow(TestCaseBase, metaclass=from_json(cosmic_dict, cosmic, pre_test=lambda: time.sleep(sleep_time))):
+class TestCosmicWorkflow(
+ TestCaseBase, metaclass=from_json(cosmic_dict, cosmic, pre_test=lambda: time.sleep(sleep_time))
+):
"""
Combined test class to:
1. Download COSMIC cancer_example data
diff --git a/tests/test_diamond.py b/tests/test_diamond.py
index 5bf602744..24db19e64 100644
--- a/tests/test_diamond.py
+++ b/tests/test_diamond.py
@@ -1,7 +1,8 @@
-import unittest
import json
+import unittest
from gget.gget_diamond import diamond
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_elm.py b/tests/test_elm.py
index 983a74a52..de8f67c35 100644
--- a/tests/test_elm.py
+++ b/tests/test_elm.py
@@ -1,5 +1,5 @@
-import unittest
import json
+import unittest
from gget.gget_elm import elm
from gget.gget_setup import setup as gget_setup
@@ -10,16 +10,14 @@
gget_setup(module="elm")
+
class TestELM(unittest.TestCase):
def test_elm_uniprot_id_in_elm(self):
test = "test1"
expected_result = elm_dict[test]["expected_result"]
result1, result2 = elm(**elm_dict[test]["args"])
- result_to_test = (
- result1.dropna(axis=1).values.tolist()
- + result2.dropna(axis=1).values.tolist()[15:20]
- )
+ result_to_test = result1.dropna(axis=1).values.tolist() + result2.dropna(axis=1).values.tolist()[15:20]
self.assertListEqual(result_to_test, expected_result)
@@ -28,10 +26,7 @@ def test_elm_uniprot_id_new(self):
expected_result = elm_dict[test]["expected_result"]
result1, result2 = elm(**elm_dict[test]["args"])
- result_to_test = (
- result1.dropna(axis=1).values.tolist()
- + result2.dropna(axis=1).values.tolist()[15:20]
- )
+ result_to_test = result1.dropna(axis=1).values.tolist() + result2.dropna(axis=1).values.tolist()[15:20]
self.assertListEqual(result_to_test, expected_result)
@@ -40,9 +35,6 @@ def test_elm_uniprot_aminoacidseq(self):
expected_result = elm_dict[test]["expected_result"]
result1, result2 = elm(**elm_dict[test]["args"])
- result_to_test = (
- result1.dropna(axis=1).values.tolist()
- + result2.dropna(axis=1).values.tolist()[15:20]
- )
+ result_to_test = result1.dropna(axis=1).values.tolist() + result2.dropna(axis=1).values.tolist()[15:20]
self.assertListEqual(result_to_test, expected_result)
diff --git a/tests/test_enrichr.py b/tests/test_enrichr.py
index 5e0da6c85..3df50fa34 100644
--- a/tests/test_enrichr.py
+++ b/tests/test_enrichr.py
@@ -1,9 +1,11 @@
-import unittest
-import pandas as pd
import json
+import math
+import unittest
+
import matplotlib
import matplotlib.pyplot as plt
-import math
+import pandas as pd
+
from .from_json import from_json
# Prevent matplotlib from opening windows
@@ -31,10 +33,7 @@ def test_enrichr_background(self):
# If result is a DataFrame, convert to list
if isinstance(result_to_test, pd.DataFrame):
result_to_test = result_to_test.values.tolist()[:20]
- result_to_test = [
- list(map(lambda x: x if x != math.inf else "inf", i))
- for i in result_to_test
- ]
+ result_to_test = [[x if x != math.inf else "inf" for x in i] for i in result_to_test]
self.assertListEqual(result_to_test, expected_result)
@@ -45,10 +44,7 @@ def test_enrichr_background_ensembl(self):
# If result is a DataFrame, convert to list
if isinstance(result_to_test, pd.DataFrame):
result_to_test = result_to_test.values.tolist()
- result_to_test = [
- list(map(lambda x: x if x != math.inf else "inf", i))
- for i in result_to_test
- ]
+ result_to_test = [[x if x != math.inf else "inf" for x in i] for i in result_to_test]
self.assertListEqual(result_to_test, expected_result)
diff --git a/tests/test_gpt.py b/tests/test_gpt.py
index 824a70227..d3853a557 100644
--- a/tests/test_gpt.py
+++ b/tests/test_gpt.py
@@ -1,5 +1,6 @@
import unittest
from unittest.mock import patch
+
from gget.gget_gpt import gpt
diff --git a/tests/test_info.py b/tests/test_info.py
index 359613532..c4157bb11 100644
--- a/tests/test_info.py
+++ b/tests/test_info.py
@@ -1,9 +1,11 @@
-import unittest
# import unittest.mock
# import pandas as pd
import json
+import unittest
+
# import time
from gget.gget_info import info
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
@@ -13,9 +15,11 @@
# Sleep time in seconds (wait [sleep_time] seconds between server requests to avoid 502 errors for WB and FB IDs)
# sleep_time = 15
+
class TestInfo(unittest.TestCase, metaclass=from_json(info_dict, info)):
pass # all tests are loaded from json
+
# # todo convert to json loading once wormbase & flybase IDs are fixed. At that point, the json test framework will need a way to handle the ANY values
# class TestInfo(unittest.TestCase):
# maxDiff = None
diff --git a/tests/test_muscle.py b/tests/test_muscle.py
index 10a1202de..5ba50e509 100644
--- a/tests/test_muscle.py
+++ b/tests/test_muscle.py
@@ -1,10 +1,10 @@
+import contextlib
+import filecmp
+import os
import unittest
# Library to test functions that have calls to print()
from unittest import mock
-import os
-import contextlib
-import filecmp
from gget.gget_muscle import muscle
@@ -28,7 +28,7 @@ def test_muscle_nt(self):
)
def tearDown(self):
- super(TestMuscle, self).tearDown()
+ super().tearDown()
# Delete temporary result file
os.remove("tests/fixtures/tmp.afa")
@@ -49,11 +49,6 @@ def test_muscle_nt_txt(self):
"The reference and muscle nucleotide alignment are not the same.",
)
- def tearDown(self):
- super(TestMuscle, self).tearDown()
- # Delete temporary result file
- os.remove("tests/fixtures/tmp.afa")
-
class TestMuscleSuper(unittest.TestCase):
def test_muscle_nt_super5(self):
@@ -74,7 +69,7 @@ def test_muscle_nt_super5(self):
)
def tearDown(self):
- super(TestMuscleSuper, self).tearDown()
+ super().tearDown()
# Delete temporary result file
os.remove("tests/fixtures/tmp.afa")
@@ -98,7 +93,7 @@ def test_muscle_aa(self):
)
def tearDown(self):
- super(TestMuscleAA, self).tearDown()
+ super().tearDown()
# Delete temporary result file
os.remove("tests/fixtures/tmp.afa")
@@ -127,7 +122,7 @@ def test_muscle_seqs_as_input(self):
)
def tearDown(self):
- super(TestMuscleSeqsInput, self).tearDown()
+ super().tearDown()
# Delete temporary result file
os.remove("tests/fixtures/tmp.afa")
@@ -151,7 +146,7 @@ def test_muscle_aa_super5(self):
)
def tearDown(self):
- super(TestMuscleAASuper, self).tearDown()
+ super().tearDown()
# Delete temporary result file
os.remove("tests/fixtures/tmp.afa")
@@ -167,6 +162,4 @@ def test_muscle_print_nt(self):
muscle(fasta)
# print_mock.assert_called_with("\n")
# print_mock.assert_called_with("test1\n", "\x1b[38;5;15m\x1b[48;5;9mA\x1b[0;0m")
- print_mock.assert_called_with(
- "test2\n", "\t", "\x1b[38;5;15m\x1b[48;5;9mA\x1b[0;0m"
- )
+ print_mock.assert_called_with("test2\n", "\t", "\x1b[38;5;15m\x1b[48;5;9mA\x1b[0;0m")
diff --git a/tests/test_mutate.py b/tests/test_mutate.py
index e990ad8f9..5d8f7250a 100644
--- a/tests/test_mutate.py
+++ b/tests/test_mutate.py
@@ -1,16 +1,15 @@
import json
-
-import pytest
+import os
+import tempfile
import unittest
+
import gget
import pandas as pd
-import os
-import tempfile
-from .from_json import from_json, do_call
+import pytest
+
+from .from_json import do_call, from_json
-LONG_SEQUENCE = (
- "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG"
-)
+LONG_SEQUENCE = "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG"
EXTRA_LONG_SEQUENCE = "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG"
LONG_SEQUENCE_WITH_N = "CCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCNCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCGCCCCTCCCCGCCCCACCCCG"
@@ -46,7 +45,7 @@ def create_temp_files():
temp_fasta_file = tempfile.NamedTemporaryFile(delete=False, suffix=".fasta")
with open(temp_fasta_file.name, "w") as fasta_file:
- for seq_id, sequence in zip(seq_ID_list, sequence_list):
+ for seq_id, sequence in zip(seq_ID_list, sequence_list, strict=False):
fasta_file.write(f">{seq_id}\n")
fasta_file.write(f"{sequence}\n")
@@ -65,15 +64,9 @@ def assert_global_variables_zero(
number_index_errors=0,
):
assert gget.gget_mutate.intronic_mutations == number_intronic_position_mutations
- assert (
- gget.gget_mutate.posttranslational_region_mutations
- == number_posttranslational_region_mutations
- )
+ assert gget.gget_mutate.posttranslational_region_mutations == number_posttranslational_region_mutations
assert gget.gget_mutate.uncertain_mutations == number_uncertain_mutations
- assert (
- gget.gget_mutate.ambiguous_position_mutations
- == number_ambiguous_position_mutations
- )
+ assert gget.gget_mutate.ambiguous_position_mutations == number_ambiguous_position_mutations
assert gget.gget_mutate.mut_idx_outside_seq == number_index_errors
@@ -88,9 +81,7 @@ def _recursive_replace(v, old: str, new: str, exact=False):
return v.replace(old, new)
elif isinstance(v, dict):
return {
- _recursive_replace(k, old, new, exact=exact): _recursive_replace(
- v, old, new, exact=exact
- )
+ _recursive_replace(k, old, new, exact=exact): _recursive_replace(v, old, new, exact=exact)
for k, v in v.items()
}
elif isinstance(v, list):
@@ -212,9 +203,7 @@ class TestMutate(
def test_csv_of_mutations(create_temp_files):
mutation_temp_csv_file, sequence_temp_fasta_path = create_temp_files
- result = gget.mutate(
- sequences=sequence_temp_fasta_path, mutations=mutation_temp_csv_file
- )
+ result = gget.mutate(sequences=sequence_temp_fasta_path, mutations=mutation_temp_csv_file)
assert result == [
"GCCCCACCCCGCCCCTCCCCGCCCCACCCCACCCCTCCCCGCCCCACCCCGCCCCTCCCCG",
diff --git a/tests/test_opentargets.py b/tests/test_opentargets.py
index 3879b0921..e73ced1cc 100644
--- a/tests/test_opentargets.py
+++ b/tests/test_opentargets.py
@@ -1,6 +1,8 @@
-import unittest
import json
+import unittest
+
from gget.gget_opentargets import opentargets
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_pdb.py b/tests/test_pdb.py
index ecaacfeb6..a805e09a1 100644
--- a/tests/test_pdb.py
+++ b/tests/test_pdb.py
@@ -1,8 +1,8 @@
-import unittest
-import pandas as pd
-import json
import filecmp
+import json
import os
+import unittest
+
from gget.gget_pdb import pdb
# Load dictionary containing arguments and expected results
@@ -79,7 +79,7 @@ def test_pdb_pdb(self):
)
def tearDown(self):
- super(TestPDB, self).tearDown()
+ super().tearDown()
# Delete temporary result file
try:
os.remove("4ACQ.pdb")
diff --git a/tests/test_ref.py b/tests/test_ref.py
index 6111e4dfe..c6ab3e0e8 100644
--- a/tests/test_ref.py
+++ b/tests/test_ref.py
@@ -1,6 +1,8 @@
-import unittest
import json
+import unittest
+
from gget.gget_ref import ref
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_search.py b/tests/test_search.py
index b6b911cbf..f284c5368 100644
--- a/tests/test_search.py
+++ b/tests/test_search.py
@@ -1,7 +1,8 @@
-import unittest
-import pandas as pd
import json
+import unittest
+
from gget.gget_search import search
+
from .from_json import from_json
# Load dictionary containing arguments and expected results
diff --git a/tests/test_seq.py b/tests/test_seq.py
index 94893be2c..8d34e4ed7 100644
--- a/tests/test_seq.py
+++ b/tests/test_seq.py
@@ -1,7 +1,7 @@
-import unittest
-import pandas as pd
import json
import time
+import unittest
+
from gget.gget_seq import seq
# Load dictionary containing arguments and expected results
@@ -11,6 +11,7 @@
# Sleep time in seconds (wait [sleep_time] seconds between server requests to avoid 502 errors for WB and FB IDs)
sleep_time = 10
+
# todo convert to json loading once wormbase & flybase IDs are fixed. At that point, the json test framework will need a way to handle the ANY values
class TestSeq(unittest.TestCase):
def test_seq_gene(self):
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 125b36578..1eda8078d 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,33 +1,30 @@
import unittest
-import numpy as np
+
+from gget.constants import ENSEMBL_FTP_URL_NV, ENSEMBL_REST_API, UNIPROT_REST_API
from gget.utils import (
- n_colors,
aa_colors,
- get_uniprot_seqs,
+ find_latest_ens_rel,
get_uniprot_info,
+ get_uniprot_seqs,
+ n_colors,
+ read_fasta,
+ ref_species_options,
rest_query,
- find_latest_ens_rel,
search_species_options,
- ref_species_options,
- read_fasta,
)
-from gget.constants import UNIPROT_REST_API, ENSEMBL_REST_API, ENSEMBL_FTP_URL_NV
-
from .fixtures import (
- LATEST_ENS_RELEASE,
- SPECIES_OPTIONS,
IV_SPECIES_OPTIONS,
- REF_SPECIES_OPTIONS,
+ LATEST_ENS_RELEASE,
REF_IV_SPECIES_OPTIONS,
+ REF_SPECIES_OPTIONS,
+ SPECIES_OPTIONS,
)
class TestUtils(unittest.TestCase):
def test_read_fasta(self):
- result_to_test1, result_to_test2 = read_fasta(
- "tests/fixtures/muscle_nt_test.fa"
- )
+ result_to_test1, result_to_test2 = read_fasta("tests/fixtures/muscle_nt_test.fa")
result_to_test = result_to_test1 + result_to_test2
expected_result1 = [
@@ -55,9 +52,7 @@ def test_aa_colors(self):
self.assertEqual(result_to_test, expected_result)
def test_get_uniprot_seqs(self):
- df = get_uniprot_seqs(
- UNIPROT_REST_API, ["ENST00000392653.3", "ENST00000392657.7"]
- )
+ df = get_uniprot_seqs(UNIPROT_REST_API, ["ENST00000392653.3", "ENST00000392657.7"])
result_to_test = df.values.tolist()
expected_result = [
[
@@ -179,9 +174,7 @@ def test_ref_species_options(self):
self.assertEqual(result_to_test, expected_result)
def test_ref_iv_species_options(self):
- result_to_test = ref_species_options(
- database=ENSEMBL_FTP_URL_NV, which="dna", release=55
- )
+ result_to_test = ref_species_options(database=ENSEMBL_FTP_URL_NV, which="dna", release=55)
expected_result = REF_IV_SPECIES_OPTIONS
self.assertEqual(result_to_test, expected_result)
diff --git a/tests/test_virus.py b/tests/test_virus.py
index 1be574528..12798ac11 100644
--- a/tests/test_virus.py
+++ b/tests/test_virus.py
@@ -129,75 +129,76 @@
Total: 186 tests
"""
-import unittest
+
+import calendar
+import functools
import json
import os
-import re
import shutil
import subprocess
import tempfile
import time
-import functools
+import unittest
import zipfile
-import calendar
from datetime import datetime
import pandas as pd
-
from gget.gget_virus import (
- virus,
- _get_datasets_path,
+ _batch_accessions_for_url,
+ _calculate_max_accessions_per_batch,
+ _clean_xml_declarations,
_clear_datasets_cache,
- _get_modified_virus_name,
- _track_failed_operation,
- _validate_datasets_binary,
+ _deduplicate_metadata_against_baseline,
+ _force_garbage_collection,
+ _genbank_xml_to_csv,
+ _get_datasets_path,
_get_datasets_version,
_get_gget_version,
+ _get_memory_usage,
+ _get_modified_virus_name,
+ _local_name,
+ _merge_baseline_with_new,
_parse_accession_input,
_parse_baseline_file,
- _deduplicate_metadata_against_baseline,
- _save_partial_metadata,
- _merge_baseline_with_new,
- _calculate_max_accessions_per_batch,
- _batch_accessions_for_url,
- _retry_with_exponential_backoff,
_parse_date,
+ _parse_genbank_xml,
_parse_partial_date_for_range_check,
- _clean_xml_declarations,
- _local_name,
+ _retry_with_exponential_backoff,
+ _save_partial_metadata,
+ _stream_copy_fasta,
+ _track_failed_operation,
_unzip_file,
- _get_memory_usage,
- _force_garbage_collection,
- is_sars_cov2_query,
- is_alphainfluenza_query,
- load_metadata_from_api_reports,
+ _validate_datasets_binary,
+ _write_fasta_record,
check_min_max,
- filter_metadata_only,
- filter_genbank_metadata,
filter_cached_metadata_for_unused_filters,
- _write_fasta_record,
- _stream_copy_fasta,
+ filter_genbank_metadata,
+ filter_metadata_only,
filter_sequences,
- save_command_summary,
+ is_alphainfluenza_query,
+ is_sars_cov2_query,
+ load_metadata_from_api_reports,
merge_metadata_csvs,
- save_metadata_to_csv,
- _genbank_xml_to_csv,
- _parse_genbank_xml,
+ save_command_summary,
save_genbank_metadata_to_csv,
+ save_metadata_to_csv,
+ virus,
)
+
from .from_json import from_json
def retry_on_network_error(max_retries=3, delay=5):
"""Decorator to retry tests that may fail due to network issues.
-
+
This is useful for tests that make real API calls to NCBI, which can
occasionally time out or fail due to network flakiness.
-
+
Args:
max_retries: Maximum number of retry attempts (default: 3)
delay: Seconds to wait between retries (default: 5)
"""
+
def decorator(test_func):
@functools.wraps(test_func)
def wrapper(*args, **kwargs):
@@ -208,7 +209,7 @@ def wrapper(*args, **kwargs):
except Exception as e:
# Only retry on network-related errors
error_msg = str(e).lower()
- if any(keyword in error_msg for keyword in ['timeout', 'timed out', 'connection', 'network']):
+ if any(keyword in error_msg for keyword in ["timeout", "timed out", "connection", "network"]):
last_exception = e
if attempt < max_retries - 1:
time.sleep(delay)
@@ -217,9 +218,12 @@ def wrapper(*args, **kwargs):
raise
# If all retries failed, raise the last exception
raise last_exception
+
return wrapper
+
return decorator
+
# Load dictionary containing arguments and expected results
with open("./tests/fixtures/test_virus.json") as json_file:
virus_dict = json.load(json_file)
@@ -227,14 +231,14 @@ def wrapper(*args, **kwargs):
class TestVirus(unittest.TestCase, metaclass=from_json(virus_dict, virus)):
"""Test suite for gget.virus module.
-
+
This comprehensive test suite covers:
-
+
1. Input Validation (19 JSON-defined tests):
- Type checking for boolean, string, and integer parameters
- Value validation (completeness, batch sizes, virus names)
- Range validation (min/max pairs for dates, lengths, counts)
-
+
2. Functional Tests (18 code-defined tests):
- Basic file creation and accession downloads
- Individual filter functionality (host, completeness, length, annotated, refseq)
@@ -245,172 +249,174 @@ class TestVirus(unittest.TestCase, metaclass=from_json(virus_dict, virus)):
- GenBank metadata retrieval
- Multiple filter combinations
- Integer virus ID handling
-
+
3. Data Quality & Verification Tests (6 code-defined tests):
- Relationship checks: FASTA/CSV/JSONL count consistency
- Filter verification: Host and release date filter effectiveness
- Schema validation: Expected metadata columns exist
- Completeness filter verification
- Multi-filter relationship checks
-
+
Coverage: 85% of parameters tested (29/34), with 43 total test cases.
See module docstring for detailed parameter coverage analysis.
"""
-
+
@classmethod
def setUpClass(cls):
"""Set up test fixtures that are shared across all tests."""
cls.test_output_dir = "test_virus_output"
-
+
@classmethod
def tearDownClass(cls):
"""Clean up after all tests have run."""
# Clean up test output directory if it exists
if os.path.exists(cls.test_output_dir):
shutil.rmtree(cls.test_output_dir)
-
+
def setUp(self):
"""Set up before each test method."""
# Create a fresh test output directory
if os.path.exists(self.test_output_dir):
shutil.rmtree(self.test_output_dir)
os.makedirs(self.test_output_dir, exist_ok=True)
-
+
def tearDown(self):
"""Clean up after each test method."""
# Clean up test output directory
if os.path.exists(self.test_output_dir):
try:
shutil.rmtree(self.test_output_dir)
- except Exception:
+ except Exception: # noqa: BLE001
pass # Ignore cleanup errors
-
+
def _check_output_files(self, virus_name, outfolder):
"""Helper method to check if expected output files were created.
-
+
Args:
virus_name: Name of the virus (used in file naming)
outfolder: Output folder where files should be created
-
+
Returns:
dict: Dictionary with file paths and existence status
"""
# Clean virus name for file naming (replace spaces with underscores)
virus_clean = virus_name.replace(" ", "_")
-
+
expected_files = {
"fasta": os.path.join(outfolder, f"{virus_clean}_sequences.fasta"),
"csv": os.path.join(outfolder, f"{virus_clean}_metadata.csv"),
- "jsonl": os.path.join(outfolder, f"{virus_clean}_metadata.jsonl")
+ "jsonl": os.path.join(outfolder, f"{virus_clean}_metadata.jsonl"),
}
-
+
results = {}
for file_type, file_path in expected_files.items():
results[file_type] = {
"path": file_path,
"exists": os.path.exists(file_path),
- "size": os.path.getsize(file_path) if os.path.exists(file_path) else 0
+ "size": os.path.getsize(file_path) if os.path.exists(file_path) else 0,
}
-
+
return results
-
+
def _count_fasta_sequences(self, fasta_file):
"""Count the number of sequences in a FASTA file.
-
+
Args:
fasta_file: Path to FASTA file
-
+
Returns:
int: Number of sequences
"""
count = 0
if os.path.exists(fasta_file):
- with open(fasta_file, 'r') as f:
+ with open(fasta_file) as f:
for line in f:
- if line.startswith('>'):
+ if line.startswith(">"):
count += 1
return count
-
+
def _count_jsonl_records(self, jsonl_file):
"""Count the number of records in a JSONL file.
-
+
Args:
jsonl_file: Path to JSONL file
-
+
Returns:
int: Number of records
"""
count = 0
if os.path.exists(jsonl_file):
- with open(jsonl_file, 'r') as f:
+ with open(jsonl_file) as f:
for line in f:
if line.strip():
count += 1
return count
-
+
def _count_csv_records(self, csv_file):
"""Count the number of records in a CSV file (excluding header).
-
+
Args:
csv_file: Path to CSV file
-
+
Returns:
int: Number of records (excluding header)
"""
count = 0
if os.path.exists(csv_file):
- with open(csv_file, 'r') as f:
+ with open(csv_file) as f:
# Skip header
next(f, None)
for line in f:
if line.strip():
count += 1
return count
-
+
def _parse_csv_metadata(self, csv_file):
"""Parse CSV metadata file and return records as list of dicts.
-
+
Args:
csv_file: Path to CSV file
-
+
Returns:
list: List of dictionaries containing metadata records
"""
import csv
+
records = []
if os.path.exists(csv_file):
- with open(csv_file, 'r', encoding='utf-8') as f:
+ with open(csv_file, encoding="utf-8") as f:
reader = csv.DictReader(f)
for row in reader:
records.append(row)
return records
-
+
def _get_csv_columns(self, csv_file):
"""Get column names from CSV file.
-
+
Args:
csv_file: Path to CSV file
-
+
Returns:
list: List of column names
"""
import csv
+
if os.path.exists(csv_file):
- with open(csv_file, 'r', encoding='utf-8') as f:
+ with open(csv_file, encoding="utf-8") as f:
reader = csv.DictReader(f)
return reader.fieldnames
return []
-
+
# =========================================================================
# FUNCTIONAL TESTS: Basic file creation and filter functionality
# =========================================================================
# These tests verify that the virus function creates output files
# correctly and that individual filters work as expected.
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_specific_accession_file_creation(self):
"""Test that files are created when downloading a specific accession.
-
+
Downloads SARS-CoV-2 reference sequence (NC_045512.2) and verifies:
- Function returns None (writes to disk)
- All three output files created (FASTA, CSV, JSONL)
@@ -419,124 +425,100 @@ def test_virus_specific_accession_file_creation(self):
"""
virus_name = "NC_045512.2"
outfolder = self.test_output_dir
-
+
# Run the function (should create files, returns None)
- result = virus(
- virus=virus_name,
- is_accession=True,
- outfolder=outfolder
- )
-
+ result = virus(virus=virus_name, is_accession=True, outfolder=outfolder)
+
# Check that function returns None
self.assertIsNone(result)
-
+
# Check that output files were created
files = self._check_output_files(virus_name, outfolder)
-
+
# Assert all files exist
- self.assertTrue(files["fasta"]["exists"],
- f"FASTA file not created: {files['fasta']['path']}")
- self.assertTrue(files["csv"]["exists"],
- f"CSV file not created: {files['csv']['path']}")
- self.assertTrue(files["jsonl"]["exists"],
- f"JSONL file not created: {files['jsonl']['path']}")
-
+ self.assertTrue(files["fasta"]["exists"], f"FASTA file not created: {files['fasta']['path']}")
+ self.assertTrue(files["csv"]["exists"], f"CSV file not created: {files['csv']['path']}")
+ self.assertTrue(files["jsonl"]["exists"], f"JSONL file not created: {files['jsonl']['path']}")
+
# Assert files are not empty
self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty")
self.assertGreater(files["csv"]["size"], 0, "CSV file is empty")
self.assertGreater(files["jsonl"]["size"], 0, "JSONL file is empty")
-
+
# Count sequences (should be 1 for a specific accession)
seq_count = self._count_fasta_sequences(files["fasta"]["path"])
self.assertGreaterEqual(seq_count, 1, "No sequences found in FASTA file")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_host_filter(self):
"""Test that host filter works and creates appropriate files."""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- host="human",
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, host="human", outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with host filter")
self.assertTrue(files["csv"]["exists"], "CSV file not created with host filter")
self.assertTrue(files["jsonl"]["exists"], "JSONL file not created with host filter")
-
+
# Verify that files contain data
self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with host filter")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_completeness_filter(self):
"""Test that completeness filter works correctly."""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- nuc_completeness="complete",
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, nuc_completeness="complete", outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with completeness filter")
self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with completeness filter")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_length_filters(self):
"""Test that sequence length filters work correctly."""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- min_seq_length=10000,
- max_seq_length=11000,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, min_seq_length=10000, max_seq_length=11000, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with length filters")
-
+
# Verify sequences are within expected length range
# This would require parsing the FASTA file, which we do with count
seq_count = self._count_fasta_sequences(files["fasta"]["path"])
self.assertGreater(seq_count, 0, "No sequences passed length filters")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_annotated_filter(self):
"""Test that annotated filter works correctly."""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- annotated=True,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, annotated=True, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with annotated filter")
self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with annotated filter")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_multiple_filters(self):
"""Test that multiple filters can be combined correctly."""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
+
result = virus(
virus=virus_name,
host="human",
@@ -544,226 +526,215 @@ def test_virus_with_multiple_filters(self):
min_seq_length=10500,
max_seq_length=11000,
annotated=True,
- outfolder=outfolder
+ outfolder=outfolder,
)
-
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with multiple filters")
self.assertTrue(files["csv"]["exists"], "CSV file not created with multiple filters")
self.assertTrue(files["jsonl"]["exists"], "JSONL file not created with multiple filters")
-
+
# Check that filters reduced the dataset (should have some sequences)
seq_count = self._count_fasta_sequences(files["fasta"]["path"])
self.assertGreater(seq_count, 0, "No sequences passed multiple filters")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_integer_virus_id(self):
"""Test that integer virus IDs are handled correctly.
-
+
Tests using Zika virus taxon ID (64320) as integer input.
Verifies that integer IDs are properly converted and files created.
"""
virus_id = 64320 # Zika virus taxon ID
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_id,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_id, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
# Check files with string version of virus ID
virus_clean = str(virus_id)
expected_fasta = os.path.join(outfolder, f"{virus_clean}_sequences.fasta")
- self.assertTrue(os.path.exists(expected_fasta),
- f"FASTA file not created for integer virus ID: {expected_fasta}")
-
+ self.assertTrue(
+ os.path.exists(expected_fasta), f"FASTA file not created for integer virus ID: {expected_fasta}"
+ )
+
# =========================================================================
# DATA QUALITY & VERIFICATION TESTS
# =========================================================================
# These tests verify data consistency, filter effectiveness, and that
# API/data source changes would be detected. They go beyond simple file
# existence checks to validate actual data quality.
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_relationship_check_counts_match(self):
"""Test that FASTA sequence count matches CSV and JSONL record counts.
-
+
Downloads a specific accession and verifies:
- Number of FASTA sequences = number of CSV records = number of JSONL records
- No data loss between different output formats
- At least one record in all files
-
+
This catches: Format conversion bugs, data loss, parsing errors.
"""
virus_name = "NC_045512.2"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- is_accession=True,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, is_accession=True, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
-
+
# Count records in each file type
fasta_count = self._count_fasta_sequences(files["fasta"]["path"])
csv_count = self._count_csv_records(files["csv"]["path"])
jsonl_count = self._count_jsonl_records(files["jsonl"]["path"])
-
+
# All counts should match
- self.assertEqual(fasta_count, csv_count,
- f"FASTA count ({fasta_count}) does not match CSV count ({csv_count})")
- self.assertEqual(fasta_count, jsonl_count,
- f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count})")
- self.assertEqual(csv_count, jsonl_count,
- f"CSV count ({csv_count}) does not match JSONL count ({jsonl_count})")
-
+ self.assertEqual(fasta_count, csv_count, f"FASTA count ({fasta_count}) does not match CSV count ({csv_count})")
+ self.assertEqual(
+ fasta_count, jsonl_count, f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count})"
+ )
+ self.assertEqual(csv_count, jsonl_count, f"CSV count ({csv_count}) does not match JSONL count ({jsonl_count})")
+
# Should have at least one record
self.assertGreater(fasta_count, 0, "No records found in output files")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_host_filter_verification(self):
"""Test that host filter actually filters by host in metadata.
-
+
Downloads Zika virus with host="human" filter and verifies:
- Records are returned (filter doesn't break the query)
- Host column exists in metadata
- If host data is populated, it matches the filter criterion
-
+
Note: Host filter is applied server-side by NCBI API. The returned
records should all match, but the Host field in CSV may be empty or
have various formats (scientific names, common names).
-
+
This catches: Broken host filters, API changes in filtering behavior.
"""
virus_name = "Zika virus"
host = "human"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- host=host,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, host=host, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
-
+
# Parse CSV metadata
records = self._parse_csv_metadata(files["csv"]["path"])
-
+
# Should have some records
self.assertGreater(len(records), 0, "No records returned with host filter")
-
+
# Check that host column exists
if records:
- self.assertIn("Host", records[0].keys(),
- "Host column not found in metadata")
-
+ self.assertIn("Host", records[0].keys(), "Host column not found in metadata")
+
# Note: Host filter is applied server-side by NCBI API
# The returned records should all match, but the Host field in CSV
# may be empty or have various formats (scientific names, common names)
# We verify the filter worked by checking that records were returned
# (if filter was broken, we'd get all hosts or an error)
-
+
# Count non-empty host values
- non_empty_hosts = sum(1 for record in records
- if record.get("Host", "").strip())
-
+ non_empty_hosts = sum(1 for record in records if record.get("Host", "").strip())
+
# If we have host data populated, verify it matches
if non_empty_hosts > 0:
host_lower = host.lower()
# Also check for "Homo sapiens" which is scientific name for human
- matching_hosts = sum(1 for record in records
- if host_lower in record.get("Host", "").lower()
- or "homo sapiens" in record.get("Host", "").lower())
-
+ matching_hosts = sum(
+ 1
+ for record in records
+ if host_lower in record.get("Host", "").lower() or "homo sapiens" in record.get("Host", "").lower()
+ )
+
# If host data is populated, at least 50% should match
if non_empty_hosts > 0:
match_percentage = (matching_hosts / non_empty_hosts) * 100
- self.assertGreater(match_percentage, 50,
- f"Only {match_percentage:.1f}% of populated host fields match filter '{host}'")
-
+ self.assertGreater(
+ match_percentage,
+ 50,
+ f"Only {match_percentage:.1f}% of populated host fields match filter '{host}'",
+ )
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_release_date_filter_verification(self):
"""Test that release date filter is applied correctly in metadata.
-
+
Downloads Mumps virus with min_release_date="2024-12-31" and verifies:
- Records are returned (API is working)
- Release date field exists in metadata
- All release dates are on or after 2024-12-31
- Count matches expected API results
-
+
This test compares against the direct API call:
curl -X GET "https://api.ncbi.nlm.nih.gov/datasets/v2/virus/taxon/mumps%20virus/dataset_report?filter.released_since=2024-12-31T00:00:00.000Z"
-
+
This catches: Release date filter bugs, date parsing errors, API filter issues.
"""
- import requests
from datetime import datetime
-
+
+ import requests
+
virus_name = "mumps virus"
min_release_date = "2024-12-31"
outfolder = self.test_output_dir
-
+
# First, get the expected count from direct API call using full timestamp format
api_url = "https://api.ncbi.nlm.nih.gov/datasets/v2/virus/taxon/mumps%20virus/dataset_report"
params = {"filter.released_since": "2024-12-31T00:00:00.000Z", "page_size": 1000}
-
+
try:
- response = requests.get(api_url, params=params, headers={'accept': 'application/json'})
+ response = requests.get(api_url, params=params, headers={"accept": "application/json"})
response.raise_for_status()
api_data = response.json()
- expected_count = len(api_data.get('reports', []))
- except Exception as e:
+ expected_count = len(api_data.get("reports", []))
+ except Exception as e: # noqa: BLE001
self.skipTest(f"Could not fetch API data for comparison: {e}")
-
+
# Run virus function with same filter
- result = virus(
- virus=virus_name,
- min_release_date=min_release_date,
- outfolder=outfolder
- )
-
+ result = virus(virus=virus_name, min_release_date=min_release_date, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
-
+
# Verify files were created
- self.assertTrue(os.path.exists(files["csv"]["path"]),
- "CSV file not created with release date filter")
- self.assertTrue(os.path.exists(files["fasta"]["path"]),
- "FASTA file not created with release date filter")
-
+ self.assertTrue(os.path.exists(files["csv"]["path"]), "CSV file not created with release date filter")
+ self.assertTrue(os.path.exists(files["fasta"]["path"]), "FASTA file not created with release date filter")
+
# Parse CSV metadata
records = self._parse_csv_metadata(files["csv"]["path"])
-
+
# Should have records matching API count (allowing for small variance due to timing)
self.assertGreater(len(records), 0, "No records returned with release date filter")
- self.assertEqual(len(records), expected_count,
- f"Record count ({len(records)}) doesn't match API count ({expected_count})")
-
+ self.assertEqual(
+ len(records), expected_count, f"Record count ({len(records)}) doesn't match API count ({expected_count})"
+ )
+
# Check that release date column exists
release_date_field = None
for possible_field in ["Release date", "Release Date", "ReleaseDate", "release_date"]:
if possible_field in records[0].keys():
release_date_field = possible_field
break
-
- self.assertIsNotNone(release_date_field,
- f"Release date field not found. Available fields: {list(records[0].keys())}")
-
+
+ self.assertIsNotNone(
+ release_date_field, f"Release date field not found. Available fields: {list(records[0].keys())}"
+ )
+
# Parse filter date for comparison (inclusive - on or after this date)
filter_date = datetime.strptime(min_release_date, "%Y-%m-%d")
-
+
# Verify all release dates are on or after the filter date (inclusive)
invalid_dates = []
for record in records:
@@ -773,117 +744,118 @@ def test_virus_release_date_filter_verification(self):
# Parse ISO date format (YYYY-MM-DD)
record_date = datetime.strptime(date_str, "%Y-%m-%d")
if record_date < filter_date:
- invalid_dates.append((record.get('Accession', 'unknown'), date_str))
+ invalid_dates.append((record.get("Accession", "unknown"), date_str))
except ValueError as e:
# If date parsing fails, that's also a test failure
self.fail(f"Could not parse release date '{date_str}': {e}")
-
- self.assertEqual(len(invalid_dates), 0,
- f"Found {len(invalid_dates)} records with release dates before {min_release_date}: {invalid_dates[:5]}")
-
+
+ self.assertEqual(
+ len(invalid_dates),
+ 0,
+ f"Found {len(invalid_dates)} records with release dates before {min_release_date}: {invalid_dates[:5]}",
+ )
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_metadata_schema_validation(self):
"""Test that expected metadata columns exist in CSV output.
-
+
Downloads a specific accession and verifies:
- CSV contains expected essential columns (accession, length, host)
- At least 5 columns present (reasonable metadata breadth)
- Column names are properly formatted
-
+
This catches: API schema changes, missing metadata fields, field
name changes that would break downstream analysis tools.
"""
virus_name = "NC_045512.2"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- is_accession=True,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, is_accession=True, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
-
+
# Get column names from CSV
columns = self._get_csv_columns(files["csv"]["path"])
-
+
# Check for expected essential columns (these should always be present)
# Using case-insensitive checking since column names might vary
columns_lower = [col.lower() for col in columns]
-
+
expected_columns = [
"accession", # Or GenBank Accession
- "length", # Or Sequence Length
- "host", # Host information
+ "length", # Or Sequence Length
+ "host", # Host information
]
-
+
missing_columns = []
for expected in expected_columns:
found = any(expected in col_lower for col_lower in columns_lower)
if not found:
missing_columns.append(expected)
-
- self.assertEqual(len(missing_columns), 0,
- f"Missing expected metadata columns: {missing_columns}. "
- f"Available columns: {columns}")
-
+
+ self.assertEqual(
+ len(missing_columns),
+ 0,
+ f"Missing expected metadata columns: {missing_columns}. Available columns: {columns}",
+ )
+
# Verify we have a reasonable number of columns (at least 5)
- self.assertGreaterEqual(len(columns), 5,
- f"Only {len(columns)} columns found, expected at least 5")
-
+ self.assertGreaterEqual(len(columns), 5, f"Only {len(columns)} columns found, expected at least 5")
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_completeness_filter_verification(self):
"""Test that completeness filter returns appropriate sequences.
-
+
Downloads Zika virus with nuc_completeness="complete" and verifies:
- Records are returned (filter works)
- If completeness field exists, validates values
- Falls back to checking length field exists
-
+
This catches: Broken completeness filters, metadata field changes,
filter logic errors.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- nuc_completeness="complete",
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, nuc_completeness="complete", outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
-
+
# Parse CSV metadata
records = self._parse_csv_metadata(files["csv"]["path"])
-
+
# Should have some records
self.assertGreater(len(records), 0, "No records returned with completeness filter")
-
+
# Check if there's a completeness or length field
if records:
# Look for completeness-related fields
completeness_field = None
- for possible_field in ["Completeness", "Nuc_Completeness", "Nucleotide Completeness",
- "Genome Coverage", "completeness"]:
+ for possible_field in [
+ "Completeness",
+ "Nuc_Completeness",
+ "Nucleotide Completeness",
+ "Genome Coverage",
+ "completeness",
+ ]:
if possible_field in records[0].keys():
completeness_field = possible_field
break
-
+
# If completeness field exists, verify values
if completeness_field:
- complete_count = sum(1 for record in records
- if "complete" in record.get(completeness_field, "").lower())
-
+ complete_count = sum(
+ 1 for record in records if "complete" in record.get(completeness_field, "").lower()
+ )
+
# At least 50% should be marked as complete
if complete_count > 0:
complete_percentage = (complete_count / len(records)) * 100
- self.assertGreater(complete_percentage, 50,
- f"Only {complete_percentage:.1f}% marked as complete")
+ self.assertGreater(complete_percentage, 50, f"Only {complete_percentage:.1f}% marked as complete")
else:
# If no explicit completeness field, check length field exists
# (complete genomes should have consistent lengths)
@@ -892,162 +864,154 @@ def test_virus_completeness_filter_verification(self):
if possible_field in records[0].keys():
length_field = possible_field
break
-
- self.assertIsNotNone(length_field,
- "Neither completeness nor length field found in metadata")
-
+
+ self.assertIsNotNone(length_field, "Neither completeness nor length field found in metadata")
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_multiple_filters_relationship_check(self):
"""Test relationship checks work correctly with multiple filters applied.
-
+
Downloads Zika virus with multiple filters (host, completeness, length) and verifies:
- FASTA/CSV/JSONL counts still match with complex filtering
- At least one record passes all filters
- No data loss when multiple filters interact
-
+
This catches: Filter interaction bugs, data loss with complex queries,
inconsistent filtering across output formats.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
+
result = virus(
- virus=virus_name,
- host="human",
- nuc_completeness="complete",
- min_seq_length=10000,
- outfolder=outfolder
+ virus=virus_name, host="human", nuc_completeness="complete", min_seq_length=10000, outfolder=outfolder
)
-
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
-
+
# Count records in each file type
fasta_count = self._count_fasta_sequences(files["fasta"]["path"])
csv_count = self._count_csv_records(files["csv"]["path"])
jsonl_count = self._count_jsonl_records(files["jsonl"]["path"])
-
+
# All counts should match even with filters
- self.assertEqual(fasta_count, csv_count,
- f"FASTA count ({fasta_count}) does not match CSV count ({csv_count}) with multiple filters")
- self.assertEqual(fasta_count, jsonl_count,
- f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count}) with multiple filters")
-
+ self.assertEqual(
+ fasta_count,
+ csv_count,
+ f"FASTA count ({fasta_count}) does not match CSV count ({csv_count}) with multiple filters",
+ )
+ self.assertEqual(
+ fasta_count,
+ jsonl_count,
+ f"FASTA count ({fasta_count}) does not match JSONL count ({jsonl_count}) with multiple filters",
+ )
+
# Should have at least one record
self.assertGreater(fasta_count, 0, "No records found with multiple filters applied")
# =========================================================================
# ADDITIONAL FUNCTIONAL TESTS: Testing previously untested parameters
# =========================================================================
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_geographic_location_filter(self):
"""Test that geographic location filter works correctly.
-
+
Downloads Zika virus sequences from Brazil and verifies:
- Files are created successfully
- Records are returned
- Geographic location metadata field exists
-
+
This catches: Geographic location filter bugs, API parameter issues.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- geographic_location="Brazil",
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, geographic_location="Brazil", outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with geographic location filter")
self.assertTrue(files["csv"]["exists"], "CSV file not created with geographic location filter")
-
+
# Parse CSV metadata
records = self._parse_csv_metadata(files["csv"]["path"])
-
+
# Should have some records (Brazil had Zika outbreak)
self.assertGreater(len(records), 0, "No records returned with geographic location filter")
-
+
# Check that geographic location fields exist
if records:
geo_fields = ["Geographic Location", "Geographic Region", "Geo String"]
has_geo_field = any(field in records[0].keys() for field in geo_fields)
- self.assertTrue(has_geo_field,
- f"No geographic location field found. Available fields: {list(records[0].keys())}")
-
+ self.assertTrue(
+ has_geo_field, f"No geographic location field found. Available fields: {list(records[0].keys())}"
+ )
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_protein_count_filters(self):
"""Test that protein count filters work correctly.
-
+
Downloads Zika virus with protein count filters and verifies:
- Files are created successfully
- Records are returned
- Protein count field exists in metadata
-
+
This catches: Protein count filter bugs, metadata field issues.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- min_protein_count=1,
- max_protein_count=20,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, min_protein_count=1, max_protein_count=20, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with protein count filters")
-
+
# Parse CSV metadata
records = self._parse_csv_metadata(files["csv"]["path"])
-
+
# Should have some records
self.assertGreater(len(records), 0, "No records returned with protein count filters")
-
+
# Check that protein count field exists
if records:
- self.assertIn("Protein count", records[0].keys(),
- f"Protein count field not found. Available fields: {list(records[0].keys())}")
-
+ self.assertIn(
+ "Protein count",
+ records[0].keys(),
+ f"Protein count field not found. Available fields: {list(records[0].keys())}",
+ )
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_source_database_filter(self):
"""Test that source database filter works correctly.
-
+
Downloads Zika virus from GenBank database and verifies:
- Files are created successfully
- Records are returned
- Source database field exists in metadata
-
+
This catches: Source database filter bugs, API parameter issues.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- source_database="GenBank",
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, source_database="GenBank", outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with source database filter")
-
+
# Parse CSV metadata
records = self._parse_csv_metadata(files["csv"]["path"])
-
+
# Should have some records
self.assertGreater(len(records), 0, "No records returned with source database filter")
-
+
# Check that source database field exists
if records:
db_field = None
@@ -1055,183 +1019,161 @@ def test_virus_with_source_database_filter(self):
if possible_field in records[0].keys():
db_field = possible_field
break
-
- self.assertIsNotNone(db_field,
- f"Source database field not found. Available fields: {list(records[0].keys())}")
-
+
+ self.assertIsNotNone(
+ db_field, f"Source database field not found. Available fields: {list(records[0].keys())}"
+ )
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_lab_passaged_filter(self):
"""Test that lab_passaged filter works correctly.
-
+
Downloads Zika virus with lab_passaged=False filter and verifies:
- Files are created successfully
- Records are returned
-
+
Note: Lab passaged data may be sparse, so we mainly verify the filter
doesn't break the query.
-
+
This catches: Lab passaged filter bugs, API parameter issues.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- lab_passaged=False,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, lab_passaged=False, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with lab_passaged filter")
-
+
# Should create files (even if no lab passaged field in results)
self.assertGreater(files["fasta"]["size"], 0, "FASTA file is empty with lab_passaged filter")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_collection_date_filters(self):
"""Test that collection date filters don't break the query.
-
+
Downloads Zika virus with collection date range and verifies:
- Function completes without errors
-
+
Note: Collection date data is often sparse, filters may return no results.
This test just ensures the filter doesn't cause errors.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
+
# This will complete without error even if no results match
result = virus(
- virus=virus_name,
- min_collection_date="2016-01-01",
- max_collection_date="2016-12-31",
- outfolder=outfolder
+ virus=virus_name, min_collection_date="2016-01-01", max_collection_date="2016-12-31", outfolder=outfolder
)
-
+
# Function should complete successfully
self.assertIsNone(result)
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_max_ambiguous_chars_filter(self):
"""Test that max_ambiguous_chars filter works correctly.
-
+
Downloads Zika virus with max_ambiguous_chars filter and verifies:
- Files are created successfully
- Records are returned
- Filter doesn't break the query
-
+
This catches: Max ambiguous chars filter bugs, sequence quality filtering issues.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
- result = virus(
- virus=virus_name,
- max_ambiguous_chars=100,
- outfolder=outfolder
- )
-
+
+ result = virus(virus=virus_name, max_ambiguous_chars=100, outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with max_ambiguous_chars filter")
-
+
# Should have some records (most sequences have some ambiguous bases)
seq_count = self._count_fasta_sequences(files["fasta"]["path"])
self.assertGreater(seq_count, 0, "No sequences passed max_ambiguous_chars filter")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_has_proteins_filter(self):
"""Test that has_proteins filter works correctly.
-
+
Downloads Zika virus requiring specific proteins and verifies:
- Files are created successfully
- Records are returned
- Filter doesn't break the query
-
+
This catches: has_proteins filter bugs, protein filtering logic issues.
"""
virus_name = "Zika virus"
outfolder = self.test_output_dir
-
+
# Test with a common protein (polyprotein is typical for Zika)
- result = virus(
- virus=virus_name,
- has_proteins="polyprotein",
- outfolder=outfolder
- )
-
+ result = virus(virus=virus_name, has_proteins="polyprotein", outfolder=outfolder)
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with has_proteins filter")
-
+
# Should have some records (polyprotein is common in Zika)
seq_count = self._count_fasta_sequences(files["fasta"]["path"])
self.assertGreater(seq_count, 0, "No sequences passed has_proteins filter")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_with_genbank_metadata_retrieval(self):
"""Test that GenBank metadata retrieval works correctly.
-
+
Downloads a single accession with genbank_metadata=True and verifies:
- Function completes without errors
- Standard files are created
- GenBank metadata CSV file is created
-
+
This catches: GenBank metadata retrieval bugs, batch processing issues.
"""
virus_name = "NC_045512.2"
outfolder = self.test_output_dir
-
+
result = virus(
- virus=virus_name,
- is_accession=True,
- genbank_metadata=True,
- genbank_batch_size=10,
- outfolder=outfolder
+ virus=virus_name, is_accession=True, genbank_metadata=True, genbank_batch_size=10, outfolder=outfolder
)
-
+
self.assertIsNone(result)
-
+
files = self._check_output_files(virus_name, outfolder)
self.assertTrue(files["fasta"]["exists"], "FASTA file not created with genbank_metadata")
-
+
# Check for GenBank metadata file
genbank_csv = os.path.join(outfolder, f"{virus_name}_genbank_metadata.csv")
- self.assertTrue(os.path.exists(genbank_csv),
- f"GenBank metadata CSV not created: {genbank_csv}")
-
+ self.assertTrue(os.path.exists(genbank_csv), f"GenBank metadata CSV not created: {genbank_csv}")
+
# Verify GenBank CSV has data
- self.assertGreater(os.path.getsize(genbank_csv), 0,
- "GenBank metadata CSV is empty")
+ self.assertGreater(os.path.getsize(genbank_csv), 0, "GenBank metadata CSV is empty")
# =========================================================================
# DATASETS CLI TESTS: Testing NCBI datasets CLI check and setup
# =========================================================================
# These tests verify the datasets CLI detection and installation functionality
-
+
def test_get_datasets_path_returns_valid_path(self):
"""Test that _get_datasets_path returns a valid path to the datasets CLI.
-
+
The function should return a path to either:
1. The system-installed datasets CLI (if available)
2. The bundled datasets binary (fallback)
-
+
This catches: Detection logic bugs, path resolution issues.
"""
# _get_datasets_path should always return a valid path
# (either system CLI or bundled binary)
datasets_path = _get_datasets_path()
-
+
# Should return a non-empty string
self.assertIsInstance(datasets_path, str)
- self.assertTrue(len(datasets_path) > 0,
- "_get_datasets_path should return a non-empty path")
-
+ self.assertTrue(len(datasets_path) > 0, "_get_datasets_path should return a non-empty path")
+
# The returned path should be executable
result = subprocess.run(
[datasets_path, "--version"],
@@ -1239,57 +1181,54 @@ def test_get_datasets_path_returns_valid_path(self):
text=True,
timeout=5,
)
- self.assertEqual(result.returncode, 0,
- f"datasets CLI at {datasets_path} should be executable")
-
+ self.assertEqual(result.returncode, 0, f"datasets CLI at {datasets_path} should be executable")
+
def test_get_datasets_path_uses_bundled_binary(self):
"""Test that _get_datasets_path falls back to bundled binary when system CLI is missing.
-
+
When the system-installed datasets CLI is not in PATH, the function
should fall back to the bundled binary included with gget.
-
+
Note: _get_datasets_path caches its result, so this test clears the cache
before testing the fallback behavior.
-
+
This catches: Bundled binary fallback logic, path resolution issues.
"""
import gget.gget_virus as gget_virus_module
-
+
# Save original PATH and cache
original_path = os.environ.get("PATH", "")
original_cache = gget_virus_module._datasets_path_cache
-
+
try:
# Clear the cache to force re-detection
_clear_datasets_cache()
-
+
# Set PATH to empty to simulate system datasets not being found
os.environ["PATH"] = ""
-
+
# Should still return a valid path (to bundled binary)
datasets_path = _get_datasets_path()
-
+
# Should return a non-empty string path to bundled binary
self.assertIsInstance(datasets_path, str)
- self.assertTrue(len(datasets_path) > 0,
- "Should return path to bundled binary")
-
+ self.assertTrue(len(datasets_path) > 0, "Should return path to bundled binary")
+
# Path should contain 'bins' indicating bundled binary
- self.assertIn("bins", datasets_path,
- f"Path should be bundled binary, got: {datasets_path}")
-
+ self.assertIn("bins", datasets_path, f"Path should be bundled binary, got: {datasets_path}")
+
finally:
# Restore original PATH and cache
os.environ["PATH"] = original_path
gget_virus_module._datasets_path_cache = original_cache
-
+
def test_datasets_cli_version_output(self):
"""Test that the datasets CLI returns a valid version string.
-
- When available (system or bundled), the datasets CLI should return a
- version string that can be parsed. This helps ensure the CLI is properly
+
+ When available (system or bundled), the datasets CLI should return a
+ version string that can be parsed. This helps ensure the CLI is properly
functional.
-
+
This catches: Corrupted installations, version parsing issues.
"""
# Use _get_datasets_path() to get either system or bundled binary
@@ -1304,71 +1243,67 @@ def test_datasets_cli_version_output(self):
cli_available = result.returncode == 0
except (FileNotFoundError, subprocess.TimeoutExpired, RuntimeError):
cli_available = False
-
+
if not cli_available:
self.skipTest("NCBI datasets CLI not available - skipping version test")
-
+
# Version output should not be empty and should contain version info
version_output = result.stdout.strip()
- self.assertTrue(len(version_output) > 0,
- "Version output should not be empty")
+ self.assertTrue(len(version_output) > 0, "Version output should not be empty")
# NCBI datasets typically outputs version like "datasets version: X.Y.Z" or just "X.Y.Z"
self.assertTrue(
any(char.isdigit() for char in version_output),
- f"Version output should contain version numbers: {version_output}"
+ f"Version output should contain version numbers: {version_output}",
)
-
# =========================================================================
# MULTI-ACCESSION TESTS: Testing new multi-accession functionality
# =========================================================================
# These tests verify the new multi-accession support added in recent commits
-
+
def test_parse_accession_input_single(self):
"""Test parsing of single accession number.
-
+
Tests _parse_accession_input with a single accession identifier and verifies:
- Returns correct type ('single')
- Accession value is preserved
- is_file flag is False
-
+
This catches: Single accession parsing bugs, input validation issues.
"""
- from gget.gget_virus import _parse_accession_input
-
- result = _parse_accession_input('NC_045512.2')
-
- self.assertEqual(result['type'], 'single', "Should identify single accession")
- self.assertEqual(result['accessions'], 'NC_045512.2', "Should preserve accession value")
- self.assertFalse(result['is_file'], "Single accession should not be marked as file")
- self.assertIsNone(result['file_path'], "Single accession should have no file_path")
-
+
+ result = _parse_accession_input("NC_045512.2")
+
+ self.assertEqual(result["type"], "single", "Should identify single accession")
+ self.assertEqual(result["accessions"], "NC_045512.2", "Should preserve accession value")
+ self.assertFalse(result["is_file"], "Single accession should not be marked as file")
+ self.assertIsNone(result["file_path"], "Single accession should have no file_path")
+
def test_parse_accession_input_space_separated(self):
"""Test parsing of space-separated accessions.
-
+
Tests _parse_accession_input with space-separated accessions and verifies:
- Returns correct type ('list')
- Accessions list is created with correct count
- All accessions are preserved without whitespace
- is_file flag is False
-
+
This catches: Space-separated parsing bugs, whitespace handling issues.
"""
- from gget.gget_virus import _parse_accession_input
-
- result = _parse_accession_input('NC_045512.2 MN908947.3 MT020781.1')
-
- self.assertEqual(result['type'], 'list', "Should identify list of accessions")
- self.assertIsInstance(result['accessions'], list, "Should return list type")
- self.assertEqual(len(result['accessions']), 3, "Should parse 3 accessions")
- self.assertEqual(result['accessions'][0], 'NC_045512.2', "First accession should match")
- self.assertEqual(result['accessions'][1], 'MN908947.3', "Second accession should match")
- self.assertEqual(result['accessions'][2], 'MT020781.1', "Third accession should match")
- self.assertFalse(result['is_file'], "Space-separated should not be marked as file")
-
+
+ result = _parse_accession_input("NC_045512.2 MN908947.3 MT020781.1")
+
+ self.assertEqual(result["type"], "list", "Should identify list of accessions")
+ self.assertIsInstance(result["accessions"], list, "Should return list type")
+ self.assertEqual(len(result["accessions"]), 3, "Should parse 3 accessions")
+ self.assertEqual(result["accessions"][0], "NC_045512.2", "First accession should match")
+ self.assertEqual(result["accessions"][1], "MN908947.3", "Second accession should match")
+ self.assertEqual(result["accessions"][2], "MT020781.1", "Third accession should match")
+ self.assertFalse(result["is_file"], "Space-separated should not be marked as file")
+
def test_parse_accession_input_from_file(self):
"""Test parsing of accessions from a file.
-
+
Tests _parse_accession_input with a file path and verifies:
- Returns correct type ('file')
- Accessions list is created from file content
@@ -1376,214 +1311,204 @@ def test_parse_accession_input_from_file(self):
- is_file flag is True
- file_path is preserved
- Empty lines are skipped
-
+
This catches: File parsing bugs, whitespace/empty line issues, file I/O errors.
"""
- from gget.gget_virus import _parse_accession_input
import tempfile
-
+
# Create a temporary file with accessions
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
f.write("NC_045512.2\n")
f.write(" MN908947.3 \n") # Test whitespace handling
f.write("\n") # Empty line
f.write("MT020781.1\n")
temp_file = f.name
-
+
try:
result = _parse_accession_input(temp_file)
-
- self.assertEqual(result['type'], 'file', "Should identify file input")
- self.assertIsInstance(result['accessions'], list, "Should return list type")
- self.assertEqual(len(result['accessions']), 3, "Should parse 3 accessions (empty line skipped)")
- self.assertEqual(result['accessions'][0], 'NC_045512.2', "First accession should match")
- self.assertEqual(result['accessions'][1], 'MN908947.3', "Second accession should be stripped of whitespace")
- self.assertEqual(result['accessions'][2], 'MT020781.1', "Third accession should match")
- self.assertTrue(result['is_file'], "File input should be marked as file")
- self.assertEqual(result['file_path'], temp_file, "File path should be preserved")
+
+ self.assertEqual(result["type"], "file", "Should identify file input")
+ self.assertIsInstance(result["accessions"], list, "Should return list type")
+ self.assertEqual(len(result["accessions"]), 3, "Should parse 3 accessions (empty line skipped)")
+ self.assertEqual(result["accessions"][0], "NC_045512.2", "First accession should match")
+ self.assertEqual(result["accessions"][1], "MN908947.3", "Second accession should be stripped of whitespace")
+ self.assertEqual(result["accessions"][2], "MT020781.1", "Third accession should match")
+ self.assertTrue(result["is_file"], "File input should be marked as file")
+ self.assertEqual(result["file_path"], temp_file, "File path should be preserved")
finally:
os.unlink(temp_file)
-
+
def test_parse_accession_input_empty_file_raises_error(self):
"""Test that parsing empty file raises ValueError.
-
+
Tests _parse_accession_input with an empty file and verifies:
- Raises ValueError
- Error message is informative
-
+
This catches: Empty file validation bugs, error handling issues.
"""
- from gget.gget_virus import _parse_accession_input
import tempfile
-
+
# Create an empty temporary file
- with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as f:
+ with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
temp_file = f.name
-
+
try:
with self.assertRaises(ValueError):
_parse_accession_input(temp_file)
finally:
os.unlink(temp_file)
-
+
def test_parse_accession_input_nonexistent_file_raises_error(self):
"""Test that parsing nonexistent file raises ValueError.
-
+
Tests _parse_accession_input with a nonexistent file path and verifies:
- Raises ValueError (not FileNotFoundError - treated as single accession)
-
+
Note: A nonexistent file path will be treated as a single accession
string since _parse_accession_input checks os.path.isfile() first.
"""
- from gget.gget_virus import _parse_accession_input
-
+
# Nonexistent file path will be treated as single accession
- result = _parse_accession_input('/nonexistent/file/path.txt')
-
+ result = _parse_accession_input("/nonexistent/file/path.txt")
+
# Should treat it as a single accession string since file doesn't exist
- self.assertEqual(result['type'], 'single', "Nonexistent file treated as single accession")
- self.assertEqual(result['accessions'], '/nonexistent/file/path.txt', "Should preserve path as accession")
-
+ self.assertEqual(result["type"], "single", "Nonexistent file treated as single accession")
+ self.assertEqual(result["accessions"], "/nonexistent/file/path.txt", "Should preserve path as accession")
+
def test_calculate_max_accessions_per_batch(self):
"""Test calculation of maximum accessions per batch.
-
+
Tests _calculate_max_accessions_per_batch and verifies:
- Returns positive integer
- At least 1 accession per batch
- Respects URL length limit
- Smaller base URL allows more accessions
-
+
This catches: Batch size calculation bugs, URL limit logic errors.
"""
- from gget.gget_virus import _calculate_max_accessions_per_batch, MAX_URL_LENGTH, BUFFER_SIZE, ACCESSION_AVG_LENGTH
-
+ from gget.gget_virus import ACCESSION_AVG_LENGTH, BUFFER_SIZE, MAX_URL_LENGTH
+
# Test with different base URL lengths
base_url_small = 50
base_url_large = 500
-
+
max_acc_small = _calculate_max_accessions_per_batch(base_url_small)
max_acc_large = _calculate_max_accessions_per_batch(base_url_large)
-
+
# Both should be positive integers
self.assertIsInstance(max_acc_small, int, "Should return integer")
self.assertIsInstance(max_acc_large, int, "Should return integer")
self.assertGreater(max_acc_small, 0, "Should allow at least 1 accession")
self.assertGreater(max_acc_large, 0, "Should allow at least 1 accession")
-
+
# Larger base URL should allow fewer accessions
- self.assertGreater(max_acc_small, max_acc_large,
- "Smaller base URL should allow more accessions")
-
+ self.assertGreater(max_acc_small, max_acc_large, "Smaller base URL should allow more accessions")
+
# Verify the calculation makes sense
# With 2000 char limit, 200 char buffer, typical accession is 11 chars + 3 for %2C
expected_rough = (MAX_URL_LENGTH - base_url_small - BUFFER_SIZE) // (ACCESSION_AVG_LENGTH + 3)
self.assertEqual(max_acc_small, expected_rough, "Calculation should match expected formula")
-
+
def test_batch_accessions_for_url(self):
"""Test batching of accessions for URL length limits.
-
+
Tests _batch_accessions_for_url with large accession list and verifies:
- Returns list of batches
- All accessions are included
- No duplicate accessions
- Each batch respects URL limit
- Batching is consistent
-
+
This catches: Batching algorithm bugs, URL limit violations, data loss.
"""
- from gget.gget_virus import _batch_accessions_for_url, MAX_URL_LENGTH
-
+ from gget.gget_virus import MAX_URL_LENGTH
+
# Create large list of accessions that will need multiple batches
accessions = [f"NC_{100000 + i}.1" for i in range(1000)]
base_url_length = 100
-
+
batches = _batch_accessions_for_url(accessions, base_url_length)
-
+
# Should have multiple batches for 1000 accessions
self.assertIsInstance(batches, list, "Should return list of batches")
self.assertGreater(len(batches), 1, "Should split into multiple batches for 1000 accessions")
-
+
# All accessions should be included
all_batched = [acc for batch in batches for acc in batch]
self.assertEqual(len(all_batched), len(accessions), "All accessions should be included")
-
+
# No duplicates
self.assertEqual(len(set(all_batched)), len(accessions), "Should not have duplicates")
-
+
# Verify order is preserved
self.assertEqual(all_batched, accessions, "Accession order should be preserved")
-
+
# Verify each batch respects URL limit
for batch_num, batch in enumerate(batches, 1):
batch_url_length = base_url_length + sum(len(acc) + 3 for acc in batch)
- self.assertLessEqual(batch_url_length, MAX_URL_LENGTH,
- f"Batch {batch_num} exceeds URL limit ({batch_url_length} > {MAX_URL_LENGTH})")
-
+ self.assertLessEqual(
+ batch_url_length,
+ MAX_URL_LENGTH,
+ f"Batch {batch_num} exceeds URL limit ({batch_url_length} > {MAX_URL_LENGTH})",
+ )
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_multi_accession_space_separated(self):
"""Test virus function with space-separated accessions.
-
+
Tests the virus() function with --is_accession flag and space-separated accessions and verifies:
- Function completes without errors
- Command summary is created (shows processing happened)
- Function doesn't crash on multi-accession input
-
+
This catches: Multi-accession parsing bugs, integration issues with virus() function.
-
+
Note: API may return 0 results for some accession combinations, which is acceptable.
The key is that the command processes without crashing.
"""
outfolder = self.test_output_dir
-
+
# Test with space-separated accessions
- result = virus(
- virus='MN908947.3 NC_045512.2',
- is_accession=True,
- outfolder=outfolder
- )
-
+ result = virus(virus="MN908947.3 NC_045512.2", is_accession=True, outfolder=outfolder)
+
# Function should complete successfully
self.assertIsNone(result)
-
+
# Command summary should be created
- summary_files = [f for f in os.listdir(outfolder) if f.startswith('command_summary')]
+ summary_files = [f for f in os.listdir(outfolder) if f.startswith("command_summary")]
self.assertGreater(len(summary_files), 0, "Command summary should be created")
-
+
@retry_on_network_error(max_retries=3, delay=5)
def test_virus_multi_accession_file_input(self):
"""Test virus function with file-based accessions.
-
+
Tests the virus() function with --is_accession flag and file input and verifies:
- Function completes without errors
- Command summary is created
- Correctly reads accessions from file
-
+
This catches: File reading bugs, multi-accession file processing issues.
"""
- import tempfile
-
+
outfolder = self.test_output_dir
-
+
# Create temporary accessions file
- accessions_file = os.path.join(outfolder, 'test_accessions.txt')
- with open(accessions_file, 'w') as f:
+ accessions_file = os.path.join(outfolder, "test_accessions.txt")
+ with open(accessions_file, "w") as f:
f.write("MN908947.3\n")
f.write("NC_045512.2\n")
-
+
# Test with file input
- result = virus(
- virus=accessions_file,
- is_accession=True,
- outfolder=outfolder
- )
-
+ result = virus(virus=accessions_file, is_accession=True, outfolder=outfolder)
+
# Function should complete successfully
self.assertIsNone(result)
-
+
# Command summary should be created
- summary_files = [f for f in os.listdir(outfolder) if f.startswith('command_summary')]
+ summary_files = [f for f in os.listdir(outfolder) if f.startswith("command_summary")]
self.assertGreater(len(summary_files), 0, "Command summary should be created for file input")
-
+
# Clean up
if os.path.exists(accessions_file):
os.unlink(accessions_file)
@@ -1592,36 +1517,34 @@ def test_virus_multi_accession_file_input(self):
# EXPONENTIAL BACKOFF HELPER FUNCTION TESTS
# =========================================================================
# These tests verify the core retry logic without making real API calls
-
+
def test_retry_helper_successful_operation(self):
"""Test successful operation on first attempt (no retries needed)."""
- from gget.gget_virus import _retry_with_exponential_backoff
-
+
def successful_op():
return {"result": "success"}
-
+
success, result, error_info = _retry_with_exponential_backoff(
operation_name="test_success",
operation_func=successful_op,
)
-
+
self.assertTrue(success, "Expected success=True")
self.assertEqual(result, {"result": "success"}, "Expected correct result")
self.assertIsNone(error_info, "Expected no error_info on success")
-
+
def test_retry_helper_success_after_retry(self):
"""Test operation that fails once then succeeds."""
import requests
- from gget.gget_virus import _retry_with_exponential_backoff
-
+
attempt_count = [0] # Use list to allow modification in nested function
-
+
def flaky_op():
attempt_count[0] += 1
if attempt_count[0] == 1:
raise requests.exceptions.ConnectionError("Temporary connection issue")
return {"result": "succeeded after retry"}
-
+
start_time = time.time()
success, result, error_info = _retry_with_exponential_backoff(
operation_name="test_flaky",
@@ -1632,27 +1555,26 @@ def flaky_op():
retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError),
)
elapsed = time.time() - start_time
-
+
self.assertTrue(success, "Expected success=True after retry")
self.assertEqual(result, {"result": "succeeded after retry"}, "Expected correct result")
self.assertEqual(attempt_count[0], 2, f"Expected 2 attempts, got {attempt_count[0]}")
self.assertGreaterEqual(elapsed, 0.05, f"Expected at least 0.05s delay, got {elapsed}s")
-
+
def test_retry_helper_exponential_backoff_timing(self):
"""Test that exponential backoff increases delays properly."""
import requests
- from gget.gget_virus import _retry_with_exponential_backoff
-
+
attempt_count = [0]
-
+
def always_fails():
attempt_count[0] += 1
raise requests.exceptions.ConnectionError("Persistent connection issue")
-
+
initial_delay = 0.05
backoff_multiplier = 2.0
max_retries = 3
-
+
start_time = time.time()
success, result, error_info = _retry_with_exponential_backoff(
operation_name="test_backoff",
@@ -1663,24 +1585,24 @@ def always_fails():
retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError),
)
elapsed = time.time() - start_time
-
+
# The loop runs max_retries times with delays between attempts
expected_min_delay = initial_delay * (1 + backoff_multiplier)
-
+
self.assertFalse(success, "Expected success=False when all retries fail")
self.assertEqual(attempt_count[0], max_retries, f"Expected {max_retries} attempts")
- self.assertGreaterEqual(elapsed, expected_min_delay * 0.8,
- f"Delay too short: {elapsed}s vs {expected_min_delay}s")
-
+ self.assertGreaterEqual(
+ elapsed, expected_min_delay * 0.8, f"Delay too short: {elapsed}s vs {expected_min_delay}s"
+ )
+
def test_retry_helper_failed_commands_tracking(self):
"""Test that failed_commands dict is properly populated."""
- from gget.gget_virus import _retry_with_exponential_backoff
-
+
def failing_op():
raise ConnectionError("Test error message")
-
+
failed_commands = {"custom_errors": []}
-
+
success, result, error_info = _retry_with_exponential_backoff(
operation_name="test_tracking",
operation_func=failing_op,
@@ -1688,23 +1610,22 @@ def failing_op():
initial_delay=0.01,
failed_commands=failed_commands,
)
-
+
self.assertFalse(success, "Expected operation to fail")
self.assertIsNotNone(error_info, "Expected error_info to be populated")
self.assertIn("exception_type", error_info, "Expected exception_type in error_info")
self.assertIn("error", error_info, "Expected error message in error_info")
-
+
def test_retry_helper_non_retryable_exception(self):
"""Test that non-retryable exceptions fail immediately."""
import requests
- from gget.gget_virus import _retry_with_exponential_backoff
-
+
attempt_count = [0]
-
+
def non_retryable_op():
attempt_count[0] += 1
raise ValueError("This exception is not retryable")
-
+
start_time = time.time()
success, result, error_info = _retry_with_exponential_backoff(
operation_name="test_non_retryable",
@@ -1714,24 +1635,23 @@ def non_retryable_op():
retryable_exceptions=(requests.exceptions.ConnectionError, requests.exceptions.HTTPError),
)
elapsed = time.time() - start_time
-
+
self.assertFalse(success, "Expected operation to fail")
self.assertEqual(attempt_count[0], 1, f"Expected only 1 attempt, got {attempt_count[0]}")
self.assertLess(elapsed, 0.1, f"Expected immediate failure, but took {elapsed:.2f}s")
-
+
def test_retry_helper_custom_retryable_exceptions(self):
"""Test with custom retryable exceptions."""
import requests
- from gget.gget_virus import _retry_with_exponential_backoff
-
+
attempt_count = [0]
-
+
def custom_failing_op():
attempt_count[0] += 1
if attempt_count[0] == 1:
raise requests.exceptions.Timeout("Request timed out")
return {"result": "success"}
-
+
success, result, error_info = _retry_with_exponential_backoff(
operation_name="test_custom_retryable",
operation_func=custom_failing_op,
@@ -1739,7 +1659,7 @@ def custom_failing_op():
initial_delay=0.01,
retryable_exceptions=(requests.exceptions.Timeout, requests.exceptions.ConnectionError),
)
-
+
self.assertTrue(success, "Expected retry to succeed with Timeout in retryable_exceptions")
self.assertEqual(attempt_count[0], 2, f"Expected 2 attempts, got {attempt_count[0]}")
@@ -1981,7 +1901,7 @@ def test_clean_xml_declarations(self):
def test_clean_xml_declarations_no_declarations(self):
"""Test _clean_xml_declarations with no declarations to remove."""
- xml = 'data'
+ xml = "data"
result = _clean_xml_declarations(xml)
self.assertEqual(result, xml)
@@ -2047,7 +1967,7 @@ def test_force_garbage_collection_runs(self):
def test_parse_baseline_file_csv(self):
"""Test _parse_baseline_file with CSV format."""
- with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
f.write("accession,length,host\n")
f.write("NC_045512.2,29903,human\n")
f.write("MN908947.3,29903,human\n")
@@ -2063,7 +1983,7 @@ def test_parse_baseline_file_csv(self):
def test_parse_baseline_file_jsonl(self):
"""Test _parse_baseline_file with JSONL format."""
- with tempfile.NamedTemporaryFile(mode='w', suffix='.jsonl', delete=False) as f:
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".jsonl", delete=False) as f:
f.write('{"accession": "NC_045512.2", "length": 29903}\n')
f.write('{"accession": "MN908947.3", "length": 29903}\n')
path = f.name
@@ -2076,12 +1996,10 @@ def test_parse_baseline_file_jsonl(self):
def test_parse_baseline_file_json(self):
"""Test _parse_baseline_file with JSON array format."""
- with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as f:
import json as json_mod
- json_mod.dump([
- {"accession": "NC_045512.2"},
- {"accession": "MN908947.3"}
- ], f)
+
+ json_mod.dump([{"accession": "NC_045512.2"}, {"accession": "MN908947.3"}], f)
path = f.name
try:
result = _parse_baseline_file(path)
@@ -2091,7 +2009,7 @@ def test_parse_baseline_file_json(self):
def test_parse_baseline_file_text(self):
"""Test _parse_baseline_file with plain text format."""
- with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".txt", delete=False) as f:
f.write("NC_045512.2\n")
f.write("MN908947.3\n")
f.write("# comment line\n")
@@ -2112,7 +2030,7 @@ def test_parse_baseline_file_nonexistent_raises(self):
def test_parse_baseline_file_empty_raises(self):
"""Test _parse_baseline_file raises for empty file."""
- with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f:
+ with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f:
f.write("accession\n") # header only, no data
path = f.name
try:
@@ -2196,10 +2114,12 @@ def test_merge_baseline_with_new_csv(self):
baseline_path = os.path.join(tmpdir, "baseline.csv")
output_path = os.path.join(tmpdir, "merged.csv")
# Create baseline CSV
- pd.DataFrame([
- {"accession": "ACC1", "length": 100},
- {"accession": "ACC2", "length": 200},
- ]).to_csv(baseline_path, index=False)
+ pd.DataFrame(
+ [
+ {"accession": "ACC1", "length": 100},
+ {"accession": "ACC2", "length": 200},
+ ]
+ ).to_csv(baseline_path, index=False)
# New metadata
new_records = [
{"accession": "ACC3", "length": 300},
@@ -2215,9 +2135,11 @@ def test_merge_baseline_with_new_deduplicates(self):
with tempfile.TemporaryDirectory() as tmpdir:
baseline_path = os.path.join(tmpdir, "baseline.csv")
output_path = os.path.join(tmpdir, "merged.csv")
- pd.DataFrame([
- {"accession": "ACC1", "length": 100},
- ]).to_csv(baseline_path, index=False)
+ pd.DataFrame(
+ [
+ {"accession": "ACC1", "length": 100},
+ ]
+ ).to_csv(baseline_path, index=False)
new_records = [{"accession": "ACC1", "length": 999}]
_merge_baseline_with_new(baseline_path, new_records, output_path)
df = pd.read_csv(output_path)
@@ -2286,10 +2208,24 @@ def test_load_metadata_from_api_reports_empty(self):
def test_load_metadata_from_api_reports_multiple(self):
"""Test load_metadata_from_api_reports with multiple reports."""
api_reports = [
- {"accession": "ACC1", "length": 100, "completeness": "COMPLETE",
- "host": {}, "location": {}, "isolate": {}, "virus": {}},
- {"accession": "ACC2", "length": 200, "completeness": "PARTIAL",
- "host": {}, "location": {}, "isolate": {}, "virus": {}},
+ {
+ "accession": "ACC1",
+ "length": 100,
+ "completeness": "COMPLETE",
+ "host": {},
+ "location": {},
+ "isolate": {},
+ "virus": {},
+ },
+ {
+ "accession": "ACC2",
+ "length": 200,
+ "completeness": "PARTIAL",
+ "host": {},
+ "location": {},
+ "isolate": {},
+ "virus": {},
+ },
]
result = load_metadata_from_api_reports(api_reports)
self.assertEqual(len(result), 2)
@@ -2424,9 +2360,7 @@ def test_filter_metadata_only_source_database(self):
def test_filter_metadata_only_collection_date_range(self):
"""Test filter_metadata_only with collection date range."""
meta = self._make_test_metadata()
- accs, metas, _ = filter_metadata_only(
- meta, min_collection_date="2021-01-01", max_collection_date="2021-12-31"
- )
+ accs, metas, _ = filter_metadata_only(meta, min_collection_date="2021-01-01", max_collection_date="2021-12-31")
self.assertEqual(len(accs), 1)
self.assertIn("ACC3", accs)
@@ -2670,9 +2604,7 @@ def test_filter_genbank_metadata_env_source(self):
def test_filter_genbank_metadata_combined(self):
"""Test filter_genbank_metadata with multiple filters."""
meta = self._make_genbank_metadata()
- result, _ = filter_genbank_metadata(
- meta, min_gene_count=5, genotype="H5N1", has_proteins="hemagglutinin"
- )
+ result, _ = filter_genbank_metadata(meta, min_gene_count=5, genotype="H5N1", has_proteins="hemagglutinin")
self.assertEqual(len(result), 2)
self.assertIn("ACC1", result)
self.assertIn("ACC3", result)
@@ -2690,9 +2622,7 @@ def test_filter_cached_no_filters(self):
def test_filter_cached_host_not_in_strategy(self):
"""Test filter_cached_metadata_for_unused_filters applies host when not in strategy."""
meta = self._make_test_metadata()
- accs, metas = filter_cached_metadata_for_unused_filters(
- meta, host="Homo sapiens", applied_strategy_filters=[]
- )
+ accs, metas = filter_cached_metadata_for_unused_filters(meta, host="Homo sapiens", applied_strategy_filters=[])
self.assertEqual(len(accs), 2)
self.assertIn("ACC1", accs)
self.assertIn("ACC3", accs)
@@ -2709,17 +2639,13 @@ def test_filter_cached_host_in_strategy_skipped(self):
def test_filter_cached_complete_only(self):
"""Test filter_cached_metadata_for_unused_filters with complete_only."""
meta = self._make_test_metadata()
- accs, metas = filter_cached_metadata_for_unused_filters(
- meta, complete_only=True, applied_strategy_filters=[]
- )
+ accs, metas = filter_cached_metadata_for_unused_filters(meta, complete_only=True, applied_strategy_filters=[])
self.assertEqual(len(accs), 2) # ACC1 and ACC3 are complete
def test_filter_cached_annotated(self):
"""Test filter_cached_metadata_for_unused_filters with annotated."""
meta = self._make_test_metadata()
- accs, metas = filter_cached_metadata_for_unused_filters(
- meta, annotated=True, applied_strategy_filters=[]
- )
+ accs, metas = filter_cached_metadata_for_unused_filters(meta, annotated=True, applied_strategy_filters=[])
self.assertEqual(len(accs), 2)
self.assertNotIn("ACC2", accs)
@@ -2735,9 +2661,7 @@ def test_filter_cached_geographic_location(self):
def test_filter_cached_refseq_only(self):
"""Test filter_cached_metadata_for_unused_filters with refseq_only."""
meta = self._make_test_metadata()
- accs, metas = filter_cached_metadata_for_unused_filters(
- meta, refseq_only=True, applied_strategy_filters=[]
- )
+ accs, metas = filter_cached_metadata_for_unused_filters(meta, refseq_only=True, applied_strategy_filters=[])
self.assertEqual(len(accs), 1)
self.assertIn("ACC2", accs)
@@ -2764,7 +2688,7 @@ def test_filter_metadata_only_nuc_completeness_partial(self):
def test_filter_metadata_only_annotated_true(self):
"""Test filter_metadata_only with annotated=True passes all (handled server-side).
-
+
Note: annotated=True is handled server-side by the API, so the client-side
filter_metadata_only does NOT filter on annotated=True. All records pass.
"""
@@ -2779,9 +2703,10 @@ def test_filter_metadata_only_annotated_true(self):
def test_write_fasta_record_with_description(self):
"""Test _write_fasta_record writes correct FASTA format with description."""
- from gget.utils import FastaRecord
import io
+ from gget.utils import FastaRecord
+
record = FastaRecord(seq="ATCGATCGATCG", id="ACC001", description="Test virus isolate")
handle = io.StringIO()
_write_fasta_record(handle, record)
@@ -2792,9 +2717,10 @@ def test_write_fasta_record_with_description(self):
def test_write_fasta_record_without_description(self):
"""Test _write_fasta_record writes correct FASTA format without description."""
- from gget.utils import FastaRecord
import io
+ from gget.utils import FastaRecord
+
record = FastaRecord(seq="ATCG", id="ACC002", description="")
handle = io.StringIO()
_write_fasta_record(handle, record)
@@ -2805,15 +2731,16 @@ def test_write_fasta_record_without_description(self):
def test_write_fasta_record_long_sequence_wraps(self):
"""Test _write_fasta_record wraps long sequences at 70 characters."""
- from gget.utils import FastaRecord
import io
+ from gget.utils import FastaRecord
+
# Create a sequence longer than 70 characters
long_seq = "A" * 150
record = FastaRecord(seq=long_seq, id="ACC003", description="")
handle = io.StringIO()
_write_fasta_record(handle, record)
- lines = handle.getvalue().strip().split('\n')
+ lines = handle.getvalue().strip().split("\n")
# First line is header, then sequence lines
self.assertEqual(lines[0], ">ACC003")
@@ -2831,7 +2758,7 @@ def test_stream_copy_fasta_all_records(self):
input_path = os.path.join(tmpdir, "input.fasta")
output_path = os.path.join(tmpdir, "output.fasta")
- with open(input_path, 'w') as f:
+ with open(input_path, "w") as f:
f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n>ACC3\nTTTTCCCC\n")
count = _stream_copy_fasta(input_path, output_path)
@@ -2840,7 +2767,7 @@ def test_stream_copy_fasta_all_records(self):
# Verify output has all 3 records
with open(output_path) as f:
- headers = [l for l in f if l.startswith('>')]
+ headers = [l for l in f if l.startswith(">")]
self.assertEqual(len(headers), 3)
def test_stream_copy_fasta_with_accession_filter(self):
@@ -2849,7 +2776,7 @@ def test_stream_copy_fasta_with_accession_filter(self):
input_path = os.path.join(tmpdir, "input.fasta")
output_path = os.path.join(tmpdir, "output.fasta")
- with open(input_path, 'w') as f:
+ with open(input_path, "w") as f:
f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n>ACC3\nTTTTCCCC\n")
count = _stream_copy_fasta(input_path, output_path, accession_set={"ACC1", "ACC3"})
@@ -2872,7 +2799,7 @@ def test_filter_sequences_max_ambiguous_chars(self):
output_path = os.path.join(tmpdir, "filtered.fasta")
# ACC1 has 0 N's, ACC2 has 5 N's, ACC3 has 20 N's
- with open(fasta_path, 'w') as f:
+ with open(fasta_path, "w") as f:
f.write(">ACC1\nATCGATCGATCG\n")
f.write(">ACC2\nATNNNNNCG\n")
f.write(">ACC3\n" + "N" * 20 + "\n")
@@ -2884,13 +2811,14 @@ def test_filter_sequences_max_ambiguous_chars(self):
}
count, filtered_meta, protein_headers, stats = filter_sequences(
- fasta_path, metadata_dict,
+ fasta_path,
+ metadata_dict,
max_ambiguous_chars=10,
output_fasta_path=output_path,
)
self.assertEqual(count, 2) # ACC1 and ACC2 pass
- self.assertEqual(stats['ambiguous_chars'], 1) # ACC3 filtered out
+ self.assertEqual(stats["ambiguous_chars"], 1) # ACC3 filtered out
def test_filter_sequences_no_filters(self):
"""Test filter_sequences passes all records when no filters applied."""
@@ -2898,7 +2826,7 @@ def test_filter_sequences_no_filters(self):
fasta_path = os.path.join(tmpdir, "test.fasta")
output_path = os.path.join(tmpdir, "filtered.fasta")
- with open(fasta_path, 'w') as f:
+ with open(fasta_path, "w") as f:
f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n")
metadata_dict = {
@@ -2907,7 +2835,8 @@ def test_filter_sequences_no_filters(self):
}
count, filtered_meta, protein_headers, stats = filter_sequences(
- fasta_path, metadata_dict,
+ fasta_path,
+ metadata_dict,
output_fasta_path=output_path,
)
@@ -2919,7 +2848,7 @@ def test_filter_sequences_proteins_complete(self):
fasta_path = os.path.join(tmpdir, "test.fasta")
output_path = os.path.join(tmpdir, "filtered.fasta")
- with open(fasta_path, 'w') as f:
+ with open(fasta_path, "w") as f:
f.write(">ACC1\nATCGATCG\n>ACC2\nGGGGAAAA\n")
metadata_dict = {
@@ -2928,13 +2857,14 @@ def test_filter_sequences_proteins_complete(self):
}
count, filtered_meta, protein_headers, stats = filter_sequences(
- fasta_path, metadata_dict,
+ fasta_path,
+ metadata_dict,
proteins_complete=True,
output_fasta_path=output_path,
)
self.assertEqual(count, 1) # Only ACC1 has proteins
- self.assertEqual(stats['proteins'], 1) # ACC2 filtered out
+ self.assertEqual(stats["proteins"], 1) # ACC2 filtered out
# =========================================================================
# SAVE COMMAND SUMMARY TESTS
@@ -2996,14 +2926,18 @@ def test_merge_metadata_csvs_fills_missing(self):
standard_path = os.path.join(tmpdir, "standard.csv")
# GenBank CSV with missing host
- pd.DataFrame([
- {"accession": "ACC1", "Host": "", "Length": "29903"},
- ]).to_csv(genbank_path, index=False)
+ pd.DataFrame(
+ [
+ {"accession": "ACC1", "Host": "", "Length": "29903"},
+ ]
+ ).to_csv(genbank_path, index=False)
# Standard CSV with host data
- pd.DataFrame([
- {"accession": "ACC1", "Host": "Homo sapiens", "Length": "29903"},
- ]).to_csv(standard_path, index=False)
+ pd.DataFrame(
+ [
+ {"accession": "ACC1", "Host": "Homo sapiens", "Length": "29903"},
+ ]
+ ).to_csv(standard_path, index=False)
result = merge_metadata_csvs(genbank_path, standard_path)
self.assertTrue(result)
@@ -3027,13 +2961,17 @@ def test_merge_metadata_csvs_no_overwrite(self):
genbank_path = os.path.join(tmpdir, "genbank.csv")
standard_path = os.path.join(tmpdir, "standard.csv")
- pd.DataFrame([
- {"accession": "ACC1", "Host": "chicken", "Length": "29903"},
- ]).to_csv(genbank_path, index=False)
+ pd.DataFrame(
+ [
+ {"accession": "ACC1", "Host": "chicken", "Length": "29903"},
+ ]
+ ).to_csv(genbank_path, index=False)
- pd.DataFrame([
- {"accession": "ACC1", "Host": "human", "Length": "29903"},
- ]).to_csv(standard_path, index=False)
+ pd.DataFrame(
+ [
+ {"accession": "ACC1", "Host": "human", "Length": "29903"},
+ ]
+ ).to_csv(standard_path, index=False)
merge_metadata_csvs(genbank_path, standard_path)
@@ -3202,7 +3140,7 @@ def test_genbank_xml_to_csv_basic(self):
"""
- with open(xml_path, 'w') as f:
+ with open(xml_path, "w") as f:
f.write(xml_content)
_genbank_xml_to_csv(xml_path, csv_path)
@@ -3241,9 +3179,7 @@ def test_save_genbank_metadata_to_csv_basic(self):
"assembly_name": "ASM985889v3",
"taxonomy": "Viruses; Riboviria",
"comment": "",
- "references": [
- {"title": "Paper", "authors": "Wu F", "journal": "Nature", "pubmed_id": "123"}
- ],
+ "references": [{"title": "Paper", "authors": "Wu F", "journal": "Nature", "pubmed_id": "123"}],
},
},
}
@@ -3267,5 +3203,5 @@ def test_save_genbank_metadata_to_csv_empty(self):
self.assertEqual(len(df), 0)
-if __name__ == '__main__':
+if __name__ == "__main__":
unittest.main()