From 00cdb2fecab636b443632d8782692c1c9ed21642 Mon Sep 17 00:00:00 2001 From: Eugenio Mattei Date: Tue, 13 May 2025 14:10:10 -0400 Subject: [PATCH 1/5] - updated actions --- .github/workflows/build-docker-chromap.yml | 2 +- .github/workflows/build-docker-kallisto-bustools.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-docker-chromap.yml b/.github/workflows/build-docker-chromap.yml index 1d7051d..296de15 100644 --- a/.github/workflows/build-docker-chromap.yml +++ b/.github/workflows/build-docker-chromap.yml @@ -2,7 +2,7 @@ name: IGVF Chromap CI on: push: - branches: [ "dev", "main" ] + branches: [ "dev", "main", "v1" ] paths: - 'modules/igvf-chromap/**' pull_request: diff --git a/.github/workflows/build-docker-kallisto-bustools.yml b/.github/workflows/build-docker-kallisto-bustools.yml index 5037296..587a62d 100644 --- a/.github/workflows/build-docker-kallisto-bustools.yml +++ b/.github/workflows/build-docker-kallisto-bustools.yml @@ -2,7 +2,7 @@ name: IGVF kallisto-bustools CI on: push: - branches: [ "dev", "main" ] + branches: [ "dev", "main", "v1" ] paths: - 'modules/igvf-kallisto-bustools/**' pull_request: From 39aef409b5a93dc2a094909939f4275884161663 Mon Sep 17 00:00:00 2001 From: Eugenio Mattei Date: Fri, 23 May 2025 10:51:16 -0400 Subject: [PATCH 2/5] - adding README to docker --- .../build-docker-kallisto-bustools.yml | 2 +- .../docker_builder.dockerfile | 3 +- modules/igvf-kallisto-bustools/docs/README.md | 57 +++++++++++++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) create mode 100644 modules/igvf-kallisto-bustools/docs/README.md diff --git a/.github/workflows/build-docker-kallisto-bustools.yml b/.github/workflows/build-docker-kallisto-bustools.yml index 587a62d..650da4a 100644 --- a/.github/workflows/build-docker-kallisto-bustools.yml +++ b/.github/workflows/build-docker-kallisto-bustools.yml @@ -2,7 +2,7 @@ name: IGVF kallisto-bustools CI on: push: - branches: [ "dev", "main", "v1" ] + 
branches: [ "dev", "main", "v1", "v1.1" ] paths: - 'modules/igvf-kallisto-bustools/**' pull_request: diff --git a/modules/igvf-kallisto-bustools/docker_builder.dockerfile b/modules/igvf-kallisto-bustools/docker_builder.dockerfile index 3fe48c4..4ddb19a 100644 --- a/modules/igvf-kallisto-bustools/docker_builder.dockerfile +++ b/modules/igvf-kallisto-bustools/docker_builder.dockerfile @@ -8,13 +8,14 @@ FROM python:3.10.16-slim LABEL maintainer="Eugenio Mattei" LABEL software="IGVF single-cell pipeline" -LABEL software.version="1" +LABEL software.version="1.1" LABEL software.organization="IGVF consortium" LABEL software.version.is-production="Yes" LABEL software.task="run-kallisto-bustools-module" LABEL software.description="Run the kallisto-bustools module of the IGVF single-cell pipeline" RUN mkdir /software +COPY docs/README.md /software COPY run_kallisto.py /software COPY pyproject.toml /software RUN cd /software && pip install --upgrade pip && pip install --editable . diff --git a/modules/igvf-kallisto-bustools/docs/README.md b/modules/igvf-kallisto-bustools/docs/README.md new file mode 100644 index 0000000..da58ea6 --- /dev/null +++ b/modules/igvf-kallisto-bustools/docs/README.md @@ -0,0 +1,57 @@ +# Processed Single-Cell Gene Expression Data (kallisto-bustools) + +This archive contains processed single-cell gene expression data generated using **kallisto-bustools**. + +## File Descriptions + +All output files are located in the `counts_unfiltered` directory unless otherwise noted. + +### Count Matrices + +| File | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cells_x_genes.mature.mtx` | Matrix of RNA counts from exonic reads spanning splice junctions (spliced/mature transcripts). 
Corresponds to the `"mature"` layer in the AnnData `.h5ad` file. | +| `cells_x_genes.ambiguous.mtx` | Matrix of RNA counts from ambiguous exonic reads (not spanning junctions). Included in the `"ambiguous"` layer of the AnnData file. | +| `cells_x_genes.cell.mtx` | Sum of mature and ambiguous counts (all exonic reads). | +| `cells_x_genes.nascent.mtx` | Matrix of RNA counts from intronic reads (unspliced/nascent transcripts). Included in the `"nascent"` layer of the AnnData file. | +| `cells_x_genes.nucleus.mtx` | Combination of nascent and ambiguous counts (intronic + ambiguous exonic reads). | +| `cells_x_genes.total.mtx` | Matrix containing all reads: mature, nascent, and ambiguous. This is the default matrix in the `.X` attribute of the AnnData object. | + +### Annotation Files + +| File | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cells_x_genes.barcodes.txt` | List of cell barcodes (one per line), including sample-specific suffixes. Used as the observation index in AnnData and as row indices in all matrices. | +| `cells_x_genes.genes.txt` | List of unique ENSEMBL gene IDs (one per line). Used as the variable index in AnnData and as column indices in all matrices. | +| `cells_x_genes.genes.names.txt` | List of human-readable gene names corresponding to the ENSEMBL IDs in `genes.txt`. Names are not guaranteed to be unique. 
| + +### Additional Output Files + +| File | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `inspect.json` | Summary statistics: total reads, barcodes, UMIs, mean/median reads and UMIs per barcode, and percentages matching known barcode lists (if provided). | +| `kb_info.json` | Metadata for the `kb count` run: software versions, runtime, and command used. | +| `run_info.json` | Summary of kallisto pseudoalignment: processed reads, mapping percentages, reference targets, k-mer length, and kallisto version. | +| `transcripts.txt` | List of transcripts present in the data, in the same order as the transcriptome FASTA file. | +| `matrix.ec` | Two-column file: (1) equivalence class index (0-based), (2) set of transcript indices (0-based, matching order in `transcripts.txt`) in each equivalence class. | +| `output.bus` | Initial BUS file after pseudoalignment: uncorrected, unsorted barcode-UMI-equivalence class records. | +| `output.unfiltered.bus` | Processed and sorted BUS file for counting: corrected barcodes, no UMI filtering. | +| `output_modified.unfiltered.bus` | BUS file with barcodes modified using a replacement list (e.g., for merging oligo-dT and random hexamer barcodes in Parse Biosciences data). Present only if barcode collapsing was performed. | + +### Barcode Collapsing (Parse Biosciences) + +Some platforms (e.g., Parse Biosciences) assign two barcodes per cell (oligo-dT and random hexamer). These are typically collapsed into a single barcode during quantification using a replacement list, retaining the oligo-dT barcode. + +- Collapsed output is saved in `counts_unfiltered_modified/` (same file structure as above), used to generate the primary AnnData `.h5ad` file. 
+- The original, uncollapsed output is preserved in `counts_unfiltered/adata.h5ad` for reproducibility and transparency. + +--- + +## References + +- Kallisto files: + [PMC10690192](https://pmc.ncbi.nlm.nih.gov/articles/PMC10690192/), + [kb_species_mixing tutorial](https://pachterlab.github.io/kallistobustools/tutorials/kb_species_mixing/R/kb_mixed_species_10x_v2/) +- Parse Bio barcoding: + [Parse Biosciences blog](https://www.parsebiosciences.com/blog/getting-started-with-scrna-seq-post-sequencing-data-analysis/), + [Genome Biology article](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02505-w) \ No newline at end of file From c89507aaf2fa20e2b565da737f1717cbe3634b3c Mon Sep 17 00:00:00 2001 From: Eugenio Mattei Date: Fri, 23 May 2025 11:26:21 -0400 Subject: [PATCH 3/5] - updating tar command --- modules/igvf-kallisto-bustools/docs/README.md | 2 +- modules/igvf-kallisto-bustools/run_kallisto.py | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modules/igvf-kallisto-bustools/docs/README.md b/modules/igvf-kallisto-bustools/docs/README.md index da58ea6..2e5813e 100644 --- a/modules/igvf-kallisto-bustools/docs/README.md +++ b/modules/igvf-kallisto-bustools/docs/README.md @@ -54,4 +54,4 @@ Some platforms (e.g., Parse Biosciences) assign two barcodes per cell (oligo-dT [kb_species_mixing tutorial](https://pachterlab.github.io/kallistobustools/tutorials/kb_species_mixing/R/kb_mixed_species_10x_v2/) - Parse Bio barcoding: [Parse Biosciences blog](https://www.parsebiosciences.com/blog/getting-started-with-scrna-seq-post-sequencing-data-analysis/), - [Genome Biology article](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02505-w) \ No newline at end of file + [Genome Biology article](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02505-w) diff --git a/modules/igvf-kallisto-bustools/run_kallisto.py b/modules/igvf-kallisto-bustools/run_kallisto.py index 702791e..a93b203 100644 ---
a/modules/igvf-kallisto-bustools/run_kallisto.py +++ b/modules/igvf-kallisto-bustools/run_kallisto.py @@ -194,8 +194,10 @@ def quantify_nac(temp_dir, index_dir, read_format, output_dir, strand, subpool, except subprocess.CalledProcessError as e: logging.error(f"Command failed with error: {e.stderr}") + work_dir = os.environ["HOME"] + # Archive the directory - archive_cmd = f"tar -kzcvf {output_dir}.tar.gz {output_dir}" + archive_cmd = f"tar -zcvf {output_dir}.tar.gz -C /software README.md -C {work_dir} {output_dir}" logging.info(f"Running archive command: {archive_cmd}") try: result = subprocess.run(archive_cmd, shell=True, capture_output=True, text=True, check=True) From c78e17216e9ec4ef0ca884f7d6bcabb14ea4d5d8 Mon Sep 17 00:00:00 2001 From: Eugenio Mattei Date: Fri, 23 May 2025 11:49:33 -0400 Subject: [PATCH 4/5] - bumping version in toml --- modules/igvf-kallisto-bustools/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/igvf-kallisto-bustools/pyproject.toml b/modules/igvf-kallisto-bustools/pyproject.toml index 470d539..cf20a80 100644 --- a/modules/igvf-kallisto-bustools/pyproject.toml +++ b/modules/igvf-kallisto-bustools/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "igvf-kallisto-bustools" -version = "1.0.0" +version = "1.1.0" description = "Align scRNA using kallisto-bustools" requires-python = "==3.10.16" dependencies = [ From 2786a542afea583348b5a0d6ee7318515245bf65 Mon Sep 17 00:00:00 2001 From: Eugenio Mattei Date: Fri, 23 May 2025 16:04:56 -0400 Subject: [PATCH 5/5] - changing order of tar for simplicity --- modules/igvf-kallisto-bustools/run_kallisto.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/modules/igvf-kallisto-bustools/run_kallisto.py b/modules/igvf-kallisto-bustools/run_kallisto.py index a93b203..8bfacc6 100644 --- a/modules/igvf-kallisto-bustools/run_kallisto.py +++ b/modules/igvf-kallisto-bustools/run_kallisto.py @@ -194,10 +194,8 @@ def quantify_nac(temp_dir, index_dir, 
read_format, output_dir, strand, subpool, except subprocess.CalledProcessError as e: logging.error(f"Command failed with error: {e.stderr}") - work_dir = os.environ["HOME"] - # Archive the directory - archive_cmd = f"tar -zcvf {output_dir}.tar.gz -C /software README.md -C {work_dir} {output_dir}" + archive_cmd = f"tar -zcvf {output_dir}.tar.gz {output_dir} -C /software README.md " logging.info(f"Running archive command: {archive_cmd}") try: result = subprocess.run(archive_cmd, shell=True, capture_output=True, text=True, check=True)