diff --git a/.github/workflows/build-docker-chromap.yml b/.github/workflows/build-docker-chromap.yml index 1d7051d..296de15 100644 --- a/.github/workflows/build-docker-chromap.yml +++ b/.github/workflows/build-docker-chromap.yml @@ -2,7 +2,7 @@ name: IGVF Chromap CI on: push: - branches: [ "dev", "main" ] + branches: [ "dev", "main", "v1" ] paths: - 'modules/igvf-chromap/**' pull_request: diff --git a/.github/workflows/build-docker-kallisto-bustools.yml b/.github/workflows/build-docker-kallisto-bustools.yml index 5037296..650da4a 100644 --- a/.github/workflows/build-docker-kallisto-bustools.yml +++ b/.github/workflows/build-docker-kallisto-bustools.yml @@ -2,7 +2,7 @@ name: IGVF kallisto-bustools CI on: push: - branches: [ "dev", "main" ] + branches: [ "dev", "main", "v1", "v1.1" ] paths: - 'modules/igvf-kallisto-bustools/**' pull_request: diff --git a/modules/igvf-kallisto-bustools/docker_builder.dockerfile b/modules/igvf-kallisto-bustools/docker_builder.dockerfile index 3fe48c4..4ddb19a 100644 --- a/modules/igvf-kallisto-bustools/docker_builder.dockerfile +++ b/modules/igvf-kallisto-bustools/docker_builder.dockerfile @@ -8,13 +8,14 @@ FROM python:3.10.16-slim LABEL maintainer="Eugenio Mattei" LABEL software="IGVF single-cell pipeline" -LABEL software.version="1" +LABEL software.version="1.1" LABEL software.organization="IGVF consortium" LABEL software.version.is-production="Yes" LABEL software.task="run-kallisto-bustools-module" LABEL software.description="Run the kallisto-bustools module of the IGVF single-cell pipeline" RUN mkdir /software +COPY docs/README.md /software COPY run_kallisto.py /software COPY pyproject.toml /software RUN cd /software && pip install --upgrade pip && pip install --editable . 
diff --git a/modules/igvf-kallisto-bustools/docs/README.md b/modules/igvf-kallisto-bustools/docs/README.md new file mode 100644 index 0000000..2e5813e --- /dev/null +++ b/modules/igvf-kallisto-bustools/docs/README.md @@ -0,0 +1,57 @@ +# Processed Single-Cell Gene Expression Data (kallisto-bustools) + +This archive contains processed single-cell gene expression data generated using **kallisto-bustools**. + +## File Descriptions + +All output files are located in the `counts_unfiltered` directory unless otherwise noted. + +### Count Matrices + +| File | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cells_x_genes.mature.mtx` | Matrix of RNA counts from exonic reads spanning splice junctions (spliced/mature transcripts). Corresponds to the `"mature"` layer in the AnnData `.h5ad` file. | +| `cells_x_genes.ambiguous.mtx` | Matrix of RNA counts from ambiguous exonic reads (not spanning junctions). Included in the `"ambiguous"` layer of the AnnData file. | +| `cells_x_genes.cell.mtx` | Sum of mature and ambiguous counts (all exonic reads). | +| `cells_x_genes.nascent.mtx` | Matrix of RNA counts from intronic reads (unspliced/nascent transcripts). Included in the `"nascent"` layer of the AnnData file. | +| `cells_x_genes.nucleus.mtx` | Combination of nascent and ambiguous counts (intronic + ambiguous exonic reads). | +| `cells_x_genes.total.mtx` | Matrix containing all reads: mature, nascent, and ambiguous. This is the default matrix in the `.X` attribute of the AnnData object. 
| + +### Annotation Files + +| File | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `cells_x_genes.barcodes.txt` | List of cell barcodes (one per line), including sample-specific suffixes. Used as the observation index in AnnData and as row indices in all matrices. | +| `cells_x_genes.genes.txt` | List of unique Ensembl gene IDs (one per line). Used as the variable index in AnnData and as column indices in all matrices. | +| `cells_x_genes.genes.names.txt` | List of human-readable gene names corresponding to the Ensembl IDs in `cells_x_genes.genes.txt`. Names are not guaranteed to be unique. | + +### Additional Output Files + +| File | Description | +|------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `inspect.json` | Summary statistics: total reads, barcodes, UMIs, mean/median reads and UMIs per barcode, and percentages matching known barcode lists (if provided). | +| `kb_info.json` | Metadata for the `kb count` run: software versions, runtime, and command used. | +| `run_info.json` | Summary of kallisto pseudoalignment: processed reads, mapping percentages, reference targets, k-mer length, and kallisto version. | +| `transcripts.txt` | List of transcripts present in the data, in the same order as the transcriptome FASTA file. | +| `matrix.ec` | Two-column file: (1) equivalence class index (0-based), (2) set of transcript indices (0-based, matching order in `transcripts.txt`) in each equivalence class. | +| `output.bus` | Initial BUS file after pseudoalignment: uncorrected, unsorted barcode/UMI/equivalence-class records. 
| +| `output.unfiltered.bus` | Processed and sorted BUS file for counting: corrected barcodes, no UMI filtering. | +| `output_modified.unfiltered.bus` | BUS file with barcodes modified using a replacement list (e.g., for merging oligo-dT and random hexamer barcodes in Parse Biosciences data). Present only if barcode collapsing was performed. | + +### Barcode Collapsing (Parse Biosciences) + +Some platforms (e.g., Parse Biosciences) assign two barcodes per cell (oligo-dT and random hexamer). These are typically collapsed into a single barcode during quantification using a replacement list, retaining the oligo-dT barcode. + +- Collapsed output is saved in `counts_unfiltered_modified/` (same file structure as above) and is used to generate the primary AnnData `.h5ad` file. +- The original, uncollapsed output is preserved in `counts_unfiltered/adata.h5ad` for reproducibility and transparency. + +--- + +## References + +- Kallisto files: + [PMC10690192](https://pmc.ncbi.nlm.nih.gov/articles/PMC10690192/), + [kb_species_mixing tutorial](https://pachterlab.github.io/kallistobustools/tutorials/kb_species_mixing/R/kb_mixed_species_10x_v2/) +- Parse Bio barcoding: + [Parse Biosciences blog](https://www.parsebiosciences.com/blog/getting-started-with-scrna-seq-post-sequencing-data-analysis/), + [Genome Biology article](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-021-02505-w) diff --git a/modules/igvf-kallisto-bustools/pyproject.toml b/modules/igvf-kallisto-bustools/pyproject.toml index 470d539..cf20a80 100644 --- a/modules/igvf-kallisto-bustools/pyproject.toml +++ b/modules/igvf-kallisto-bustools/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "igvf-kallisto-bustools" -version = "1.0.0" +version = "1.1.0" description = "Align scRNA using kallisto-bustools" requires-python = "==3.10.16" dependencies = [ diff --git a/modules/igvf-kallisto-bustools/run_kallisto.py b/modules/igvf-kallisto-bustools/run_kallisto.py index 702791e..8bfacc6 100644 --- 
a/modules/igvf-kallisto-bustools/run_kallisto.py +++ b/modules/igvf-kallisto-bustools/run_kallisto.py @@ -195,7 +195,7 @@ def quantify_nac(temp_dir, index_dir, read_format, output_dir, strand, subpool, logging.error(f"Command failed with error: {e.stderr}") # Archive the directory - archive_cmd = f"tar -kzcvf {output_dir}.tar.gz {output_dir}" + archive_cmd = f"tar -zcvf {output_dir}.tar.gz {output_dir} -C /software README.md " logging.info(f"Running archive command: {archive_cmd}") try: result = subprocess.run(archive_cmd, shell=True, capture_output=True, text=True, check=True)