Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
ccef5a6
Synthesize terrible code for one-command hap indexing and sampling
adamnovak Mar 6, 2026
57c0696
Synthesize slightly better code that models more things in the graph
adamnovak Mar 6, 2026
c5d06dc
Synthesize code to avoid shell
adamnovak Mar 6, 2026
171c893
Automatically split haplotype indexing kmer defaults from short read …
adamnovak Mar 6, 2026
7c49f5e
Simplify casting around child process
adamnovak Mar 6, 2026
d62309f
Rationalize index graph and try and fail to get sampled GBZ to use th…
adamnovak Mar 7, 2026
f54a4a6
Get partway into a scope applying system that is bad
adamnovak Mar 9, 2026
6df0ca3
Implement scopes and fix storing new paths for aliases
adamnovak Mar 10, 2026
79a5672
Simplify alias fulfillment
adamnovak Mar 10, 2026
ebd38ae
Make plan really understand provided indexes and actually attach scop…
adamnovak Mar 10, 2026
9550308
Don't add empty scopes and quiet debugging
adamnovak Mar 10, 2026
5ae95d5
Add a terrible multi-extension system that needs hacky workarounds an…
adamnovak Mar 10, 2026
888d65c
Add missing dependency
adamnovak Mar 10, 2026
a486904
Hook up CLI for non-diploid sampling and controlling haplotype count
adamnovak Mar 10, 2026
14950bd
Wrap long help
adamnovak Mar 10, 2026
b8260f4
Fix CLI code to pass linting
adamnovak Mar 10, 2026
d8604be
Implement Snakemake-y wildcards to try and let scopes control what su…
adamnovak Mar 11, 2026
9272a72
Don't show wildcards in get_possible_filenames()
adamnovak Mar 11, 2026
2171d4a
Fix wildcard parsing
adamnovak Mar 11, 2026
293ce49
Add tests to keep wildcard system working
adamnovak Mar 11, 2026
46a83cc
Merge remote-tracking branch 'origin/master' into single-command-hap-…
adamnovak Mar 16, 2026
89b6bc6
Add missing filesystem include
adamnovak Mar 16, 2026
62dfb3c
Save Giraffe logs and clean up any wayward old outputs
adamnovak Mar 17, 2026
50369ca
Merge remote-tracking branch 'origin/master' into single-command-hap-…
adamnovak Mar 17, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Brewfile
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,5 @@ brew "boost"
brew "pybind11"
brew "pandoc"
brew "openssl"
tap "brewsci/bio"
brew "brewsci/bio/kmc"
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -y install \
samtools curl unzip redland-utils librdf-dev cmake pkg-config wget gtk-doc-tools \
raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev \
libcairo2-dev libpixman-1-dev libffi-dev libcairo-dev libprotobuf-dev libboost-all-dev \
tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev
tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev kmc
###DEPS_END###

FROM packages AS build
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ On other distros, or if you do not have root access, you will need to perform th
automake gettext autopoint libtool jq bsdmainutils bc rs parallel \
npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \
liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \
libzstd-dev pybind11-dev python3-pybind11 libssl-dev
libzstd-dev pybind11-dev python3-pybind11 libssl-dev kmc

At present, you will need GCC version 9 or greater, with support for C++17, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported.

Expand Down
3 changes: 3 additions & 0 deletions doc/test-docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ cd "${HERE}/../test"
echo txm --jobs 1 "${HERE}/../README.md"
txm --jobs 1 "${HERE}/../README.md"

# Test extra test READMEs
(cd haplotype-sampling && txm --jobs 1 "README.md")

# Run all the wiki tests
find "${HERE}/wiki" -name "*.md" | xargs -n 1 -t txm --jobs 1

Expand Down
875 changes: 762 additions & 113 deletions src/index_registry.cpp

Large diffs are not rendered by default.

114 changes: 101 additions & 13 deletions src/index_registry.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
#include <stdexcept>
#include <limits>

#include <gbwtgraph/utils.h>

namespace vg {

using namespace std;
Expand All @@ -36,6 +38,9 @@ using IndexGroup = set<IndexName>;

/**
* Names a recipe in the collection of registered recipes.
*
* A max-value number means the indexes are provided and not generated by any
* recipe.
*/
using RecipeName = pair<IndexGroup, size_t>;

Expand Down Expand Up @@ -128,6 +133,16 @@ struct IndexingParameters {
static double thread_chunk_inflation_factor;
// whether indexing algorithms will log progress (if available) [Basic]
static Verbosity verbosity;
// Set of samples to make references and retain during haplotype sampling [{}]
static gbwtgraph::sample_name_set haplotype_sampling_reference_samples;
// Number of haplotypes to sample during haplotype sampling [Recombinator::NUM_CANDIDATES]
static size_t haplotype_sampling_num_haplotypes;
// Whether to do diploid sampling during haplotype sampling [true]
static bool haplotype_sampling_diploid;
// length of k-mer used in haplotype index and kmer counting [29]
static int haplotype_sampling_minimizer_k;
// length of window used in haplotype index [11]
static int haplotype_sampling_minimizer_w;
};

/**
Expand Down Expand Up @@ -167,6 +182,13 @@ class IndexingPlan {
string output_filepath(const IndexName& identifier) const;

/// Get the suffix with which to save the given index's files.
///
/// The path will substitute scopes into any {wildcards} in the index's
/// suffix, and scopes not used in the suffix will appear in the
/// filename in key order.
///
/// The suffix used will be the first one for which all wildcards can be
/// substituted.
string output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const;

/// Get the steps of the plan
Expand All @@ -175,24 +197,43 @@ class IndexingPlan {
/// Returns true if the given index is to be intermediate under the given
/// plan, and false if it is to be preserved.
bool is_intermediate(const IndexName& identifier) const;

/// Get the scopes that the given index will be qualified with when
/// generated. Only works for indexes that will be generated by the plan,
/// not for inputs.
const map<string, string> get_scopes(const IndexName& identifier) const;

/// Get the scopes that the given index will be qualified with when
/// generated. Only works for indexes that will be generated by the plan,
/// not for inputs.
const map<string, string>& get_scopes(const IndexName& identifier);

// TODO: is this where this function wants to live?
/// The memory limit, with a little slosh for prediction inaccuracy
int64_t target_memory_usage() const;
/// The mmeory limit with no slosh
/// The memory limit with no slosh
int64_t literal_target_memory_usage() const;

/// Returns the recipes in the plan that depend on this index, including the one in which
/// it was created (if any)
/// it was created (if any).
set<RecipeName> dependents(const IndexName& identifier) const;

protected:

/// Add a scope to the scopes the given index will be qualified with when
/// generated, if not present already.
void add_scope(const IndexName& identifier, const string& key, const string& value);

/// The steps to be invoked in the plan. May be empty before the plan is
/// actually planned.
vector<RecipeName> steps;
/// The indexes to create as outputs.
set<IndexName> targets;
/// The scopes qualifying each index in the plan (for example, if an index
/// applies only to a particular sample, it will be scoped to the sample
/// name). Stored using an ordered map to ensure a consistent iteration
/// order.
map<IndexName, map<string, string>> scopes;

/// The registry that the plan is using.
/// The registry must not move while the plan is in use.
Expand Down Expand Up @@ -244,6 +285,17 @@ class IndexRegistry {

/// Register an index containing the given identifier
void register_index(const IndexName& identifier, const string& suffix);

/// Register an index containing the given identifier, with multiple possible suffixes.
/// The first suffix where all {wildcards} can be substituted with scopes will be used.
void register_index(const IndexName& identifier, const vector<string>& suffixes);

/// Get the names of all brace-enclosed {wildcards} in the given pattern
static set<string> get_wildcards(const string& pattern);

/// Substitute wildcards into the given pattern. All wildcards must have
/// values assigned. Extra values not used are allowed.
static string substitute_wildcards(const string& pattern, const map<string, string> values);

/// Register a recipe to produce an index using other indexes
/// or input files. Recipes registered earlier will have higher priority.
Expand All @@ -254,12 +306,22 @@ class IndexRegistry {
/// Indicate one recipe is a broadened version of another. The indexes consumed and produced
/// by the generalization must be semantically identical to those of the generalizee
void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee);

/// Indicate a serialized file that contains some identified index
void provide(const IndexName& identifier, const string& filename);

/// Indicate a list of serialized files that contains some identified index
void provide(const IndexName& identifier, const vector<string>& filenames);

/// Indicate a serialized file that contains some identified index,
/// optionally with scopes that propagate to descendant files.
///
/// TODO: If scopes contain ".", we can run into problems with combinations
/// of different scopes producing the same final string. Right now we only
/// use one kind of scope, which avoids this.
void provide(const IndexName& identifier, const string& filename, const map<string, string>& scopes = {});

/// Indicate a list of serialized files that contains some identified index,
/// optionally with scopes that propagate to descendant files.
///
/// TODO: If scopes contain ".", we can run into problems with combinations
/// of different scopes producing the same final string. Right now we only
/// use one kind of scope, which avoids this.
void provide(const IndexName& identifier, const vector<string>& filenames, const map<string, string>& scopes = {});

/// Remove a provided index
void reset(const IndexName& identifier);
Expand All @@ -269,6 +331,7 @@ class IndexRegistry {
bool available(const IndexName& identifier) const;

/// Get the possible filename(s) associated with the given index with the given prefix.
/// TODO: Get this to account for sample-scoped indexes.
vector<string> get_possible_filenames(const IndexName& identifier) const;

/// Get the filename(s) associated with the given index. Aborts if the
Expand Down Expand Up @@ -317,6 +380,14 @@ class IndexRegistry {
/// generate a plan to create the indexes
IndexingPlan make_plan(const IndexGroup& end_products) const;

/// Check if a recipe identifier corresponds to a recipe.
///
/// Recipe identifiers not corresponding to actual recipes are used during
/// planning to represent provided inputs.
///
/// TODO: Refactor that with some kind of tagged union or optional.
bool has_recipe(const RecipeName& recipe_name) const;

/// use a recipe identifier to get the recipe
const IndexRecipe& get_recipe(const RecipeName& recipe_name) const;

Expand Down Expand Up @@ -371,12 +442,15 @@ class IndexFile {

/// Create a new IndexFile with a unique identifier
IndexFile(const IndexName& identifier, const string& suffix);

/// Create a new IndexFile with a unique identifier, which may use one of several suffixes.
IndexFile(const IndexName& identifier, const vector<string>& suffixes);

/// Get the globally unique identifier for this index
const IndexName& get_identifier() const;

/// Returns the suffix to be used for this index
const string& get_suffix() const;
/// Returns the suffixes that can be used for this index
const vector<string>& get_suffixes() const;

/// Get the filename(s) that contain this index
const vector<string>& get_filenames() const;
Expand All @@ -386,7 +460,18 @@ class IndexFile {

/// Assign constructed filenames to this index
void assign_constructed(const vector<string>& filenames);


/// Add a new scope to qualify the files in the index, if the scope is
/// not already on the index.
///
/// Should only be used on files that are actually filled in already;
/// scopes for files that don't exist yet are the responsibility of the
/// IndexingPlan.
void add_scope(const string& key, const string& value);

/// Get all scopes qualifying the index (such as a sample name).
const map<string, string>& get_scopes() const;

/// Returns true if the index has already been built or provided
bool is_finished() const;

Expand All @@ -401,11 +486,14 @@ class IndexFile {
// the global identifier for the index
IndexName identifier;

// the suffix it adds to output files
const string suffix;
// the suffixes it can add to output files
const vector<string> suffixes;

// the filename(s) associated with the index
vector<string> filenames;

// The scopes qualifying the index.
map<string, string> scopes;

// keep track of whether the index was provided directly
bool provided_directly = false;
Expand Down
11 changes: 10 additions & 1 deletion src/subcommand/autoindex_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,16 @@ int main_autoindex(int argc, char** argv) {
if (!registry.available(target)) {
vector<string> inferred_file_names = registry.get_possible_filenames(target);
for (const string& filename : inferred_file_names) {
if (ifstream(filename).is_open()) {
if (target == "Giraffe GBZ" && !ends_with(filename, ".giraffe.gbz")) {
// TODO: Giraffe GBZ indexes can be saved as .<sample>.gbz or .giraffe.gbz.
// But we can't pick up .<sample>.gbz automatically without possibly mistaking a plain GBZ for a Giraffe one.
// And we don't handle haplotype sampling in autoindex yet anyway.
// So only find Giraffe GBZs.
// TODO: Allow the suffixes to have Snakemake-style
// wildcards that populate scopes on the files.
continue;
}
if (file_exists(filename)) {
logger.info() << "Guessing that " << filename << " is " << target << endl;
registry.provide(target, filename);
break;
Expand Down
Loading
Loading