From ccef5a609bc6de1b81c6d1addf759afff72cabba Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 16:10:06 -0500 Subject: [PATCH 01/22] Synthesize terrible code for one-command hap indexing and sampling --- Brewfile | 2 + src/index_registry.cpp | 258 +++++++++++++++++++++++++++++++- src/index_registry.hpp | 6 + src/subcommand/giraffe_main.cpp | 177 +++++++++------------- test/t/54_vg_haplotypes.t | 37 ++++- 5 files changed, 371 insertions(+), 109 deletions(-) diff --git a/Brewfile b/Brewfile index 2869f99db4..9a7376ad58 100644 --- a/Brewfile +++ b/Brewfile @@ -22,3 +22,5 @@ brew "boost" brew "pybind11" brew "pandoc" brew "openssl" +tap "brewsci/bio" +brew "brewsci/bio/kmc" diff --git a/src/index_registry.cpp b/src/index_registry.cpp index db3aafc896..6304650405 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include "gfa.hpp" #include "job_schedule.hpp" #include "path.hpp" +#include "recombinator.hpp" #include "zip_code.hpp" #include "io/save_handle_graph.hpp" @@ -125,6 +127,9 @@ int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAUL double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; +std::unordered_set IndexingParameters::haplotype_reference_samples = {}; +std::string IndexingParameters::haplotype_fastq_1 = ""; +std::string IndexingParameters::haplotype_fastq_2 = ""; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -619,6 +624,9 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); registry.register_index("Giraffe GBZ", "giraffe.gbz"); + registry.register_index("Unsampled Giraffe GBZ", "unsampled.gbz"); + 
registry.register_index("Haplotype Index", "hapl"); + registry.register_index("KFF Kmer Counts", "kff"); registry.register_index("Short Read Minimizers", "shortread.withzip.min"); registry.register_index("Short Read Zipcodes", "shortread.zipcodes"); @@ -3957,7 +3965,255 @@ IndexRegistry VGIndexes::get_vg_index_registry() { //////////////////////////////////// // GBZ Recipes //////////////////////////////////// - + + // Haplotype sampling: produce a personalized "Giraffe GBZ" by sampling + // haplotypes from a full-pangenome "Unsampled Giraffe GBZ" given + // haplotype information and kmer counts from reads. + registry.register_recipe({"Giraffe GBZ"}, {"Haplotype Index", "KFF Kmer Counts", "Unsampled Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Sampling haplotypes." << endl; + } + + assert(inputs.size() == 3); + auto hapl_filenames = inputs[0]->get_filenames(); + auto kff_filenames = inputs[1]->get_filenames(); + auto gbz_filenames = inputs[2]->get_filenames(); + assert(gbz_filenames.size() == 1); + assert(hapl_filenames.size() == 1); + assert(kff_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + auto hapl_filename = hapl_filenames.front(); + auto kff_filename = kff_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + // Load GBZ. + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Override reference samples if requested. + if (!IndexingParameters::haplotype_reference_samples.empty()) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Updating reference samples." 
<< endl; + } + size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_reference_samples); + if (present != IndexingParameters::haplotype_reference_samples.size()) { + warn(context) << "Only " << present << " out of " + << IndexingParameters::haplotype_reference_samples.size() + << " reference samples are present." << endl; + } + } + + // Load haplotype information. + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Loading haplotype information from " << hapl_filename << "." << endl; + } + Haplotypes haplotypes; + haplotypes.load_from(hapl_filename); + + // Sample haplotypes. + Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None + ? Haplotypes::verbosity_basic + : Haplotypes::verbosity_silent); + Recombinator recombinator(gbz, haplotypes, hap_verbosity); + Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); + gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); + + // Build GBWTGraph and save GBZ. + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building GBWTGraph." << endl; + } + gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); + string output_name = plan->output_filepath(gbz_output); + save_gbz(sampled_graph, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Allow "Unsampled Giraffe GBZ" to be built from GBWT + GBWTGraph, + // so users who provide those two files can still do haplotype sampling. + registry.register_recipe({"Unsampled Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Combining Giraffe GBWT and GBWTGraph into unsampled GBZ." 
<< endl; + } + + assert(inputs.size() == 2); + auto gg_filenames = inputs[0]->get_filenames(); + auto gbwt_filenames = inputs[1]->get_filenames(); + assert(gg_filenames.size() == 1); + assert(gbwt_filenames.size() == 1); + auto gg_filename = gg_filenames.front(); + auto gbwt_filename = gbwt_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "Haplotype Index" (.hapl) from an unsampled GBZ. + // The distance index is found at .dist or built from the GBZ. + // The r-index is constructed inline from the GBZ's GBWT. + // Using only "Unsampled Giraffe GBZ" as input avoids the cycle: + // Haplotype Index → Giraffe Distance Index → Giraffe GBZ → Haplotype Index + registry.register_recipe({"Haplotype Index"}, {"Unsampled Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Generating haplotype information." << endl; + } + + assert(inputs.size() == 1); + auto gbz_filenames = inputs[0]->get_filenames(); + assert(gbz_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto hapl_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + // Load GBZ. 
+ gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Find the distance index by stripping known GBZ suffixes from the filename. + SnarlDistanceIndex distance_index; + string dist_candidate = gbz_filename; + for (const string& suffix : {".unsampled.gbz", ".giraffe.gbz", ".gbz"}) { + if (dist_candidate.size() > suffix.size() && + dist_candidate.substr(dist_candidate.size() - suffix.size()) == suffix) { + dist_candidate = dist_candidate.substr(0, dist_candidate.size() - suffix.size()) + ".dist"; + break; + } + } + if (ifstream(dist_candidate).is_open()) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Loading distance index from " << dist_candidate << "." << endl; + } + distance_index.deserialize(dist_candidate); + } else { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building distance index." << endl; + } + IntegratedSnarlFinder snarl_finder(gbz.graph); + fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder); + } + + // Build minimizer index without payload, using short-read parameters. + MinimizerIndexParameters minimizer_params; + minimizer_params.minimizers(IndexingParameters::short_read_minimizer_k, + IndexingParameters::short_read_minimizer_w) + .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); + HaplotypePartitioner::minimizer_index_type minimizer_index = + build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); + + // Build r-index inline from the GBZ's GBWT. + gbwt::FastLocate r_index(gbz.index); + r_index.setGBWT(gbz.index); + + // Partition the haplotypes. + Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None + ? 
Haplotypes::verbosity_basic + : Haplotypes::verbosity_silent); + HaplotypePartitioner partitioner(gbz, r_index, distance_index, minimizer_index, hap_verbosity); + HaplotypePartitioner::Parameters partitioner_params; + Haplotypes haplotypes = partitioner.partition_haplotypes(partitioner_params); + + // Save the haplotype information. + string output_name = plan->output_filepath(hapl_output); + haplotypes.serialize_to(output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "KFF Kmer Counts" (.kff) by counting k-mers from reads using kmc. + // FASTQ paths are passed via IndexingParameters::haplotype_fastq_1/2. + registry.register_recipe({"KFF Kmer Counts"}, {}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Counting k-mers from reads for haplotype sampling." << endl; + } + + const string& fastq1 = IndexingParameters::haplotype_fastq_1; + const string& fastq2 = IndexingParameters::haplotype_fastq_2; + if (fastq1.empty()) { + error(context) << "No reads provided for k-mer counting. " + << "Use -f to specify input reads." << endl; + } + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto kff_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(kff_output); + + // Create a temp directory for kmc intermediate files. + string tmp_dir = temp_file::create_directory(); + + // Write a file listing the input reads. + string list_file = tmp_dir + "/reads.txt"; + { + ofstream list_out(list_file); + list_out << fastq1 << "\n"; + if (!fastq2.empty()) { + list_out << fastq2 << "\n"; + } + } + + // kmc produces output_prefix.kff; strip the ".kff" extension to get the prefix. 
+ string kmc_prefix = output_name; + if (kmc_prefix.size() >= 4 && kmc_prefix.substr(kmc_prefix.size() - 4) == ".kff") { + kmc_prefix = kmc_prefix.substr(0, kmc_prefix.size() - 4); + } + + string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; + mkdir(kmc_tmp_dir.c_str(), 0700); + + // Run kmc: count 29-mers in KFF format with canonical k-mers. + string cmd = "kmc -k29 -m128 -okff -hp @" + list_file + + " " + kmc_prefix + " " + kmc_tmp_dir; + if (IndexingParameters::verbosity == IndexingParameters::None) { + cmd += " > /dev/null 2>&1"; + } + int ret = system(cmd.c_str()); + if (ret != 0) { + error(context) << "kmc failed with exit code " << ret + << ". Make sure kmc is in your PATH." << endl; + } + + output_names.push_back(output_name); + return all_outputs; + }); + registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 1cb84a4d80..c5f7a03296 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -128,6 +128,12 @@ struct IndexingParameters { static double thread_chunk_inflation_factor; // whether indexing algorithms will log progress (if available) [Basic] static Verbosity verbosity; + // reference samples to override in GBZ during haplotype sampling [{}] + static std::unordered_set haplotype_reference_samples; + // first FASTQ file to use for k-mer counting in haplotype sampling [""] + static std::string haplotype_fastq_1; + // second FASTQ file to use for k-mer counting in haplotype sampling [""] + static std::string haplotype_fastq_2; }; /** diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c68b938db5..7dc401fa38 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -31,7 +31,6 @@ #include #include "../gbwtgraph_helper.hpp" -#include "../recombinator.hpp" #include #include @@ -617,15 +616,6 @@ std::string strip_suffixes(std::string filename, const std::vector& return filename; } -// 
Returns the name of the sampled GBZ. -std::string sample_haplotypes( - const Logger& logger, - const std::vector>& indexes, - const std::unordered_set& reference_samples, - const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, - bool progress -); - //---------------------------------------------------------------------------- void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map& presets, bool full_help) { @@ -668,6 +658,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map reference_samples; string output_basename; @@ -1071,6 +1065,7 @@ int main_giraffe(int argc, char** argv) { {"zipcode-name", required_argument, 0, 'z'}, {"dist-name", required_argument, 0, 'd'}, {"progress", no_argument, 0, 'p'}, + {"haplotype-sampling", no_argument, 0, OPT_HAPLOTYPE_SAMPLING}, {"haplotype-name", required_argument, 0, OPT_HAPLOTYPE_NAME}, {"kff-name", required_argument, 0, OPT_KFF_NAME}, {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, @@ -1186,6 +1181,9 @@ int main_giraffe(int argc, char** argv) { show_progress = true; break; + case OPT_HAPLOTYPE_SAMPLING: + haplotype_sampling_flag = true; + break; case OPT_HAPLOTYPE_NAME: haplotype_name = optarg; break; @@ -1405,6 +1403,12 @@ int main_giraffe(int argc, char** argv) { for (const auto& filename : provided_indexes) { require_exists(logger, filename.second); } + if (!haplotype_name.empty()) { + require_exists(logger, haplotype_name); + } + if (!kff_name.empty()) { + require_exists(logger, kff_name); + } // Decide if we are outputting to an htslib format bool hts_output = (output_format == "SAM" || output_format == "BAM" || output_format == "CRAM"); @@ -1459,21 +1463,72 @@ int main_giraffe(int argc, char** argv) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." 
<< endl; } - bool haplotype_sampling = !haplotype_name.empty() & !kff_name.empty(); + // Haplotype sampling is active when explicitly requested (--haplotype-sampling) + // or when either --haplotype-name or --kff-name is given. + bool haplotype_sampling = haplotype_sampling_flag || !haplotype_name.empty() || !kff_name.empty(); + // Save the GBZ-derived basename before any override, so we can find + // dist and other indexes co-located with the source GBZ during sampling. + string original_basename = index_basename; if (!index_basename_override.empty()) { index_basename = index_basename_override; } if (haplotype_sampling) { - // If we do haplotype sampling, we get a new GBZ and later build indexes for it. - string gbz_name = sample_haplotypes(logger, provided_indexes, reference_samples, index_basename, - sample_name, haplotype_name, kff_name, show_progress); - registry.provide("Giraffe GBZ", gbz_name); - index_basename = split_ext(gbz_name).first; + + // Determine sample name early so we can incorporate it into the + // index basename; downstream indexes (dist, min) will then be + // co-located with the sampled GBZ. + string sample = sample_name; + if (sample.empty()) { + if (!kff_name.empty()) { + sample = file_base_name(kff_name); + if (show_progress) { + logger.info() << "Guessing from " << kff_name + << " that sample name is " << sample << std::endl; + } + } else if (!fastq_filename_1.empty()) { + // Strip .gz then use file_base_name to strip .fq/.fastq + sample = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); + if (show_progress) { + logger.info() << "Guessing from " << fastq_filename_1 + << " that sample name is " << sample << std::endl; + } + } else { + sample = "sample"; + } + } + if (sample == "giraffe") { + logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; + } + index_basename = index_basename + "." + sample; + + // Pass parameters to the recipes through IndexingParameters. 
+ IndexingParameters::haplotype_reference_samples = reference_samples; + IndexingParameters::haplotype_fastq_1 = fastq_filename_1; + IndexingParameters::haplotype_fastq_2 = fastq_filename_2; + + // Provide inputs to the registry. The full-pangenome GBZ is typed + // as "Unsampled Giraffe GBZ" so the sampling recipe can distinguish + // it from the personalized "Giraffe GBZ" it will produce. + for (auto& index : provided_indexes) { + if (index.first == "Giraffe GBZ") { + registry.provide("Unsampled Giraffe GBZ", index.second); + } else { + registry.provide(index.first, index.second); + } + } + + // Provide haplotype index and kff only if explicitly given. + if (!haplotype_name.empty()) { + registry.provide("Haplotype Index", haplotype_name); + } + if (!kff_name.empty()) { + registry.provide("KFF Kmer Counts", kff_name); + } + } else { - // Otherwise we use the provided indexes. + // Otherwise use the provided indexes directly. for (auto& index : provided_indexes) { registry.provide(index.first, index.second); - } } registry.set_prefix(index_basename); @@ -2296,91 +2351,5 @@ int main_giraffe(int argc, char** argv) { //---------------------------------------------------------------------------- -std::string sample_haplotypes( - const Logger& logger, - const std::vector>& indexes, - const std::unordered_set& reference_samples, - const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, - bool progress -) { - if (progress) { - logger.info() << "Sampling haplotypes" << std::endl; - } - - // Sanity checks. - if (haplotype_file.empty() || kff_file.empty()) { - logger.error() << "Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; - } - - // Determine output name. 
- std::string sample = sample_name; - if (sample.empty()) { - sample = file_base_name(kff_file); - if (progress) { - logger.info() << "Guessing from " << kff_file - << " that sample name is " << sample << std::endl; - } - } - if (sample == "giraffe") { - logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; - } - std::string output_name = basename + "." + sample + ".gbz"; - - // Load GBZ. - gbwtgraph::GBZ gbz; - if (indexes.size() == 1 && indexes[0].first == "Giraffe GBZ") { - load_gbz(gbz, indexes[0].second, progress); - } else if (indexes.size() == 2 && indexes[0].first == "Giraffe GBWT" && indexes[1].first == "GBWTGraph") { - load_gbz(gbz, indexes[0].second, indexes[1].second, progress); - } else if (indexes.size() == 2 && indexes[0].first == "GBWTGraph" && indexes[1].first == "Giraffe GBWT") { - load_gbz(gbz, indexes[1].second, indexes[0].second, progress); - } else { - logger.error() << "Haplotype sampling requires either -Z " - << "or (-g and -H) with no other indexes." << std::endl; - } - - // Override reference samples if necessary. - if (!reference_samples.empty()) { - if (progress) { - logger.info() << "Updating reference samples" << std::endl; - } - size_t present = gbz.set_reference_samples(reference_samples); - if (present != reference_samples.size()) { - logger.warn() << "Only " << present << " out of " << reference_samples.size() - << " reference samples are present" << std::endl; - } - } - - // Load haplotype information. - if (progress) { - logger.info() << "Loading haplotype information from " - << haplotype_file << std::endl; - } - Haplotypes haplotypes; - haplotypes.load_from(haplotype_file); - - // Sample haplotypes. - Haplotypes::Verbosity verbosity = (progress ? 
Haplotypes::verbosity_basic : Haplotypes::verbosity_silent); - Recombinator recombinator(gbz, haplotypes, verbosity); - Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); - gbwt::GBWT sampled_gbwt; - try { - sampled_gbwt = recombinator.generate_haplotypes(kff_file, parameters); - } catch (const std::runtime_error& e) { - logger.error() << "Haplotype sampling failed: " << e.what() << std::endl; - } - - // Create GBWTGraph and save GBZ. - if (progress) { - logger.info() << "Building GBWTGraph" << std::endl; - } - gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); - save_gbz(sampled_graph, output_name, progress); - - return output_name; -} - -//---------------------------------------------------------------------------- - // Register subcommand static Subcommand vg_giraffe("giraffe", "fast haplotype-aware read alignment", PIPELINE, 6, main_giraffe); diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index b951283800..eb82e1da08 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 34 +plan tests 40 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph: # - GRCh38#chr6:31498145-31511124 (micb) @@ -73,7 +73,7 @@ is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null is $? 0 "Giraffe integration with a guessed output name" -cmp diploid.gbz full.HG003.gbz +cmp diploid.gbz full.HG003.giraffe.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name @@ -81,7 +81,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename sampled -N 003HG \ -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null is $? 
0 "Giraffe integration with a specified output name" -cmp full.HG003.gbz sampled.003HG.gbz +cmp full.HG003.giraffe.gbz sampled.003HG.giraffe.gbz is $? 0 "the sampled graphs are identical" # Giraffe integration, specified reference sample @@ -89,9 +89,35 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null is $? 0 "Giraffe integration with a specified reference sample" -cmp diploid3.gbz GRCh38.HG003.gbz +cmp diploid3.gbz GRCh38.HG003.giraffe.gbz is $? 0 "the sampled graph is identical to a manually sampled one" +# Giraffe integration, haplotype index built automatically from the GBZ. +# Providing --kff-name without --haplotype-name implies haplotype sampling; +# the haplotype index (.hapl) is built by the IndexRegistry from the GBZ. +vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ + --index-basename auto_hapl -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam 2> /dev/null +is $? 0 "Giraffe builds haplotype index automatically" +is $(vg gbwt -H -Z auto_hapl.HG003.giraffe.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" + +# Giraffe integration, kmer counting done automatically from reads. +# Providing --haplotype-name without --kff-name implies haplotype sampling; +# the kmer counts (.kff) are built by the IndexRegistry from the reads using kmc. +vg giraffe -Z full.gbz --haplotype-name full.hapl \ + --index-basename auto_kff -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_kff.gam 2> /dev/null +is $? 0 "Giraffe counts kmers from reads automatically" +is $(vg gbwt -H -Z auto_kff.HG003.giraffe.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" + +# Giraffe integration, fully automatic: both hapl and kff are built by the +# IndexRegistry. Triggered by --haplotype-sampling without either input file. 
+vg giraffe -Z full.gbz --haplotype-sampling \ + --index-basename auto_all -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_all.gam 2> /dev/null +is $? 0 "Giraffe does fully automatic haplotype sampling" +is $(vg gbwt -H -Z auto_all.HG003.giraffe.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" + # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt is $? 1 "sampling with mismatched GBZ and haplotype information fails" @@ -104,4 +130,7 @@ rm -f diploid.gbz diploid2.gbz diploid3.gbz rm -f full.HG003.* default.gam rm -f sampled.003HG.* specified.gam rm -f GRCh38.HG003.* HG003_GRCh38.gam +rm -f auto_hapl.HG003.* auto_hapl.gam +rm -f auto_kff.HG003.* auto_kff.gam +rm -f auto_all.HG003.* auto_all.gam rm -f log.txt \ No newline at end of file From 57c06969b75aad76734035bf80f461263227724b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 16:37:48 -0500 Subject: [PATCH 02/22] Synthesize slightly better code that models more things in the graph --- src/index_registry.cpp | 161 +++++++++++++++++++++----------- src/index_registry.hpp | 4 - src/subcommand/giraffe_main.cpp | 27 +++++- 3 files changed, 134 insertions(+), 58 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 6304650405..da0284e574 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -128,8 +128,6 @@ double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; std::unordered_set IndexingParameters::haplotype_reference_samples = {}; -std::string IndexingParameters::haplotype_fastq_1 = ""; -std::string IndexingParameters::haplotype_fastq_2 = ""; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -625,6 +623,10 @@ IndexRegistry 
VGIndexes::get_vg_index_registry() { registry.register_index("GBZ", "gbz"); registry.register_index("Giraffe GBZ", "giraffe.gbz"); registry.register_index("Unsampled Giraffe GBZ", "unsampled.gbz"); + registry.register_index("Unsampled Giraffe GBZ Distance Index", "unsampled.dist"); + registry.register_index("Unsampled Giraffe GBZ Top Level Chain Distance Index", "tcdist"); + registry.register_index("Unsampled Giraffe GBZ r index", "unsampled.ri"); + registry.register_index("Haplotype FASTQs", "hap.fastq"); registry.register_index("Haplotype Index", "hapl"); registry.register_index("KFF Kmer Counts", "kff"); @@ -4072,57 +4074,119 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Build "Haplotype Index" (.hapl) from an unsampled GBZ. - // The distance index is found at .dist or built from the GBZ. - // The r-index is constructed inline from the GBZ's GBWT. - // Using only "Unsampled Giraffe GBZ" as input avoids the cycle: - // Haplotype Index → Giraffe Distance Index → Giraffe GBZ → Haplotype Index - registry.register_recipe({"Haplotype Index"}, {"Unsampled Giraffe GBZ"}, + // Build the r-index for an unsampled GBZ from its GBWT. + registry.register_recipe({"Unsampled Giraffe GBZ r index"}, {"Unsampled Giraffe GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Generating haplotype information." << endl; + info(context) << "Building r-index for unsampled GBZ." << endl; } assert(inputs.size() == 1); - auto gbz_filenames = inputs[0]->get_filenames(); - assert(gbz_filenames.size() == 1); - auto gbz_filename = gbz_filenames.front(); + auto gbz_filename = inputs[0]->get_filenames().front(); assert(constructing.size() == 1); vector> all_outputs(constructing.size()); - auto hapl_output = *constructing.begin(); auto& output_names = all_outputs[0]; - // Load GBZ. 
gbwtgraph::GBZ gbz; load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - // Find the distance index by stripping known GBZ suffixes from the filename. - SnarlDistanceIndex distance_index; - string dist_candidate = gbz_filename; - for (const string& suffix : {".unsampled.gbz", ".giraffe.gbz", ".gbz"}) { - if (dist_candidate.size() > suffix.size() && - dist_candidate.substr(dist_candidate.size() - suffix.size()) == suffix) { - dist_candidate = dist_candidate.substr(0, dist_candidate.size() - suffix.size()) + ".dist"; - break; - } + gbwt::FastLocate r_index(gbz.index); + string output_name = plan->output_filepath(*constructing.begin()); + save_r_index(r_index, output_name, IndexingParameters::verbosity != IndexingParameters::None); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Top-level-chain distance index: alias from a full unsampled distance index + // (higher priority — no extra computation needed). + registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, + {"Unsampled Giraffe GBZ Distance Index"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + // Top-level-chain distance index: build cheaply from the GBZ with + // only_top_level_chain_distances=true (lower priority). + registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, + {"Unsampled Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building top-level-chain distance index for unsampled GBZ." 
<< endl; } - if (ifstream(dist_candidate).is_open()) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Loading distance index from " << dist_candidate << "." << endl; - } - distance_index.deserialize(dist_candidate); - } else { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building distance index." << endl; - } - IntegratedSnarlFinder snarl_finder(gbz.graph); - fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder); + + assert(inputs.size() == 1); + auto gbz_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(*constructing.begin()); + SnarlDistanceIndex distance_index; + IntegratedSnarlFinder snarl_finder(gbz.graph); + fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder, + /*size_limit=*/50000, /*only_top_level_chain_distances=*/true); + distance_index.serialize(output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "Haplotype Index" (.hapl) from the unsampled GBZ, its top-level-chain + // distance index, and its r-index. + // Alphabetical input order: GBZ < Top Level Chain Distance Index < r index + registry.register_recipe({"Haplotype Index"}, + {"Unsampled Giraffe GBZ", + "Unsampled Giraffe GBZ Top Level Chain Distance Index", + "Unsampled Giraffe GBZ r index"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Generating haplotype information." 
<< endl; } + assert(inputs.size() == 3); + // inputs[0] = Unsampled Giraffe GBZ + // inputs[1] = Unsampled Giraffe GBZ Top Level Chain Distance Index + // inputs[2] = Unsampled Giraffe GBZ r index + auto gbz_filename = inputs[0]->get_filenames().front(); + auto dist_filename = inputs[1]->get_filenames().front(); + auto ri_filename = inputs[2]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + // Load GBZ. + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Load distance index. + SnarlDistanceIndex distance_index; + distance_index.deserialize(dist_filename); + + // Load r-index and restore the GBWT pointer. + gbwt::FastLocate r_index; + load_r_index(r_index, ri_filename, IndexingParameters::verbosity != IndexingParameters::None); + r_index.setGBWT(gbz.index); + // Build minimizer index without payload, using short-read parameters. MinimizerIndexParameters minimizer_params; minimizer_params.minimizers(IndexingParameters::short_read_minimizer_k, @@ -4131,10 +4195,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { HaplotypePartitioner::minimizer_index_type minimizer_index = build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); - // Build r-index inline from the GBZ's GBWT. - gbwt::FastLocate r_index(gbz.index); - r_index.setGBWT(gbz.index); - // Partition the haplotypes. Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None ? Haplotypes::verbosity_basic @@ -4144,16 +4204,16 @@ IndexRegistry VGIndexes::get_vg_index_registry() { Haplotypes haplotypes = partitioner.partition_haplotypes(partitioner_params); // Save the haplotype information. 
- string output_name = plan->output_filepath(hapl_output); + string output_name = plan->output_filepath(*constructing.begin()); haplotypes.serialize_to(output_name); output_names.push_back(output_name); return all_outputs; }); - // Build "KFF Kmer Counts" (.kff) by counting k-mers from reads using kmc. - // FASTQ paths are passed via IndexingParameters::haplotype_fastq_1/2. - registry.register_recipe({"KFF Kmer Counts"}, {}, + // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads + // using kmc. The reads are registered as "Haplotype FASTQs". + registry.register_recipe({"KFF Kmer Counts"}, {"Haplotype FASTQs"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4162,19 +4222,17 @@ IndexRegistry VGIndexes::get_vg_index_registry() { info(context) << "Counting k-mers from reads for haplotype sampling." << endl; } - const string& fastq1 = IndexingParameters::haplotype_fastq_1; - const string& fastq2 = IndexingParameters::haplotype_fastq_2; - if (fastq1.empty()) { - error(context) << "No reads provided for k-mer counting. " - << "Use -f to specify input reads." << endl; + assert(inputs.size() == 1); + auto fastq_filenames = inputs[0]->get_filenames(); + if (fastq_filenames.empty()) { + error(context) << "No reads provided for k-mer counting." << endl; } assert(constructing.size() == 1); vector> all_outputs(constructing.size()); - auto kff_output = *constructing.begin(); auto& output_names = all_outputs[0]; - string output_name = plan->output_filepath(kff_output); + string output_name = plan->output_filepath(*constructing.begin()); // Create a temp directory for kmc intermediate files. 
string tmp_dir = temp_file::create_directory(); @@ -4183,9 +4241,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string list_file = tmp_dir + "/reads.txt"; { ofstream list_out(list_file); - list_out << fastq1 << "\n"; - if (!fastq2.empty()) { - list_out << fastq2 << "\n"; + for (const string& fq : fastq_filenames) { + list_out << fq << "\n"; } } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index c5f7a03296..c11d283469 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -130,10 +130,6 @@ struct IndexingParameters { static Verbosity verbosity; // reference samples to override in GBZ during haplotype sampling [{}] static std::unordered_set haplotype_reference_samples; - // first FASTQ file to use for k-mer counting in haplotype sampling [""] - static std::string haplotype_fastq_1; - // second FASTQ file to use for k-mer counting in haplotype sampling [""] - static std::string haplotype_fastq_2; }; /** diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 7dc401fa38..7fb032bb83 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -1503,15 +1504,18 @@ int main_giraffe(int argc, char** argv) { // Pass parameters to the recipes through IndexingParameters. IndexingParameters::haplotype_reference_samples = reference_samples; - IndexingParameters::haplotype_fastq_1 = fastq_filename_1; - IndexingParameters::haplotype_fastq_2 = fastq_filename_2; // Provide inputs to the registry. The full-pangenome GBZ is typed // as "Unsampled Giraffe GBZ" so the sampling recipe can distinguish // it from the personalized "Giraffe GBZ" it will produce. + // Likewise, a provided "Giraffe Distance Index" is re-typed as + // "Unsampled Giraffe GBZ Distance Index" since it belongs to the + // unsampled graph. 
for (auto& index : provided_indexes) { if (index.first == "Giraffe GBZ") { registry.provide("Unsampled Giraffe GBZ", index.second); + } else if (index.first == "Giraffe Distance Index") { + registry.provide("Unsampled Giraffe GBZ Distance Index", index.second); } else { registry.provide(index.first, index.second); } @@ -1525,6 +1529,25 @@ int main_giraffe(int argc, char** argv) { registry.provide("KFF Kmer Counts", kff_name); } + // Provide FASTQs as a registry node so the KFF kmer counting recipe + // can discover them without going through IndexingParameters. + if (!fastq_filename_1.empty()) { + vector fastqs = {fastq_filename_1}; + if (!fastq_filename_2.empty()) { + fastqs.push_back(fastq_filename_2); + } + registry.provide("Haplotype FASTQs", fastqs); + } + + // Auto-provide the unsampled distance index from the original GBZ + // basename if a .dist file exists there and we don't already have one. + if (!registry.available("Unsampled Giraffe GBZ Distance Index")) { + string dist_candidate = original_basename + ".dist"; + if (ifstream(dist_candidate).good()) { + registry.provide("Unsampled Giraffe GBZ Distance Index", dist_candidate); + } + } + } else { // Otherwise use the provided indexes directly. 
for (auto& index : provided_indexes) { From c5d06dcb614bbf7dcca710b60f38794c1f578b01 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 17:08:44 -0500 Subject: [PATCH 03/22] Synthesize code to avoid shell --- README.md | 2 +- src/index_registry.cpp | 63 ++++++++++++++++++++++++++------- src/subcommand/giraffe_main.cpp | 13 ++++--- 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 592270d463..a3e1d5e4cd 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ On other distros, or if you do not have root access, you will need to perform th automake gettext autopoint libtool jq bsdmainutils bc rs parallel \ npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \ liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ - libzstd-dev pybind11-dev python3-pybind11 libssl-dev + libzstd-dev pybind11-dev python3-pybind11 libssl-dev kmc At present, you will need GCC version 9 or greater, with support for C++17, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. 
diff --git a/src/index_registry.cpp b/src/index_registry.cpp index da0284e574..e16f6d0b44 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -626,7 +628,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("Unsampled Giraffe GBZ Distance Index", "unsampled.dist"); registry.register_index("Unsampled Giraffe GBZ Top Level Chain Distance Index", "tcdist"); registry.register_index("Unsampled Giraffe GBZ r index", "unsampled.ri"); - registry.register_index("Haplotype FASTQs", "hap.fastq"); + registry.register_index("Sample FASTQ", "sample.fastq"); registry.register_index("Haplotype Index", "hapl"); registry.register_index("KFF Kmer Counts", "kff"); @@ -4212,8 +4214,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads - // using kmc. The reads are registered as "Haplotype FASTQs". - registry.register_recipe({"KFF Kmer Counts"}, {"Haplotype FASTQs"}, + // using kmc. The reads are registered as "Sample FASTQ". + registry.register_recipe({"KFF Kmer Counts"}, {"Sample FASTQ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4256,15 +4258,52 @@ IndexRegistry VGIndexes::get_vg_index_registry() { mkdir(kmc_tmp_dir.c_str(), 0700); // Run kmc: count 29-mers in KFF format with canonical k-mers. - string cmd = "kmc -k29 -m128 -okff -hp @" + list_file - + " " + kmc_prefix + " " + kmc_tmp_dir; - if (IndexingParameters::verbosity == IndexingParameters::None) { - cmd += " > /dev/null 2>&1"; - } - int ret = system(cmd.c_str()); - if (ret != 0) { - error(context) << "kmc failed with exit code " << ret - << ". Make sure kmc is in your PATH." << endl; + // Use fork()+execvp() instead of system() to avoid shell quoting issues. 
+ string reads_arg = "@" + list_file; + { + // Flush all stdio streams before forking so the child doesn't + // inherit buffered data that would be flushed twice. + fflush(nullptr); + cout.flush(); + cerr.flush(); + + pid_t pid = fork(); + if (pid == -1) { + error(context) << "fork() failed for kmc: " << strerror(errno) << endl; + } else if (pid == 0) { + // Child: redirect stdout/stderr to /dev/null when not verbose. + if (IndexingParameters::verbosity == IndexingParameters::None) { + int devnull = open("/dev/null", O_WRONLY); + if (devnull != -1) { + dup2(devnull, STDOUT_FILENO); + dup2(devnull, STDERR_FILENO); + close(devnull); + } + } + char* kmc_argv[] = { + const_cast("kmc"), + const_cast("-k29"), + const_cast("-m128"), + const_cast("-okff"), + const_cast("-hp"), + const_cast(reads_arg.c_str()), + const_cast(kmc_prefix.c_str()), + const_cast(kmc_tmp_dir.c_str()), + nullptr + }; + execvp("kmc", kmc_argv); + // execvp only returns on failure. + _exit(127); + } else { + int child_stat; + waitpid(pid, &child_stat, 0); + int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; + if (ret == 127) { + error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." << endl; + } else if (ret != 0) { + error(context) << "kmc failed with exit code " << ret << "." 
<< endl; + } + } } output_names.push_back(output_name); diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 7fb032bb83..956733aa92 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -27,6 +27,7 @@ #include "../hts_alignment_emitter.hpp" #include "../minimizer_mapper.hpp" #include "../index_registry.hpp" +#include "../utility.hpp" #include "../watchdog.hpp" #include "../crash.hpp" #include @@ -1536,14 +1537,18 @@ int main_giraffe(int argc, char** argv) { if (!fastq_filename_2.empty()) { fastqs.push_back(fastq_filename_2); } - registry.provide("Haplotype FASTQs", fastqs); + registry.provide("Sample FASTQ", fastqs); } // Auto-provide the unsampled distance index from the original GBZ - // basename if a .dist file exists there and we don't already have one. + // basename if a dist file exists there and we don't already have one. if (!registry.available("Unsampled Giraffe GBZ Distance Index")) { - string dist_candidate = original_basename + ".dist"; - if (ifstream(dist_candidate).good()) { + // Prefer a top-level-chain-only .tcdist, then fall back to full .dist. 
+ string tcdist_candidate = original_basename + ".tcdist"; + string dist_candidate = original_basename + ".dist"; + if (file_exists(tcdist_candidate)) { + registry.provide("Unsampled Giraffe GBZ Top Level Chain Distance Index", tcdist_candidate); + } else if (file_exists(dist_candidate)) { registry.provide("Unsampled Giraffe GBZ Distance Index", dist_candidate); } } From 171c89347df6752aeb3c2e41673751b809323a2f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 17:17:43 -0500 Subject: [PATCH 04/22] Automatically split haplotype indexing kmer defaults from short read ones --- src/index_registry.cpp | 11 +++++++---- src/index_registry.hpp | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index e16f6d0b44..9d17496e58 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -130,6 +130,8 @@ double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; std::unordered_set IndexingParameters::haplotype_reference_samples = {}; +int IndexingParameters::haplotype_minimizer_k = 29; +int IndexingParameters::haplotype_minimizer_w = 11; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -4191,8 +4193,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Build minimizer index without payload, using short-read parameters. 
MinimizerIndexParameters minimizer_params; - minimizer_params.minimizers(IndexingParameters::short_read_minimizer_k, - IndexingParameters::short_read_minimizer_w) + minimizer_params.minimizers(IndexingParameters::haplotype_minimizer_k, + IndexingParameters::haplotype_minimizer_w) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); HaplotypePartitioner::minimizer_index_type minimizer_index = build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); @@ -4257,9 +4259,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; mkdir(kmc_tmp_dir.c_str(), 0700); - // Run kmc: count 29-mers in KFF format with canonical k-mers. + // Run kmc: count k-mers in KFF format with canonical k-mers. // Use fork()+execvp() instead of system() to avoid shell quoting issues. string reads_arg = "@" + list_file; + string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_minimizer_k); { // Flush all stdio streams before forking so the child doesn't // inherit buffered data that would be flushed twice. 
@@ -4282,7 +4285,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } char* kmc_argv[] = { const_cast("kmc"), - const_cast("-k29"), + const_cast(kmc_k_arg.c_str()), const_cast("-m128"), const_cast("-okff"), const_cast("-hp"), diff --git a/src/index_registry.hpp b/src/index_registry.hpp index c11d283469..82ea89a2c1 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -130,6 +130,10 @@ struct IndexingParameters { static Verbosity verbosity; // reference samples to override in GBZ during haplotype sampling [{}] static std::unordered_set haplotype_reference_samples; + // length of k-mer used in haplotype index and kmer counting [29] + static int haplotype_minimizer_k; + // length of window used in haplotype index [11] + static int haplotype_minimizer_w; }; /** From 7c49f5ee00d6ac6f94b003f0c0ea3bf192f555a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 17:34:23 -0500 Subject: [PATCH 05/22] Simplify casting around child process --- src/index_registry.cpp | 84 ++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 9d17496e58..2a3cc43893 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4251,9 +4251,11 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } // kmc produces output_prefix.kff; strip the ".kff" extension to get the prefix. 
- string kmc_prefix = output_name; - if (kmc_prefix.size() >= 4 && kmc_prefix.substr(kmc_prefix.size() - 4) == ".kff") { - kmc_prefix = kmc_prefix.substr(0, kmc_prefix.size() - 4); + string kmc_prefix; + if (output_name.size() >= 4 && output_name.substr(output_name.size() - 4) == ".kff") { + kmc_prefix = output_name.substr(0, output_name.size() - 4); + } else { + error(context) << "KFF file has wrong extension: " << output_name << endl; } string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; @@ -4263,50 +4265,42 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Use fork()+execvp() instead of system() to avoid shell quoting issues. string reads_arg = "@" + list_file; string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_minimizer_k); - { - // Flush all stdio streams before forking so the child doesn't - // inherit buffered data that would be flushed twice. - fflush(nullptr); - cout.flush(); - cerr.flush(); - - pid_t pid = fork(); - if (pid == -1) { - error(context) << "fork() failed for kmc: " << strerror(errno) << endl; - } else if (pid == 0) { - // Child: redirect stdout/stderr to /dev/null when not verbose. - if (IndexingParameters::verbosity == IndexingParameters::None) { - int devnull = open("/dev/null", O_WRONLY); - if (devnull != -1) { - dup2(devnull, STDOUT_FILENO); - dup2(devnull, STDERR_FILENO); - close(devnull); - } - } - char* kmc_argv[] = { - const_cast("kmc"), - const_cast(kmc_k_arg.c_str()), - const_cast("-m128"), - const_cast("-okff"), - const_cast("-hp"), - const_cast(reads_arg.c_str()), - const_cast(kmc_prefix.c_str()), - const_cast(kmc_tmp_dir.c_str()), - nullptr - }; - execvp("kmc", kmc_argv); - // execvp only returns on failure. - _exit(127); - } else { - int child_stat; - waitpid(pid, &child_stat, 0); - int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; - if (ret == 127) { - error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." 
<< endl; - } else if (ret != 0) { - error(context) << "kmc failed with exit code " << ret << "." << endl; + } } + // execvp promises never to modify its arguments but can't say + that in the C++ type system because it causes problems in the C + type system. See https://stackoverflow.com/a/19505361 + execvp("kmc", (char**)kmc_argv); + // execvp only returns on failure. + _exit(127); + } else { + int child_stat; + waitpid(pid, &child_stat, 0); + int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; + if (ret == 127) { + error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." << endl; + } else if (ret != 0) { + error(context) << "kmc failed with exit code " << ret << "." 
<< endl; + } } output_names.push_back(output_name); From d62309fb8865a7ac4a2f792b217a9378e156c5bc Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 19:32:41 -0500 Subject: [PATCH 06/22] Rationalize index graph and try and fail to get sampled GBZ to use the documented name --- doc/test-docs.sh | 3 + src/index_registry.cpp | 510 ++++++++++++++---------------- src/subcommand/giraffe_main.cpp | 153 +++++---- test/haplotype-sampling/README.md | 26 +- test/t/54_vg_haplotypes.t | 14 +- 5 files changed, 346 insertions(+), 360 deletions(-) diff --git a/doc/test-docs.sh b/doc/test-docs.sh index 246e7c07aa..c9f0e6f9b0 100755 --- a/doc/test-docs.sh +++ b/doc/test-docs.sh @@ -16,6 +16,9 @@ cd "${HERE}/../test" echo txm --jobs 1 "${HERE}/../README.md" txm --jobs 1 "${HERE}/../README.md" +# Test extra test READMEs +(cd haplotype-sampling && txm --jobs 1 "README.md") + # Run all the wiki tests find "${HERE}/wiki" -name "*.md" | xargs -n 1 -t txm --jobs 1 diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 2a3cc43893..021567bdec 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -626,11 +626,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); registry.register_index("Giraffe GBZ", "giraffe.gbz"); - registry.register_index("Unsampled Giraffe GBZ", "unsampled.gbz"); - registry.register_index("Unsampled Giraffe GBZ Distance Index", "unsampled.dist"); - registry.register_index("Unsampled Giraffe GBZ Top Level Chain Distance Index", "tcdist"); - registry.register_index("Unsampled Giraffe GBZ r index", "unsampled.ri"); - registry.register_index("Sample FASTQ", "sample.fastq"); + registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); + registry.register_index("Top Level Chain Distance Index", "tcdist"); + registry.register_index("r Index", "ri"); + registry.register_index("FASTQ", "fastq"); registry.register_index("Haplotype Index", "hapl"); 
registry.register_index("KFF Kmer Counts", "kff"); @@ -3969,13 +3968,223 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); //////////////////////////////////// - // GBZ Recipes + // Giraffe GBZ Recipes //////////////////////////////////// - // Haplotype sampling: produce a personalized "Giraffe GBZ" by sampling - // haplotypes from a full-pangenome "Unsampled Giraffe GBZ" given - // haplotype information and kmer counts from reads. - registry.register_recipe({"Giraffe GBZ"}, {"Haplotype Index", "KFF Kmer Counts", "Unsampled Giraffe GBZ"}, + // Top priority: If we have the inputs to do haplotype sampling, use a + haplotype-sampled GBZ. + registry.register_recipe({"Giraffe GBZ"}, {"Haplotype-Sampled GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + // Next priority: if we were handed a GBZ, use it as the Giraffe GBZ + registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + // After that, start trying the old GBWT-based workflows + registry.register_recipe({"Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Combining Giraffe GBWT and GBWTGraph into GBZ." 
<< endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[1]->get_filenames(); + auto gg_filenames = inputs[0]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(gg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto gg_filename = gg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // These used to be a GBWTGraph recipe, but we don't want to produce GBWTGraphs anymore. + + registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "NamedNodeBackTranslation", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing GBZ using NamedNodeBackTranslation." 
<< endl; + } + + assert(inputs.size() == 3); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto translation_filenames = inputs[1]->get_filenames(); + auto xg_filenames = inputs[2]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(translation_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto translation_filename = translation_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + ifstream infile_translation; + init_in(infile_translation, translation_filename); + // There's only one implementation we can use here at the moment, so + // don't bother with the normal loader/saver system. + FlatFileBackTranslation translation(infile_translation); + + gbwt::GBWT index; + load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + gbwtgraph::GBZ gbz(std::move(index), *xg_index, &translation); + + // We need to compute pggname manually, because a generic HandleGraph + // does not contain the name of the parent graph. And we use nullptr, + // because we currently cannot determine the name from another source. + gbz.compute_pggname(nullptr); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing GBZ." 
<< endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto xg_filenames = inputs[1]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + gbwt::GBWT index; + load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + gbwtgraph::GBZ gbz(std::move(index), *xg_index, algorithms::find_translation(xg_index.get())); + + // We need to compute pggname manually, because a generic HandleGraph + // does not contain the name of the parent graph. And we use nullptr, + // because we currently cannot determine the name from another source. + gbz.compute_pggname(nullptr); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // General GBZ Recipes + //////////////////////////////////// + + registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing a GBZ from GFA input." << endl; + } + + assert(inputs.size() == 1); + if (inputs[0]->get_filenames().size() != 1) { + error(context) << "Graph construction does not support multiple GFAs at this time." 
<< endl; + } + auto gfa_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(gbz_output); + + gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); + // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, + // but it would be kind of a pain to update it like we do the global param + params.batch_size = IndexingParameters::gbwt_insert_batch_size; + params.sample_interval = IndexingParameters::gbwt_sampling_interval; + params.max_node_length = IndexingParameters::max_node_size; + // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, + // we could safely launch more jobs. Using 600 as the divisor would be a better + // estimate of the number of segments, but we chop long segments to 32 bp nodes. 
+ params.parallel_jobs = guess_parallel_gbwt_jobs( + std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), + 100, + plan->target_memory_usage(), + params.batch_size + ); + params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + info(context) << "Running " << params.parallel_jobs << " jobs in parallel" << std::endl; + } + + // jointly generate the GBWT and record sequences + unique_ptr gbwt_index; + unique_ptr seq_source; + tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); + + // convert sequences into GBZ and save + gbwtgraph::GBZ gbz(gbwt_index, seq_source); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // Haplotype Sampling Recipes + //////////////////////////////////// + + // Haplotype sampling: produce a personalized "Haplotype-Sampled GBZ" + registry.register_recipe({"Haplotype-Sampled GBZ"}, {"GBZ", "Haplotype Index", "KFF Kmer Counts"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -3985,9 +4194,9 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } assert(inputs.size() == 3); - auto hapl_filenames = inputs[0]->get_filenames(); - auto kff_filenames = inputs[1]->get_filenames(); - auto gbz_filenames = inputs[2]->get_filenames(); + auto gbz_filenames = inputs[0]->get_filenames(); + auto hapl_filenames = inputs[1]->get_filenames(); + auto kff_filenames = inputs[2]->get_filenames(); assert(gbz_filenames.size() == 1); assert(hapl_filenames.size() == 1); assert(kff_filenames.size() == 1); @@ -4044,48 +4253,14 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Allow "Unsampled Giraffe GBZ" to be built from GBWT + GBWTGraph, - // so users who provide those two files can still do haplotype sampling. 
- registry.register_recipe({"Unsampled Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + // Build the r-index for a GBZ from its GBWT. + registry.register_recipe({"r Index"}, {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Combining Giraffe GBWT and GBWTGraph into unsampled GBZ." << endl; - } - - assert(inputs.size() == 2); - auto gg_filenames = inputs[0]->get_filenames(); - auto gbwt_filenames = inputs[1]->get_filenames(); - assert(gg_filenames.size() == 1); - assert(gbwt_filenames.size() == 1); - auto gg_filename = gg_filenames.front(); - auto gbwt_filename = gbwt_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - gbwtgraph::GBZ gbz; - load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - // Build the r-index for an unsampled GBZ from its GBWT. - registry.register_recipe({"Unsampled Giraffe GBZ r index"}, {"Unsampled Giraffe GBZ"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building r-index for unsampled GBZ." << endl; + info(context) << "Building r-index for GBZ." << endl; } assert(inputs.size() == 1); @@ -4106,28 +4281,14 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Top-level-chain distance index: alias from a full unsampled distance index - // (higher priority — no extra computation needed). 
- registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, - {"Unsampled Giraffe GBZ Distance Index"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - alias_graph.register_alias(*constructing.begin(), inputs[0]); - return vector>(1, inputs.front()->get_filenames()); - }); - - // Top-level-chain distance index: build cheaply from the GBZ with - // only_top_level_chain_distances=true (lower priority). - registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, - {"Unsampled Giraffe GBZ"}, + registry.register_recipe({"Top Level Chain Distance Index"}, + {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building top-level-chain distance index for unsampled GBZ." << endl; + info(context) << "Building top-level-chain distance index for GBZ." << endl; } assert(inputs.size() == 1); @@ -4143,21 +4304,21 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string output_name = plan->output_filepath(*constructing.begin()); SnarlDistanceIndex distance_index; IntegratedSnarlFinder snarl_finder(gbz.graph); + // Only do top level chains fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder, - /*size_limit=*/50000, /*only_top_level_chain_distances=*/true); + 50000, true); distance_index.serialize(output_name); output_names.push_back(output_name); return all_outputs; }); - // Build "Haplotype Index" (.hapl) from the unsampled GBZ, its top-level-chain + // Build "Haplotype Index" (.hapl) from the GBZ, its top-level-chain // distance index, and its r-index. 
- // Alphabetical input order: GBZ < Top Level Chain Distance Index < r index registry.register_recipe({"Haplotype Index"}, - {"Unsampled Giraffe GBZ", - "Unsampled Giraffe GBZ Top Level Chain Distance Index", - "Unsampled Giraffe GBZ r index"}, + {"GBZ", + "Top Level Chain Distance Index", + "r Index"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4167,9 +4328,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } assert(inputs.size() == 3); - // inputs[0] = Unsampled Giraffe GBZ - // inputs[1] = Unsampled Giraffe GBZ Top Level Chain Distance Index - // inputs[2] = Unsampled Giraffe GBZ r index auto gbz_filename = inputs[0]->get_filenames().front(); auto dist_filename = inputs[1]->get_filenames().front(); auto ri_filename = inputs[2]->get_filenames().front(); @@ -4191,7 +4349,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { load_r_index(r_index, ri_filename, IndexingParameters::verbosity != IndexingParameters::None); r_index.setGBWT(gbz.index); - // Build minimizer index without payload, using short-read parameters. + // Build minimizer index without payload. MinimizerIndexParameters minimizer_params; minimizer_params.minimizers(IndexingParameters::haplotype_minimizer_k, IndexingParameters::haplotype_minimizer_w) @@ -4216,8 +4374,12 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads - // using kmc. The reads are registered as "Sample FASTQ". - registry.register_recipe({"KFF Kmer Counts"}, {"Sample FASTQ"}, + // using kmc. + // + // Having the FASTQ available to the indexing logic is the limiting reagent + // that should determine if we do haplotype sampling or not, when given a + // GBZ. 
+ registry.register_recipe({"KFF Kmer Counts"}, {"FASTQ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4307,196 +4469,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - alias_graph.register_alias(*constructing.begin(), inputs[0]); - return vector>(1, inputs.front()->get_filenames()); - }); - - registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing a GBZ from GFA input." << endl; - } - - assert(inputs.size() == 1); - if (inputs[0]->get_filenames().size() != 1) { - error(context) << "Graph construction does not support multiple GFAs at this time." << endl; - } - auto gfa_filename = inputs[0]->get_filenames().front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - string output_name = plan->output_filepath(gbz_output); - - gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); - // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, - // but it would be kind of a pain to update it like we do the global param - params.batch_size = IndexingParameters::gbwt_insert_batch_size; - params.sample_interval = IndexingParameters::gbwt_sampling_interval; - params.max_node_length = IndexingParameters::max_node_size; - // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, - // we could safely launch more jobs. 
Using 600 as the divisor would be a better - // estimate of the number of segments, but we chop long segments to 32 bp nodes. - params.parallel_jobs = guess_parallel_gbwt_jobs( - std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), - 100, - plan->target_memory_usage(), - params.batch_size - ); - params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; - if (IndexingParameters::verbosity >= IndexingParameters::Debug) { - info(context) << "Running " << params.parallel_jobs << " jobs in parallel" << std::endl; - } - - // jointly generate the GBWT and record sequences - unique_ptr gbwt_index; - unique_ptr seq_source; - tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); - - // convert sequences into GBZ and save - gbwtgraph::GBZ gbz(gbwt_index, seq_source); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - registry.register_recipe({"Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Combining Giraffe GBWT and GBWTGraph into GBZ." 
<< endl; - } - - assert(inputs.size() == 2); - auto gbwt_filenames = inputs[1]->get_filenames(); - auto gg_filenames = inputs[0]->get_filenames(); - assert(gbwt_filenames.size() == 1); - assert(gg_filenames.size() == 1); - auto gbwt_filename = gbwt_filenames.front(); - auto gg_filename = gg_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - gbwtgraph::GBZ gbz; - load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - // Thses used to be a GBWTGraph recipe, but we don't want to produce GBWTGraphs anymore. - - registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "NamedNodeBackTranslation", "XG"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing GBZ using NamedNodeBackTranslation." 
<< endl; - } - - assert(inputs.size() == 3); - auto gbwt_filenames = inputs[0]->get_filenames(); - auto translation_filenames = inputs[1]->get_filenames(); - auto xg_filenames = inputs[2]->get_filenames(); - assert(gbwt_filenames.size() == 1); - assert(translation_filenames.size() == 1); - assert(xg_filenames.size() == 1); - auto gbwt_filename = gbwt_filenames.front(); - auto translation_filename = translation_filenames.front(); - auto xg_filename = xg_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - ifstream infile_xg; - init_in(infile_xg, xg_filename); - auto xg_index = vg::io::VPKG::load_one(infile_xg); - - ifstream infile_translation; - init_in(infile_translation, translation_filename); - // There's only one implementation we can use here at the moment, so - // don't bother with the normal loader/saver system. - FlatFileBackTranslation translation(infile_translation); - - gbwt::GBWT index; - load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - gbwtgraph::GBZ gbz(std::move(index), *xg_index, &translation); - - // We need to compute pggname manually, because a generic HandleGraph - // does not contain the name of the parent graph. And we use nullptr, - // because we currently cannot determine the name from another souce. - gbz.compute_pggname(nullptr); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "XG"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing GBZ." 
<< endl; - } - - assert(inputs.size() == 2); - auto gbwt_filenames = inputs[0]->get_filenames(); - auto xg_filenames = inputs[1]->get_filenames(); - assert(gbwt_filenames.size() == 1); - assert(xg_filenames.size() == 1); - auto gbwt_filename = gbwt_filenames.front(); - auto xg_filename = xg_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - ifstream infile_xg; - init_in(infile_xg, xg_filename); - auto xg_index = vg::io::VPKG::load_one(infile_xg); - - gbwt::GBWT index; - load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - gbwtgraph::GBZ gbz(std::move(index), *xg_index, algorithms::find_translation(xg_index.get())); - - // We need to compute pggname manually, because a generic HandleGraph - // does not contain the name of the parent graph. And we use nullptr, - // because we currently cannot determine the name from another souce. - gbz.compute_pggname(nullptr); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); //////////////////////////////////// // Minimizers Recipes diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 956733aa92..079947185d 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -764,8 +764,10 @@ int main_giraffe(int argc, char** argv) { IndexRegistry registry = VGIndexes::get_vg_index_registry(); // Indexes provided to IndexRegistry in the arguments. We do not apply them - // immediately, because we may want to do haplotype sampling. - vector> provided_indexes; + // immediately, because we may want to do haplotype sampling. We need them + // indexed for lookup. We only support one file per index (FASTQs get + // handled separately). 
+ std::unordered_map provided_indexes; string index_basename, index_basename_override; // For haplotype sampling. @@ -1134,7 +1136,7 @@ int main_giraffe(int argc, char** argv) { case 'Z': // All provided_indexes are later validated // with require_exists() - provided_indexes.emplace_back("Giraffe GBZ", optarg); + provided_indexes.emplace("GBZ", optarg); // If we have a GBZ we probably want to use its name as the base name. // But see -g. @@ -1143,7 +1145,7 @@ int main_giraffe(int argc, char** argv) { break; case 'x': - provided_indexes.emplace_back("XG", optarg); + provided_indexes.emplace("XG", optarg); // If we have an XG we probably want to use its name as the base name. // But see -g. @@ -1152,7 +1154,7 @@ int main_giraffe(int argc, char** argv) { break; case 'g': - provided_indexes.emplace_back("GBWTGraph", optarg); + provided_indexes.emplace("GBWTGraph", optarg); // But if we have a GBWTGraph we probably want to use *its* name as the base name. // Whichever is specified last will win, unless we also have a FASTA input name. 
@@ -1161,22 +1163,22 @@ int main_giraffe(int argc, char** argv) { break; case 'H': - provided_indexes.emplace_back("Giraffe GBWT", optarg); + provided_indexes.emplace("Giraffe GBWT", optarg); break; case 'm': - provided_indexes.emplace_back("Long Read Minimizers", optarg); - provided_indexes.emplace_back("Short Read Minimizers", optarg); - provided_indexes.emplace_back("Long Read PathMinimizers", optarg); + provided_indexes.emplace("Long Read Minimizers", optarg); + provided_indexes.emplace("Short Read Minimizers", optarg); + provided_indexes.emplace("Long Read PathMinimizers", optarg); break; case 'z': - provided_indexes.emplace_back("Long Read Zipcodes", optarg); - provided_indexes.emplace_back("Short Read Zipcodes", optarg); - provided_indexes.emplace_back("Long Read PathZipcodes", optarg); + provided_indexes.emplace("Long Read Zipcodes", optarg); + provided_indexes.emplace("Short Read Zipcodes", optarg); + provided_indexes.emplace("Long Read PathZipcodes", optarg); break; case 'd': - provided_indexes.emplace_back("Giraffe Distance Index", optarg); + provided_indexes.emplace("Giraffe Distance Index", optarg); break; case 'p': @@ -1187,10 +1189,10 @@ int main_giraffe(int argc, char** argv) { haplotype_sampling_flag = true; break; case OPT_HAPLOTYPE_NAME: - haplotype_name = optarg; + provided_indexes.emplace("Haplotype Index", optarg); break; case OPT_KFF_NAME: - kff_name = optarg; + provided_indexes.emplace("KFF Kmer Counts", optarg); break; case OPT_INDEX_BASENAME: index_basename_override = optarg; @@ -1366,7 +1368,7 @@ int main_giraffe(int argc, char** argv) { << " is not named like a FASTA" << std::endl; } - provided_indexes.emplace_back("Reference FASTA", fasta_filename); + provided_indexes.emplace("Reference FASTA", fasta_filename); // Everything else should be named like the FASTA by default index_basename = fasta_parts.first; @@ -1389,7 +1391,7 @@ int main_giraffe(int argc, char** argv) { string file_type = IndexRegistry::vcf_is_phased(vcf_filename) ? 
"VCF w/ Phasing" : "VCF"; // Feed it to the index registry to maybe use - provided_indexes.emplace_back(file_type, vcf_filename); + provided_indexes.emplace(file_type, vcf_filename); } } @@ -1402,14 +1404,8 @@ int main_giraffe(int argc, char** argv) { // Now all the arguments are parsed, so see if they make sense - for (const auto& filename : provided_indexes) { - require_exists(logger, filename.second); - } - if (!haplotype_name.empty()) { - require_exists(logger, haplotype_name); - } - if (!kff_name.empty()) { - require_exists(logger, kff_name); + for (const auto& type_and_filename : provided_indexes) { + require_exists(logger, type_and_filename.second); } // Decide if we are outputting to an htslib format @@ -1465,9 +1461,15 @@ int main_giraffe(int argc, char** argv) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." << endl; } + + // Use all the provided indexes + for (auto& index : provided_indexes) { + registry.provide(index.first, index.second); + } + // Haplotype sampling is active when explicitly requested (--haplotype-sampling) // or when either --haplotype-name or --kff-name is given. - bool haplotype_sampling = haplotype_sampling_flag || !haplotype_name.empty() || !kff_name.empty(); + bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || provided_indexes.count("KFF Kmer Counts"); // Save the GBZ-derived basename before any override, so we can find // dist and other indexes co-located with the source GBZ during sampling. string original_basename = index_basename; @@ -1488,7 +1490,8 @@ int main_giraffe(int argc, char** argv) { << " that sample name is " << sample << std::endl; } } else if (!fastq_filename_1.empty()) { - // Strip .gz then use file_base_name to strip .fq/.fastq + // Strip .gz then use file_base_name to strip .fq/.fastq, since + // strip_suffixes stops at the first one that's not there. 
sample = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); if (show_progress) { logger.info() << "Guessing from " << fastq_filename_1 @@ -1506,59 +1509,50 @@ int main_giraffe(int argc, char** argv) { // Pass parameters to the recipes through IndexingParameters. IndexingParameters::haplotype_reference_samples = reference_samples; - // Provide inputs to the registry. The full-pangenome GBZ is typed - // as "Unsampled Giraffe GBZ" so the sampling recipe can distinguish - // it from the personalized "Giraffe GBZ" it will produce. - // Likewise, a provided "Giraffe Distance Index" is re-typed as - // "Unsampled Giraffe GBZ Distance Index" since it belongs to the - // unsampled graph. - for (auto& index : provided_indexes) { - if (index.first == "Giraffe GBZ") { - registry.provide("Unsampled Giraffe GBZ", index.second); - } else if (index.first == "Giraffe Distance Index") { - registry.provide("Unsampled Giraffe GBZ Distance Index", index.second); - } else { - registry.provide(index.first, index.second); - } - } - - // Provide haplotype index and kff only if explicitly given. - if (!haplotype_name.empty()) { - registry.provide("Haplotype Index", haplotype_name); - } - if (!kff_name.empty()) { - registry.provide("KFF Kmer Counts", kff_name); - } - - // Provide FASTQs as a registry node so the KFF kmer counting recipe - // can discover them without going through IndexingParameters. if (!fastq_filename_1.empty()) { + // If FASTQs are available, send those and we will do haplotype sampling. 
vector fastqs = {fastq_filename_1}; if (!fastq_filename_2.empty()) { fastqs.push_back(fastq_filename_2); } - registry.provide("Sample FASTQ", fastqs); + registry.provide("FASTQ", fastqs); + } else if (!provided_indexes.count("KFF Kmer Counts")) { + // TODO: Eventually learn to kmer count from GAM + logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; } - - // Auto-provide the unsampled distance index from the original GBZ - // basename if a dist file exists there and we don't already have one. - if (!registry.available("Unsampled Giraffe GBZ Distance Index")) { - // Prefer a top-level-chain-only .tcdist, then fall back to full .dist. + // Otherwise, we know we're sending kmer counts and will still do haplotype sampling + + if (!registry.available("Top Level Chain Distance Index")) { + // Look for base GBZ distance index string tcdist_candidate = original_basename + ".tcdist"; - string dist_candidate = original_basename + ".dist"; + string dist_candidate = original_basename + ".dist"; if (file_exists(tcdist_candidate)) { - registry.provide("Unsampled Giraffe GBZ Top Level Chain Distance Index", tcdist_candidate); + registry.provide("Top Level Chain Distance Index", tcdist_candidate); } else if (file_exists(dist_candidate)) { - registry.provide("Unsampled Giraffe GBZ Distance Index", dist_candidate); + // Provide it as a top level chain index even though it has more stuff + registry.provide("Top Level Chain Distance Index", dist_candidate); + } + } + + if (!registry.available("Haplotype Index")) { + // Look for hapl index + string hapl_candidate = original_basename + ".hapl"; + if (file_exists(hapl_candidate)) { + registry.provide("Haplotype Index", hapl_candidate); } } - } else { - // Otherwise use the provided indexes directly. 
- for (auto& index : provided_indexes) { - registry.provide(index.first, index.second); + if (!registry.available("r Index")) { + // Look for r index + string r_candidate = original_basename + ".ri"; + if (file_exists(r_candidate)) { + registry.provide("r Index", r_candidate); + } } + } + + // Use the basename we've determined for generated indexes registry.set_prefix(index_basename); // The IndexRegistry doesn't try to infer index files based on the @@ -1595,18 +1589,13 @@ int main_giraffe(int argc, char** argv) { for (auto& extension : index_and_extensions->second) { // For each extension in priority order string inferred_filename = registry.get_prefix() + "." + extension; - if (ifstream(inferred_filename).is_open()) { + if (file_exists(inferred_filename)) { // A file with the appropriate name exists and we can read it. - if (haplotype_sampling) { - // If we did haplotype sampling, we are going to overwrite existing indexes. - logger.warn() << inferred_filename << " exists and will be overwritten" << endl; - } else { - // Report it because this may not be desired behavior. - logger.info() << "Guessing that " << inferred_filename - << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename); - found = true; - } + // Report it because this may not be desired behavior. + logger.info() << "Guessing that " << inferred_filename + << " is " << index_and_extensions->first << endl; + registry.provide(index_and_extensions->first, inferred_filename); + found = true; // Skip other extension options for the index break; } @@ -1620,6 +1609,16 @@ int main_giraffe(int argc, char** argv) { } } + if (haplotype_sampling && !registry.available("Giraffe GBZ") && !registry.available("GBZ")) { + // When doing haplotype sampling, we need to name our sampled GBZ just + // .gbz on top of the basename we put the sample name in, because the + // tests require it. 
This means we can't build a base .gbz and a + // sampled .gbz in the same indexing run, or they will fight over the + // name. So make sure we don't try. + logger.error() << "Cannot do haplotype sampling without a GBZ available" << std::endl; + } + // TODO: Actually force the index registry to use the name we want for the sampled GBZ even though it's not the one it would generate. + //If we're making new zipcodes, we should rebuild the minimizers too if (!(indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index 89e0d3e6b6..406e9e9b82 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -1,3 +1,4 @@ + # Haplotype sampling example This directory contains a small test case for haplotype sampling using the `vg haplotypes` subcommand. @@ -17,6 +18,7 @@ File `HG003.kff` contains 29-mer counts for the reads computed using KMC. 
First we need to build the indexes used by Giraffe: + ``` vg autoindex --workflow giraffe --prefix micb-kir3dl1 -g micb-kir3dl1.gfa ``` @@ -25,6 +27,7 @@ This converts the graph into GBZ format (`micb-kir3dl1.giraffe.gbz`) and builds Then we can map the reads, specifying the GBZ file, the fastq file, and the output file: + ``` vg giraffe -Z micb-kir3dl1.giraffe.gbz -f HG003.fq.gz -p > to-original.gam ``` @@ -35,6 +38,7 @@ However, because the graph contains two sets of reference paths (GRCh38 and CHM1 We can use the `vg stats` subcommand to compute statistics on the aligned reads: + ``` vg stats -a to-original.gam micb-kir3dl1.giraffe.gbz ``` @@ -46,6 +50,7 @@ vg stats -a to-original.gam micb-kir3dl1.giraffe.gbz In order to build the haplotype information used for haplotype sampling, we need a GBZ graph, a distance index, and an r-index. We build them using the following commands: + ``` vg gbwt -G micb-kir3dl1.gfa --max-node 32 --gbz-format -g micb-kir3dl1.gbz -r micb-kir3dl1.ri vg index -j micb-kir3dl1.dist micb-kir3dl1.gbz @@ -56,6 +61,7 @@ This makes the resulting GBZ graph (and hence the distance index) indentical to Now we can build the haplotype information file `micb-kir3dl1.hapl`: + ``` vg haplotypes -v 2 --subchain-length 300 -H micb-kir3dl1.hapl micb-kir3dl1.gbz ``` @@ -67,6 +73,7 @@ The latter makes the example more interesting than the default 10000 bp. We can now sample the haplotypes using the haplotype information, the k-mer counts, and the original graph: + ``` vg haplotypes -v 2 -i micb-kir3dl1.hapl -k HG003.kff \ --include-reference --diploid-sampling \ @@ -79,9 +86,10 @@ Now we have the sampled graph `HG003.gbz`. We could build the indexes for Giraffe manually. 
But because the graph is a temporary object intended only for mapping a specific set of reads, we can let Giraffe handle index construction: + ``` # Set the temporary directory as you wish -export TMPDIR=/scratch/tmp +# export TMPDIR=/scratch/tmp vg giraffe -Z HG003.gbz -f HG003.fq.gz -p --index-basename ${TMPDIR}/HG003 > to-sampled.gam ``` @@ -90,40 +98,44 @@ Here we specify that the indexes go to the temporary directory with base name `H We can again use `vg stats` to compute statistics: + ``` vg stats -a to-sampled.gam HG003.gbz ``` ### Giraffe integration -Instead of sampling the haplotypes manually, we can let Giraffe handle everything: +Instead of sampling the haplotypes manually, we can let Giraffe handle the sampling: + ``` # Set the temporary directory as you wish -export TMPDIR=/scratch/tmp +# export TMPDIR=/scratch/tmp vg giraffe -Z micb-kir3dl1.gbz --haplotype-name micb-kir3dl1.hapl --kff-name HG003.kff \ - -f HG003.fq.gz -p --index-basename ${TMPDIR}/integration > giraffe-integration.gam + -f HG003.fq.gz -p --index-basename integration > giraffe-integration.gam ``` This uses the best practices for haplotype sampling, which currently means including the reference paths and using diploid sampling. We put the sampled graph and the automatically built indexes into the temporary directory. -The naming scheme (e.g. `${TMPDIR}/integration.HG003.gbz`) consists of three parts: +The naming scheme (e.g. `integration.HG003.gbz`) consists of three parts: -* Base name `${TMPDIR}/integration` specified with option `--index-basename`. +* Base name `integration` specified with option `--index-basename`. * Sample name `HG003` guessed from the name of the KFF file or specified with option `--sample`. * Extension (e.g. `gbz`) that depends on the file. 
The statistics should be similar to those in the previous section: + ``` -vg stats -a giraffe-integration.gam ${TMPDIR}/integration.HG003.gbz +vg stats -a giraffe-integration.gam integration.HG003.gbz ``` ## Mapping reads to a linear reference Given a GBZ file, we can extract the paths corresponding to a given sample in fasta format: + ``` vg paths -x micb-kir3dl1.gbz --extract-fasta --sample GRCh38 > reference.fa ``` diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index eb82e1da08..421d61f89e 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -73,7 +73,7 @@ is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null is $? 0 "Giraffe integration with a guessed output name" -cmp diploid.gbz full.HG003.giraffe.gbz +cmp diploid.gbz full.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name @@ -81,7 +81,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename sampled -N 003HG \ -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null is $? 0 "Giraffe integration with a specified output name" -cmp full.HG003.giraffe.gbz sampled.003HG.giraffe.gbz +cmp full.HG003.gbz sampled.003HG.gbz is $? 0 "the sampled graphs are identical" # Giraffe integration, specified reference sample @@ -89,7 +89,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null is $? 0 "Giraffe integration with a specified reference sample" -cmp diploid3.gbz GRCh38.HG003.giraffe.gbz +cmp diploid3.gbz GRCh38.HG003.gbz is $? 
0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, haplotype index built automatically from the GBZ. @@ -99,7 +99,7 @@ vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ --index-basename auto_hapl -N HG003 \ -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam 2> /dev/null is $? 0 "Giraffe builds haplotype index automatically" -is $(vg gbwt -H -Z auto_hapl.HG003.giraffe.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" +is $(vg gbwt -H -Z auto_hapl.HG003.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" # Giraffe integration, kmer counting done automatically from reads. # Providing --haplotype-name without --kff-name implies haplotype sampling; @@ -108,7 +108,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl \ --index-basename auto_kff -N HG003 \ -f haplotype-sampling/HG003.fq.gz > auto_kff.gam 2> /dev/null is $? 0 "Giraffe counts kmers from reads automatically" -is $(vg gbwt -H -Z auto_kff.HG003.giraffe.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" +is $(vg gbwt -H -Z auto_kff.HG003.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" # Giraffe integration, fully automatic: both hapl and kff are built by the # IndexRegistry. Triggered by --haplotype-sampling without either input file. @@ -116,7 +116,7 @@ vg giraffe -Z full.gbz --haplotype-sampling \ --index-basename auto_all -N HG003 \ -f haplotype-sampling/HG003.fq.gz > auto_all.gam 2> /dev/null is $? 
0 "Giraffe does fully automatic haplotype sampling" -is $(vg gbwt -H -Z auto_all.HG003.giraffe.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" +is $(vg gbwt -H -Z auto_all.HG003.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt @@ -133,4 +133,4 @@ rm -f GRCh38.HG003.* HG003_GRCh38.gam rm -f auto_hapl.HG003.* auto_hapl.gam rm -f auto_kff.HG003.* auto_kff.gam rm -f auto_all.HG003.* auto_all.gam -rm -f log.txt \ No newline at end of file +rm -f log.txt From f54a4a6041e90bf6673ac049a854cda80fbe1487 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 9 Mar 2026 18:02:11 -0400 Subject: [PATCH 07/22] Get partway into a scope applying system that is bad --- src/index_registry.cpp | 65 ++++++++++++++++++++++++++----- src/index_registry.hpp | 17 +++++--- src/subcommand/autoindex_main.cpp | 2 +- 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 021567bdec..276433f310 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4598,23 +4598,30 @@ string IndexingPlan::output_filepath(const IndexName& identifier) const { string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const { - string filepath; + std::stringstream filepath; if (registry->keep_intermediates || (!is_intermediate(identifier) && !registry->get_index(identifier)->was_provided_directly())) { // we're saving this file, put it at the output prefix - filepath = registry->output_prefix; + filepath << registry->output_prefix; } else { // we're not saving this file, make it temporary - filepath = registry->get_work_dir() + "/" + sha1sum(identifier); + filepath << registry->get_work_dir() + "/" + sha1sum(identifier); + } + auto found = scopes.find(identifier); + if (found != scopes.end()) 
{ + for (auto& scope : found->second) { + // This index belongs to just a sample or something. + filepath << "." << scope; + } } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes // that start with digits) - filepath += "." + to_string(chunk); + filepath << "." << to_string(chunk); } - filepath += "." + registry->get_index(identifier)->get_suffix(); - return filepath; + filepath << "." << registry->get_index(identifier)->get_suffix(); + return filepath.str(); } const vector& IndexingPlan::get_steps() const { @@ -4842,11 +4849,11 @@ void IndexRegistry::register_index(const IndexName& identifier, const string& su } -void IndexRegistry::provide(const IndexName& identifier, const string& filename) { - provide(identifier, vector(1, filename)); +void IndexRegistry::provide(const IndexName& identifier, const string& filename, const std::string& scope) { + provide(identifier, vector(1, filename), scope); } -void IndexRegistry::provide(const IndexName& identifier, const vector& filenames) { +void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const std::string& scope) { if (IndexingParameters::verbosity >= IndexingParameters::Debug) { info(context) << "Provided: " << identifier << endl; } @@ -5737,11 +5744,49 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } #endif + // Figure out if any of the input files apply a sample scope to descendant files in the plan. + // This holds sets of later recipes and the scopes to give them when we get there. + // TODO: This is complicated and the queueing is not needed; we can just apply the scopes when we find the descendants. 
+    std::vector<std::pair<std::set<IndexGroup>, std::vector<std::string>>> scope_queue;
+    for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) {
+        // For each step in the plan in order
+        
+        for (auto& names_and_scopes : scope_queue) {
+            if (names_and_scopes.first.count(plan_it->first)) {
+                // We have scopes from a previous step to apply to the indexes
+                // generated by this step.
+                //
+                // TODO: We assume a single scope only ever comes from one
+                // input file, and we don't enforce a particular scope order
+                // other than rule order.
+                for (const IndexName& index_name : plan_it->first) {
+                    // Append all the new scopes to any existing scopes on this index.
+                    std::copy(names_and_scopes.second.begin(), names_and_scopes.second.end(), std::back_inserter(scopes[index_name]));
+                }
+            }
+        }
+        
+        for (const IndexName& index_name : plan_it->first) {
+            // For each index that step would generate
+            const IndexFile* index = get_index(index_name);
+            if (!index->was_provided_directly()) {
+                // Only directly provided indexes introduce new scopes
+                continue;
+            }
+            auto& provided_scopes = index->provided_scopes();
+            if (!provided_scopes.empty()) {
+                // We need to attach these scopes to anything based on this index.
+                std::set<RecipeName> dependent_recipes = dependents(index_name);
+            }
+        }
+    }
+    
+    // Now remove the input data from the plan
     plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) {
         return all_finished(recipe_choice.first);
     }) - plan.steps.begin());
-    
+
     // The plan has methods that can come back and modify the registry.
     // We're not going to call any of them, but we have to hand off a non-const
     // pointer to ourselves so the plan can modify us later.
diff --git a/src/index_registry.hpp b/src/index_registry.hpp
index 82ea89a2c1..b1f4afa991 100644
--- a/src/index_registry.hpp
+++ b/src/index_registry.hpp
@@ -185,7 +185,7 @@ class IndexingPlan {
     // TODO: is this where this function wants to live?
/// The memory limit, with a little slosh for prediction inaccuracy int64_t target_memory_usage() const; - /// The mmeory limit with no slosh + /// The memory limit with no slosh int64_t literal_target_memory_usage() const; /// Returns the recipes in the plan that depend on this index, including the one in which @@ -199,6 +199,10 @@ class IndexingPlan { vector steps; /// The indexes to create as outputs. set targets; + /// The scope qualifying each index in the plan (for example, if an index + /// applies only to a particular sample, it will be scoped to the sample + /// name). + map> scopes; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -261,11 +265,13 @@ class IndexRegistry { /// by the generalization must be semantically identical to those of the generalizee void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee); - /// Indicate a serialized file that contains some identified index - void provide(const IndexName& identifier, const string& filename); + /// Indicate a serialized file that contains some identified index, + /// optionally with a scope that propagates to descendant files. + void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); - /// Indicate a list of serialized files that contains some identified index - void provide(const IndexName& identifier, const vector& filenames); + /// Indicate a list of serialized files that contains some identified index, + /// optionally with a scope that propagates to descendant files. + void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); /// Remove a provided index void reset(const IndexName& identifier); @@ -275,6 +281,7 @@ class IndexRegistry { bool available(const IndexName& identifier) const; /// Get the possible filename(s) associated with the given index with the given prefix. + /// TODO: Get this to account for sample-scoped indexes. 
vector get_possible_filenames(const IndexName& identifier) const; /// Get the filename(s) associated with the given index. Aborts if the diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index 31777f0d11..fc16be62fd 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -390,7 +390,7 @@ int main_autoindex(int argc, char** argv) { if (!registry.available(target)) { vector inferred_file_names = registry.get_possible_filenames(target); for (const string& filename : inferred_file_names) { - if (ifstream(filename).is_open()) { + if (file_exists(filename)) { logger.info() << "Guessing that " << filename << " is " << target << endl; registry.provide(target, filename); break; From 6df0ca341e5a2a43435fb05e4eeb92c8af8644b2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 13:25:45 -0400 Subject: [PATCH 08/22] Implement scopes and fix storing new paths for aliases --- src/index_registry.cpp | 165 ++++++++++++++++++++---------- src/index_registry.hpp | 37 ++++++- src/subcommand/giraffe_main.cpp | 94 +++++++++-------- test/haplotype-sampling/README.md | 6 ++ 4 files changed, 202 insertions(+), 100 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 276433f310..8a476ede15 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -67,9 +67,9 @@ #include "algorithms/component.hpp" #include "algorithms/find_translation.hpp" -//#define debug_index_registry +#define debug_index_registry //#define debug_index_registry_setup -//#define debug_index_registry_recipes +#define debug_index_registry_recipes //#define debug_index_registry_path_state namespace std { @@ -4241,13 +4241,17 @@ IndexRegistry VGIndexes::get_vg_index_registry() { Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); - // Build GBWTGraph and save GBZ. + // Build GBZ. 
if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building GBWTGraph." << endl; + info(context) << "Building GBZ." << endl; } gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); string output_name = plan->output_filepath(gbz_output); save_gbz(sampled_graph, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Saved GBZ to " << output_name << endl; + } output_names.push_back(output_name); return all_outputs; @@ -4584,6 +4588,15 @@ bool IndexingPlan::is_intermediate(const IndexName& identifier) const { return !targets.count(identifier); } +const vector IndexingPlan::get_scopes(const IndexName& identifier) const { + auto found = scopes.find(identifier); + if (found == scopes.end()) { + return {}; + } + // TODO: we can't really return an empty vector reference so we are stuck doing a copy here. + return found->second; +} + int64_t IndexingPlan::target_memory_usage() const { return IndexingParameters::max_memory_proportion * registry->get_target_memory_usage(); } @@ -4608,12 +4621,9 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, // we're not saving this file, make it temporary filepath << registry->get_work_dir() + "/" + sha1sum(identifier); } - auto found = scopes.find(identifier); - if (found != scopes.end()) { - for (auto& scope : found->second) { - // This index belongs to just a sample or something. - filepath << "." << scope; - } + for (auto& scope : get_scopes(identifier)) { + // This index belongs to just a sample or something. + filepath << "." << scope; } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes @@ -4621,6 +4631,7 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, filepath << "." << to_string(chunk); } filepath << "." 
<< registry->get_index(identifier)->get_suffix(); + std::cerr << "Considering output file path " << filepath.str() << std::endl; return filepath.str(); } @@ -4657,6 +4668,16 @@ set IndexingPlan::dependents(const IndexName& identifier) const { return dependent_steps; } +void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { + auto index_scopes = scopes[identifier]; + + // TODO: Change the data structure here if we anticipate an appreciable number of scopes. + auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); + if (found == index_scopes.end()) { + index_scopes.push_back(scope); + } +} + IndexRegistry::~IndexRegistry() { if (!work_dir.empty()) { // Clean up our work directory with its temporary indexes. @@ -4736,6 +4757,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { if (!index->was_provided_directly()) { // and assign the new (or first) ones index->assign_constructed(results); + for (auto& scope : plan.get_scopes(*it)) { + // and keep their scopes (in case we ever need them) + index->add_scope(scope); + } } ++it; } @@ -4792,7 +4817,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { auto f = find(aliasors.begin(), aliasors.end(), aliasee); bool can_move = f == aliasors.end() && !get_index(aliasee)->was_provided_directly(); if (!can_move) { - // just remove the "alias" so we don't need to deal with it + // We need to copy. + // Just remove the "alias" for the index itself so that from here + // on we only need to wory about things that *do* need to be + // created. 
std::swap(*f, aliasors.back()); aliasors.pop_back(); } @@ -4801,15 +4829,21 @@ void IndexRegistry::make_indexes(const vector& identifiers) { // copy aliases for any that we need to (start past index 0 if we can move it) for (size_t i = can_move; i < aliasors.size(); ++i) { + // We want the aliasor to point at the new paths + vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { auto copy_filename = plan.output_filepath(aliasors[i], j, aliasee_filenames.size()); copy_file(aliasee_filenames[j], copy_filename); + new_aliasor_files.push_back(copy_filename); } + // Point the aliasor index at the new paths + get_index(aliasors[i])->assign_constructed(new_aliasor_files); } // if we can move the aliasee (i.e. it is intermediate), then make - // one index by moving instead of copying + // one index (the 0th one) by moving instead of copying if (can_move) { + vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { auto move_filename = plan.output_filepath(aliasors[0], j, aliasee_filenames.size()); int code = rename(aliasee_filenames[j].c_str(), move_filename.c_str()); @@ -4817,7 +4851,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { // moving failed (maybe because the files on separate drives?) fall back on copying copy_file(aliasee_filenames[j], move_filename); } + new_aliasor_files.push_back(move_filename); } + // Point the aliasor index at the new paths + get_index(aliasors[0])->assign_constructed(new_aliasor_files); } } @@ -5714,6 +5751,14 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // Now fill in the plan struct that the recipes need to know how to run. IndexingPlan plan; + + // The plan has methods that need to interact with the registry, including + // some we need to use here. + // + // Some of them modify the registry. We're not going to use any of those, + // but we have to hand off a non-const pointer to ourselves so the plan can + // modify us later. 
+ plan.registry = const_cast(this); // Copy over the end products std::copy(end_products.begin(), end_products.end(), std::inserter(plan.targets, plan.targets.begin())); @@ -5744,54 +5789,48 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } #endif - // Figure out if any of the input files apply a sample scope to descendant files in the plan. - // This holds sets of later recipes and the scopes to give them when we get there. - // TODO: This is complicated and the queueing is not needed; we can just apply the scopes when we find the descendants. - std::vector, std::vector> scope_queue; + // Now remove the input data from the plan + plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { + return all_finished(recipe_choice.first); + }) - plan.steps.begin()); + + // Now we're allowed to use dependents() + + // Propagate scopes from input files for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) { // For each step in ther plan in order - for (auto& names_and_scopes : scope_queue) { - if (names_and_scopes.first.count(plan_it->first)) { - // We have scopes from a previous step to apply to the indexes - // generated by this step. - // - // TODO: We assume a single scope only ever comes from one - // input file, and we don't enforce a particular scope order - // other than rule order. - for (const IndexName& index_name : plan_it->first) { - // Append all the new scopes to any existing scopes on this index. 
- std::copy(names_and_scopes.second().begin(), names_and_scopes.second().end(), std::back_inserter(scopes[index_name])); - } - } - } - for (const IndexName& index_name : plan_it->first) { // For each index that step would generate const IndexFile* index = get_index(index_name); - if (!index->was_provided_directly()) { - // Only directly provided indexes introduce new scopes + if (!index->is_finished()) { + // Only finished indexes bring in new scopes continue; } - auto& provided_scopes = index->provided_scopes(); - if (!provided_scopes.empty()) { - // We need to attach these scopes to anything based on this index. - set dependent_recipes = dependents(index_name); + + // Get the scopes that came in with this input + auto& provided_scopes = index->get_scopes(); + + // We need to attach these scopes to anything based on this index. + // TODO: Should we just propagate scopes step by step as we scan the plan instead? + for (const RecipeName& dependent_recipe : plan.dependents(index_name)) { + // For each recipe transitively depending on this input + for (const IndexName& dependent_index_name : dependent_recipe.first) { + // For each index the recipe generates + + for (auto& scope : provided_scopes) { + // Make sure that index is scoped with all scopes on the input. + plan.add_scope(dependent_index_name, scope); + } + + // Note that all outputs will get their scopes from the + // inputs added in the same order (the order that the + // inputs appear in the plan). + } } } } - - // Now remove the input data from the plan - plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { - return all_finished(recipe_choice.first); - }) - plan.steps.begin()); - - // The plan has methods that can come back and modify the registry. - // We're not going to call any of them, but we have to hand off a non-const - // pointer to ourselves so the plan can modify us later. 
- plan.registry = const_cast(this); - return plan; } @@ -5932,10 +5971,6 @@ IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : identi // nothing more to do } -bool IndexFile::is_finished() const { - return !filenames.empty(); -} - const IndexName& IndexFile::get_identifier() const { return identifier; } @@ -5958,6 +5993,11 @@ void IndexFile::provide(const vector& filenames) { } void IndexFile::assign_constructed(const vector& filenames) { + std::cerr << "Set " << get_identifier() << " to be at:"; + for (auto& f : filenames) { + std::cerr << " " << f; + } + std::cerr << std::endl; this->filenames = filenames; provided_directly = false; } @@ -5966,6 +6006,27 @@ bool IndexFile::was_provided_directly() const { return provided_directly; } +void IndexFile::add_scope(const string& scope) { + if (!is_finished()) { + // Scopes for nonexistent files belong to the plan, not us. + throw std::logic_error("Cannot assign " + scope + " scope to unfinished " + get_identifier() + " index"); + } + + // TODO: Change the data structure here if we anticipate an appreciable number of scopes. + auto found = std::find(scopes.begin(), scopes.end(), scope); + if (found == scopes.end()) { + scopes.push_back(scope); + } +} + +const vector& IndexFile::get_scopes() const { + return scopes; +} + +bool IndexFile::is_finished() const { + return !filenames.empty(); +} + void IndexFile::reset() { filenames.clear(); provided_directly = false; diff --git a/src/index_registry.hpp b/src/index_registry.hpp index b1f4afa991..6726a419d0 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -181,6 +181,11 @@ class IndexingPlan { /// Returns true if the given index is to be intermediate under the given /// plan, and false if it is to be preserved. bool is_intermediate(const IndexName& identifier) const; + + /// Get the deduplicated scopes, in addition order, that the given index + /// will be qualified with when generated. 
Only works for indexes that will + /// be generated by the plan, not for inputs. + const vector get_scopes(const IndexName& identifier) const; // TODO: is this where this function wants to live? /// The memory limit, with a little slosh for prediction inaccuracy @@ -189,19 +194,26 @@ class IndexingPlan { int64_t literal_target_memory_usage() const; /// Returns the recipes in the plan that depend on this index, including the one in which - /// it was created (if any) + /// it was created (if any). + /// + /// Requires that all indexes in the plan be generated by a recipe; inputs + /// must not be in the plan as steps in order to call this. set dependents(const IndexName& identifier) const; protected: + /// Add a scope to the scopes the given index will be qualified with when + /// generated, if not present already. + void add_scope(const IndexName& identifier, const string& scope); + /// The steps to be invoked in the plan. May be empty before the plan is /// actually planned. vector steps; /// The indexes to create as outputs. set targets; - /// The scope qualifying each index in the plan (for example, if an index + /// The scopes qualifying each index in the plan (for example, if an index /// applies only to a particular sample, it will be scoped to the sample - /// name). + /// name). Stored deduplicated and in the order added. map> scopes; /// The registry that the plan is using. @@ -399,7 +411,19 @@ class IndexFile { /// Assign constructed filenames to this index void assign_constructed(const vector& filenames); - + + /// Append a new scope to qualify the files in the index, if the scope is + /// not already on the index. + /// + /// Should only be used on files that are actually filled in already; + /// scopes for files that don't exist yet are the responsibility of the + /// IndexingPlan. + void add_scope(const string& scope); + + /// Get all scopes qualifying the index (such as a sample name). + /// Scopes are deduplicated and produced in the order added. 
+ const vector& get_scopes() const; + /// Returns true if the index has already been built or provided bool is_finished() const; @@ -419,6 +443,11 @@ class IndexFile { // the filename(s) associated with the index vector filenames; + + // The dedplicated ordered scopes qualifying the index. + // TODO: Use an ordered set for good algorithmics for deduplicating scopes. + // Right now we have exactly one kind of scope so this doesn't matter. + vector scopes; // keep track of whether the index was provided directly bool provided_directly = false; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 079947185d..c83d4930be 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -60,6 +60,8 @@ static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu } #endif +#define debug + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -768,7 +770,7 @@ int main_giraffe(int argc, char** argv) { // indexed for lookup. We only support one file per index (FASTQs get // handled separately). std::unordered_map provided_indexes; - string index_basename, index_basename_override; + string index_basename_guess, index_basename; // For haplotype sampling. string haplotype_name, kff_name; @@ -1139,8 +1141,7 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("GBZ", optarg); // If we have a GBZ we probably want to use its name as the base name. - // But see -g. - index_basename = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); + index_basename_guess = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); break; @@ -1148,17 +1149,15 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("XG", optarg); // If we have an XG we probably want to use its name as the base name. - // But see -g. 
- index_basename = split_ext(optarg).first; + index_basename_guess = split_ext(optarg).first; break; case 'g': provided_indexes.emplace("GBWTGraph", optarg); - // But if we have a GBWTGraph we probably want to use *its* name as the base name. - // Whichever is specified last will win, unless we also have a FASTA input name. - index_basename = split_ext(optarg).first; + // If we have a GBWTGraph we probably want to use its name as the base name. + index_basename_guess = split_ext(optarg).first; break; @@ -1195,7 +1194,7 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("KFF Kmer Counts", optarg); break; case OPT_INDEX_BASENAME: - index_basename_override = optarg; + index_basename = optarg; break; case 'G': @@ -1369,8 +1368,9 @@ int main_giraffe(int argc, char** argv) { } provided_indexes.emplace("Reference FASTA", fasta_filename); - // Everything else should be named like the FASTA by default - index_basename = fasta_parts.first; + // Everything else should be named like the FASTA by default. This is a + // better guess than the other siurces. + index_basename_guess = fasta_parts.first; if (have_input_file(optind, argc, argv)) { // Next one must be VCF, but check. @@ -1395,6 +1395,11 @@ int main_giraffe(int argc, char** argv) { } } + if (index_basename.empty()) { + // Try to pick an index basename now + index_basename = index_basename_guess; + } + // If we don't want rescue, let the user see we don't try it. if (parser->get_option_value("rescue-attempts") == 0 || rescue_algorithm == MinimizerMapper::rescue_none) { // Replace any parsed values @@ -1470,12 +1475,7 @@ int main_giraffe(int argc, char** argv) { // Haplotype sampling is active when explicitly requested (--haplotype-sampling) // or when either --haplotype-name or --kff-name is given. 
bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || provided_indexes.count("KFF Kmer Counts"); - // Save the GBZ-derived basename before any override, so we can find - // dist and other indexes co-located with the source GBZ during sampling. - string original_basename = index_basename; - if (!index_basename_override.empty()) { - index_basename = index_basename_override; - } + if (haplotype_sampling) { // Determine sample name early so we can incorporate it into the @@ -1504,7 +1504,6 @@ int main_giraffe(int argc, char** argv) { if (sample == "giraffe") { logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } - index_basename = index_basename + "." + sample; // Pass parameters to the recipes through IndexingParameters. IndexingParameters::haplotype_reference_samples = reference_samples; @@ -1515,45 +1514,52 @@ int main_giraffe(int argc, char** argv) { if (!fastq_filename_2.empty()) { fastqs.push_back(fastq_filename_2); } - registry.provide("FASTQ", fastqs); + // Provide the FASTQs to the index registry, but mark everything + // derived from them as being scoped to the particular sample. 
+ registry.provide("FASTQ", fastqs, sample); } else if (!provided_indexes.count("KFF Kmer Counts")) { // TODO: Eventually learn to kmer count from GAM logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; } // Otherwise, we know we're sending kmer counts and will still do haplotype sampling - if (!registry.available("Top Level Chain Distance Index")) { - // Look for base GBZ distance index - string tcdist_candidate = original_basename + ".tcdist"; - string dist_candidate = original_basename + ".dist"; - if (file_exists(tcdist_candidate)) { - registry.provide("Top Level Chain Distance Index", tcdist_candidate); - } else if (file_exists(dist_candidate)) { - // Provide it as a top level chain index even though it has more stuff - registry.provide("Top Level Chain Distance Index", dist_candidate); + if (!index_basename.empty()) { + // We have some idea where indexes might be, so guess them. + if (!registry.available("Top Level Chain Distance Index")) { + // Look for base GBZ distance index + string tcdist_candidate = index_basename + ".tcdist"; + string dist_candidate = index_basename + ".dist"; + if (file_exists(tcdist_candidate)) { + registry.provide("Top Level Chain Distance Index", tcdist_candidate); + } else if (file_exists(dist_candidate)) { + // Provide it as a top level chain index even though it has more stuff + registry.provide("Top Level Chain Distance Index", dist_candidate); + } } - } - - if (!registry.available("Haplotype Index")) { - // Look for hapl index - string hapl_candidate = original_basename + ".hapl"; - if (file_exists(hapl_candidate)) { - registry.provide("Haplotype Index", hapl_candidate); + + if (!registry.available("Haplotype Index")) { + // Look for hapl index + string hapl_candidate = index_basename + ".hapl"; + if (file_exists(hapl_candidate)) { + registry.provide("Haplotype Index", hapl_candidate); + } } - } - if (!registry.available("r Index")) { - // Look for r 
index - string r_candidate = original_basename + ".ri"; - if (file_exists(r_candidate)) { - registry.provide("r Index", r_candidate); + if (!registry.available("r Index")) { + // Look for r index + string r_candidate = index_basename + ".ri"; + if (file_exists(r_candidate)) { + registry.provide("r Index", r_candidate); + } } } } - - // Use the basename we've determined for generated indexes - registry.set_prefix(index_basename); + + if (!index_basename.empty()) { + // Use the basename we've determined for generated indexes + registry.set_prefix(index_basename); + } // The IndexRegistry doesn't try to infer index files based on the // basename, so do that here. We can have multiple extension options that diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index 406e9e9b82..e188cf8a9c 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -147,3 +147,9 @@ The resulting fasta file can be used as the reference with any linear aligner. Documentation for the `vg haplotypes` subcommand can be found in the [vg wiki](https://github.com/vgteam/vg/wiki/Haplotype-Sampling). You may also want to read about the [best practices](https://github.com/vgteam/vg/wiki/Giraffe-best-practices) for mapping reads with Giraffe. 
+ +Remember to clean up after trying all the examples: + +``` +rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam +``` From 79a5672f93e925e427787cc859c2fe6ac4819763 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 13:33:48 -0400 Subject: [PATCH 09/22] Simplify alias fulfillment --- src/index_registry.cpp | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 8a476ede15..c574640f4c 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4826,35 +4826,27 @@ void IndexRegistry::make_indexes(const vector& identifiers) { } const auto& aliasee_filenames = get_index(aliasee)->get_filenames(); - - // copy aliases for any that we need to (start past index 0 if we can move it) - for (size_t i = can_move; i < aliasors.size(); ++i) { - // We want the aliasor to point at the new paths - vector new_aliasor_files; - for (size_t j = 0; j < aliasee_filenames.size(); ++j) { - - auto copy_filename = plan.output_filepath(aliasors[i], j, aliasee_filenames.size()); - copy_file(aliasee_filenames[j], copy_filename); - new_aliasor_files.push_back(copy_filename); - } - // Point the aliasor index at the new paths - get_index(aliasors[i])->assign_constructed(new_aliasor_files); - } - // if we can move the aliasee (i.e. it is intermediate), then make - // one index (the 0th one) by moving instead of copying - if (can_move) { + + for (auto it = aliasors.rbegin(); it != aliasors.rend(); ++it) { + // Fulfill the aliasor from the aliasee (in reverse order, so the + // 0th aliasor gets to have the moved files). 
vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { - auto move_filename = plan.output_filepath(aliasors[0], j, aliasee_filenames.size()); - int code = rename(aliasee_filenames[j].c_str(), move_filename.c_str()); - if (code) { - // moving failed (maybe because the files on separate drives?) fall back on copying - copy_file(aliasee_filenames[j], move_filename); + auto dest_filename = plan.output_filepath(*it, j, aliasee_filenames.size()); + bool moved = false; + if (can_move && (it + 1 == aliasors.rend())) { + // We can move and this is the last aliasor we will do, so try moving + moved = rename(aliasee_filenames[j].c_str(), dest_filename.c_str()) == 0; } - new_aliasor_files.push_back(move_filename); + if (!moved) { + // moving failed or is not allowed, fall back on copying + copy_file(aliasee_filenames[j], dest_filename); + } + + new_aliasor_files.push_back(dest_filename); } // Point the aliasor index at the new paths - get_index(aliasors[0])->assign_constructed(new_aliasor_files); + get_index(*it)->assign_constructed(new_aliasor_files); } } From ebd38aeacdb52dbd91c4eb2e11c0f401ba72c4fc Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 14:31:31 -0400 Subject: [PATCH 10/22] Make plan really understand provided indexes and actually attach scopes to inputs --- src/index_registry.cpp | 63 +++++++++++++++++++++++++-------- src/index_registry.hpp | 18 ++++++++-- src/subcommand/giraffe_main.cpp | 60 +++++++++++++++++++------------ 3 files changed, 102 insertions(+), 39 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index c574640f4c..b34c093a6a 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4647,6 +4647,10 @@ set IndexingPlan::dependents(const IndexName& identifier) const { IndexGroup successor_indexes{identifier}; for (const auto& step : steps) { + if (!registry->has_recipe(step)) { + // This step is just a provided input and not a real recipe step. 
+ continue; + } // TODO: should this behavior change if some of the inputs were provided directly? @@ -4669,12 +4673,14 @@ set IndexingPlan::dependents(const IndexName& identifier) const { } void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { - auto index_scopes = scopes[identifier]; + // Get a mutable reference to the scopes we need to add to. + auto& index_scopes = scopes[identifier]; // TODO: Change the data structure here if we anticipate an appreciable number of scopes. auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); if (found == index_scopes.end()) { index_scopes.push_back(scope); + std::cerr << "Plan scope " << scope << " for " << identifier << std::endl; } } @@ -4894,7 +4900,9 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f require_exists(context, filename); } } - get_index(identifier)->provide(filenames); + auto index = get_index(identifier); + index->provide(filenames); + index->add_scope(scope); } void IndexRegistry::reset(const IndexName& identifier) { @@ -5503,6 +5511,8 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { IndexGroup product_group{product}; // records of (identifier, requesters, ordinal index of recipe selected) + // A max-value sentinel index means no recipe is used and the index is + // provided. 
vector, size_t>> plan_path; // map dependency priority to requesters @@ -5643,6 +5653,7 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // get the latest file in the dependency order that we have left to build auto it = queue.begin(); + // Imagine starting with the first recipe for it plan_path.emplace_back(it->first, it->second, 0); #ifdef debug_index_registry @@ -5666,6 +5677,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "index has been provided as input" << endl; #endif + + // Don't point to a recipe for it that might exist; point to a + // nonexistent sentinel recipe. + get<2>(plan_path.back()) = std::numeric_limits::max(); continue; } else if (recipe_registry.count(index_group)) { @@ -5728,7 +5743,13 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "final plan path for index " << product << ":" << endl; for (auto path_elem : plan_path) { - cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]) << ", recipe " << get<2>(path_elem) << ", from:" << endl; + cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]); + if (get<2>(path_elem) == std::numeric_limits::max()) { + cerr << ", provided"; + } else { + cerr << ", recipe " << get<2>(path_elem); + } + cerr << ", from:" << endl; for (auto d : get<1>(path_elem)) { cerr << "\t\t" << to_string(identifier_order[d]) << endl; } @@ -5764,7 +5785,7 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "plan before applying generalizations:" << endl; for (auto plan_elem : plan.steps) { - cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + cerr << "\t" << to_string(plan_elem.first) << " " << (plan_elem.second == std::numeric_limits::max() ? 
"Provided" : std::to_string(plan_elem.second)) << endl; } #endif @@ -5777,17 +5798,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "full plan including provided files:" << endl; for (auto plan_elem : plan.steps) { - cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + cerr << "\t" << to_string(plan_elem.first) << " " << (plan_elem.second == std::numeric_limits::max() ? "Provided" : std::to_string(plan_elem.second)) << endl; } #endif - // Now remove the input data from the plan - plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { - return all_finished(recipe_choice.first); - }) - plan.steps.begin()); - - // Now we're allowed to use dependents() - // Propagate scopes from input files for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) { // For each step in ther plan in order @@ -5797,15 +5811,18 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { const IndexFile* index = get_index(index_name); if (!index->is_finished()) { // Only finished indexes bring in new scopes + std::cerr << "Index " << index_name << " isn't finished and can't bring in scopes" << std::endl; continue; } // Get the scopes that came in with this input auto& provided_scopes = index->get_scopes(); - + // We need to attach these scopes to anything based on this index. // TODO: Should we just propagate scopes step by step as we scan the plan instead? 
-            for (const RecipeName& dependent_recipe : plan.dependents(index_name)) { +            auto dependents = plan.dependents(index_name); +            std::cerr << "Index " << index_name << " adds " << provided_scopes.size() << " scopes to " << dependents.size() << " dependents" << std::endl; +            for (const RecipeName& dependent_recipe : dependents) { // For each recipe transitively depending on this input for (const IndexName& dependent_index_name : dependent_recipe.first) { // For each index the recipe generates @@ -5823,9 +5840,25 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } } + // Now remove the input data from the plan + plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { + return all_finished(recipe_choice.first); + }) - plan.steps.begin()); + return plan; } +bool IndexRegistry::has_recipe(const RecipeName& recipe_name) const { + auto found = recipe_registry.find(recipe_name.first); + if (found == recipe_registry.end()) { + // No recipes at all for this + return false; + } + // Determine if this recipe index is in range. + // Sentinel max-value names are always out of range. 
+ return recipe_name.second < found->second.size(); +} + const IndexRecipe& IndexRegistry::get_recipe(const RecipeName& recipe_name) const { const auto& recipes = recipe_registry.at(recipe_name.first); assert(recipe_name.second < recipes.size()); @@ -6008,6 +6041,7 @@ void IndexFile::add_scope(const string& scope) { auto found = std::find(scopes.begin(), scopes.end(), scope); if (found == scopes.end()) { scopes.push_back(scope); + std::cerr << "Added scope " << scope << " to " << get_identifier() << std::endl; } } @@ -6022,6 +6056,7 @@ bool IndexFile::is_finished() const { void IndexFile::reset() { filenames.clear(); provided_directly = false; + scopes.clear(); } IndexRecipe::IndexRecipe(const vector& inputs, diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 6726a419d0..29bdafd481 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -36,6 +36,9 @@ using IndexGroup = set; /** * Names a recipe in the collection of registered recipes. + * + * A max-value number means the indexes are provided and not generated by any + * recipe. */ using RecipeName = pair; @@ -195,9 +198,6 @@ class IndexingPlan { /// Returns the recipes in the plan that depend on this index, including the one in which /// it was created (if any). - /// - /// Requires that all indexes in the plan be generated by a recipe; inputs - /// must not be in the plan as steps in order to call this. set dependents(const IndexName& identifier) const; protected: @@ -279,10 +279,14 @@ class IndexRegistry { /// Indicate a serialized file that contains some identified index, /// optionally with a scope that propagates to descendant files. + /// + /// The empty string represents no scope. void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); /// Indicate a list of serialized files that contains some identified index, /// optionally with a scope that propagates to descendant files. + /// + /// The empty string represents no scope. 
void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); /// Remove a provided index @@ -342,6 +346,14 @@ class IndexRegistry { /// generate a plan to create the indexes IndexingPlan make_plan(const IndexGroup& end_products) const; + /// Check if a recipe identifier corresponds to a recipe. + /// + /// Recipe identifiers not corresponding to actual recipes are used during + /// planning to represent provided inputs. + /// + /// TODO: Refactor that with some kind of tagged union or optional. + bool has_recipe(const RecipeName& recipe_name) const; + /// use a recipe identifier to get the recipe const IndexRecipe& get_recipe(const RecipeName& recipe_name) const; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c83d4930be..e16efe0c76 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -773,7 +773,7 @@ int main_giraffe(int argc, char** argv) { string index_basename_guess, index_basename; // For haplotype sampling. - string haplotype_name, kff_name; + string kff_filename; bool haplotype_sampling_flag = false; std::unordered_set reference_samples; @@ -1191,7 +1191,9 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("Haplotype Index", optarg); break; case OPT_KFF_NAME: - provided_indexes.emplace("KFF Kmer Counts", optarg); + // We need to provide the KFF with sample scope if we provide + // it, but we're not sure of the sample name yet. + kff_filename = optarg; break; case OPT_INDEX_BASENAME: index_basename = optarg; break; @@ -1474,40 +1476,50 @@ int main_giraffe(int argc, char** argv) { // Haplotype sampling is active when explicitly requested (--haplotype-sampling) // or when either --haplotype-name or --kff-name is given. - bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || provided_indexes.count("KFF Kmer Counts"); + // Otherwise we'd always sample when we had reads available. 
+ bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); + string sample_scope; if (haplotype_sampling) { - // Determine sample name early so we can incorporate it into the + // Determine sample scope so we can incorporate it into the // index basename; downstream indexes (dist, min) will then be - // co-located with the sampled GBZ. - string sample = sample_name; - if (sample.empty()) { - if (!kff_name.empty()) { - sample = file_base_name(kff_name); + // scoped to the sample. + sample_scope = sample_name; + if (sample_scope.empty()) { + if (!kff_filename.empty()) { + sample_scope = file_base_name(kff_filename); if (show_progress) { - logger.info() << "Guessing from " << kff_name - << " that sample name is " << sample << std::endl; + logger.info() << "Guessing from " << kff_filename + << " that sample name is " << sample_scope << std::endl; } } else if (!fastq_filename_1.empty()) { // Strip .gz then use file_base_name to strip .fq/.fastq, since // strip_suffixes stops at the first one that's not there. - sample = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); + sample_scope = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); if (show_progress) { logger.info() << "Guessing from " << fastq_filename_1 - << " that sample name is " << sample << std::endl; + << " that sample name is " << sample_scope << std::endl; } } else { - sample = "sample"; + sample_scope = "sample"; } } - if (sample == "giraffe") { + if (sample_scope == "giraffe") { logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } // Pass parameters to the recipes through IndexingParameters. 
IndexingParameters::haplotype_reference_samples = reference_samples; + + if (kff_filename.empty() && fastq_filename_1.empty()) { + // TODO: Eventually learn to kmer count from GAM + logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; + } + if (!kff_filename.empty()) { + registry.provide("KFF Kmer Counts", kff_filename, sample_scope); + } if (!fastq_filename_1.empty()) { // If FASTQs are available, send those and we will do haplotype sampling. vector fastqs = {fastq_filename_1}; @@ -1516,12 +1528,9 @@ int main_giraffe(int argc, char** argv) { } // Provide the FASTQs to the index registry, but mark everything // derived from them as being scoped to the particular sample. - registry.provide("FASTQ", fastqs, sample); - } else if (!provided_indexes.count("KFF Kmer Counts")) { - // TODO: Eventually learn to kmer count from GAM - logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; + registry.provide("FASTQ", fastqs, sample_scope); } - // Otherwise, we know we're sending kmer counts and will still do haplotype sampling + if (!index_basename.empty()) { // We have some idea where indexes might be, so guess them. @@ -1594,13 +1603,20 @@ int main_giraffe(int argc, char** argv) { bool found = false; for (auto& extension : index_and_extensions->second) { // For each extension in priority order - string inferred_filename = registry.get_prefix() + "." + extension; + string inferred_filename = registry.get_prefix(); + if (haplotype_sampling) { + // All these are sample-scoped if we're haplotype sampling. + // TODO: Let index registry generate the filenames, which it knows how to do. + inferred_filename += "." + sample_scope; + } + + inferred_filename += "." + extension; if (file_exists(inferred_filename)) { // A file with the appropriate name exists and we can read it. // Report it because this may not be desired behavior. 
logger.info() << "Guessing that " << inferred_filename << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename); + registry.provide(index_and_extensions->first, inferred_filename, haplotype_sampling ? sample_scope : ""); found = true; // Skip other extension options for the index break; From 955030861484f872c271394312199fcecf644844 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 14:37:18 -0400 Subject: [PATCH 11/22] Don't add empty scopes and quiet debugging --- src/index_registry.cpp | 17 +++++++++++------ src/subcommand/giraffe_main.cpp | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index b34c093a6a..5b21a8d6ff 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -67,9 +67,9 @@ #include "algorithms/component.hpp" #include "algorithms/find_translation.hpp" -#define debug_index_registry +//#define debug_index_registry //#define debug_index_registry_setup -#define debug_index_registry_recipes +//#define debug_index_registry_recipes //#define debug_index_registry_path_state namespace std { @@ -4631,7 +4631,6 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, filepath << "." << to_string(chunk); } filepath << "." 
<< registry->get_index(identifier)->get_suffix(); - std::cerr << "Considering output file path " << filepath.str() << std::endl; return filepath.str(); } @@ -4680,7 +4679,6 @@ void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); if (found == index_scopes.end()) { index_scopes.push_back(scope); - std::cerr << "Plan scope " << scope << " for " << identifier << std::endl; } } @@ -4902,7 +4900,9 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f } auto index = get_index(identifier); index->provide(filenames); - index->add_scope(scope); + if (!scope.empty()) { + index->add_scope(scope); + } } void IndexRegistry::reset(const IndexName& identifier) { @@ -5811,7 +5811,9 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { const IndexFile* index = get_index(index_name); if (!index->is_finished()) { // Only finished indexes bring in new scopes +#ifdef debug_index_registry std::cerr << "Index " << index_name << " isn't finished and can't bring in scopes" << std::endl; +#endif continue; } @@ -5821,7 +5823,9 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // We need to attach these scopes to anything based on this index. // TODO: Should we just propagate scopes step by step as we scan the plan instead? 
auto dependents = plan.dependents(index_name); +#ifdef debug_index_registry std::cerr << "Index " << index_name << " adds " << provided_scopes.size() << " scopes to " << dependents.size() << " dependents" << std::endl; +#endif for (const RecipeName& dependent_recipe : dependents) { // For each recipe transitively depending on this input for (const IndexName& dependent_index_name : dependent_recipe.first) { @@ -6018,7 +6022,6 @@ void IndexFile::provide(const vector& filenames) { } void IndexFile::assign_constructed(const vector& filenames) { - std::cerr << "Set " << get_identifier() << " to be at:"; for (auto& f : filenames) { std::cerr << " " << f; } @@ -6041,7 +6044,9 @@ void IndexFile::add_scope(const string& scope) { auto found = std::find(scopes.begin(), scopes.end(), scope); if (found == scopes.end()) { scopes.push_back(scope); +#ifdef debug_index_registry std::cerr << "Added scope " << scope << " to " << get_identifier() << std::endl; +#endif } } diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index e16efe0c76..8846c28530 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -60,7 +60,7 @@ static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu } #endif -#define debug +//#define debug using namespace std; using namespace vg; From 5ae95d5e0569310dc7274b50f5b3f077e7072fe1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 15:59:33 -0400 Subject: [PATCH 12/22] Add a terrible multi-extension system that needs hacky workarounds and give kmc a useful file limit --- src/index_registry.cpp | 116 ++++++++++++++++++++++++------ src/index_registry.hpp | 27 +++++-- src/subcommand/autoindex_main.cpp | 9 +++ 3 files changed, 126 insertions(+), 26 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 5b21a8d6ff..a2feedb04e 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -18,6 +18,7 @@ #include #include #include +#include 
#include #include #include @@ -625,7 +626,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); - registry.register_index("Giraffe GBZ", "giraffe.gbz"); + registry.register_index("Giraffe GBZ", std::vector{"gbz", "giraffe.gbz"}); registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); registry.register_index("Top Level Chain Distance Index", "tcdist"); registry.register_index("r Index", "ri"); @@ -4426,6 +4427,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; mkdir(kmc_tmp_dir.c_str(), 0700); + assert(std::filesystem::exists(kmc_tmp_dir)); // Run kmc: count k-mers in KFF format with canonical k-mers. // Use fork()+execvp() instead of system() to avoid shell quoting issues. @@ -4452,6 +4454,16 @@ IndexRegistry VGIndexes::get_vg_index_registry() { close(devnull); } } + + // Allow enough open files for kmc to work; see + // + // and + // TODO: Probably this isn't allowed after fork but we do it anyway. + struct rlimit new_limit; + new_limit.rlim_cur = 2048; + new_limit.rlim_max = RLIM_INFINITY; + setrlimit(RLIMIT_NOFILE, &new_limit); + // execvp promises never to modify its arguments but casn't say // that in the C++ type system because it causes problems in the C // type system. See https://stackoverflow.com/a/19505361 @@ -4630,7 +4642,9 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, // that start with digits) filepath << "." << to_string(chunk); } - filepath << "." << registry->get_index(identifier)->get_suffix(); + // Use the suffix we've assigned, which is the first one we have that's + // unique within the scopes. + filepath << "." 
<< assigned_suffix.at(identifier); return filepath.str(); } @@ -4859,26 +4873,41 @@ void IndexRegistry::make_indexes(const vector& identifiers) { } void IndexRegistry::register_index(const IndexName& identifier, const string& suffix) { + register_index(identifier, vector{suffix}); +} + +void IndexRegistry::register_index(const IndexName& identifier, const vector& suffixes) { // Add this index to the registry if (identifier.empty()) { error(context) << "indexes must have a non-empty identifier" << endl; } - if (suffix.empty()) { - error(context) << "indexes must have a non-empty suffix" << endl; - } - if (isdigit(suffix.front())) { - // this ensures that we can add numbers to the suffix to create a unique suffix - // for chunked workflows - error(context) << "suffixes cannot start with a digit" << endl; - } if (index_registry.count(identifier)) { error(context) << "index registry contains a duplicated identifier: " << identifier << endl; } - if (registered_suffixes.count(suffix)) { - error(context) << "index registry contains a duplicated suffix: " << suffix << endl; + bool has_unique_suffix = false; + for (auto suffix : suffixes) { + if (suffix.empty()) { + error(context) << "indexes must have a non-empty suffix" << endl; + } + if (isdigit(suffix.front())) { + // this ensures that we can add numbers to the suffix to create a unique suffix + // for chunked workflows + error(context) << "suffixes cannot start with a digit" << endl; + } + if (!registered_suffixes.count(suffix)) { + has_unique_suffix = true; + } + } + if (!has_unique_suffix) { + // We need each index to have at least one new suffix not yet used or + // we might not be able to find a place to put it when making the + // indexes in the order registered. 
+        error(context) << "index registry contains other indexes for all suffixes for " << identifier << endl; + } + index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffixes)); + for (auto suffix : suffixes) { + registered_suffixes.insert(suffix); } - index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffix)); - registered_suffixes.insert(suffix); } @@ -4934,7 +4963,12 @@ vector IndexRegistry::get_possible_filenames(const IndexName& identifier error(context) << "cannot require unregistered index: " << identifier << endl; } const IndexFile* index = get_index(identifier); - return {get_prefix() + "." + index->get_suffix()}; + vector filenames; + for (auto& suffix : index->get_suffixes()) { + // Try all the possible suffixes + filenames.push_back(get_prefix() + "." + suffix); + } + return filenames; } vector IndexRegistry::require(const IndexName& identifier) const { @@ -5844,6 +5878,44 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } } + // Assign each index a unique scoped suffix, so we can use ..gbz + // for the Giraffe GBZ and .gbz for the plain one. + unordered_set used_scoped_suffixes; + for (auto& step : plan.steps) { + for (const IndexName& index_name : step.first) { + auto index = get_index(index_name); + // If scopes were provided, this is an input, and just use those + vector scopes = index->get_scopes(); + if (scopes.empty()) { + // Otherwise see if we have planned scopes + scopes = plan.get_scopes(index_name); + } + + // Build the part that is any scopes (either provided or planned) + std::stringstream suffix_base; + for (auto& scope : scopes) { + suffix_base << scope << "."; + } + + for (auto& suffix : index->get_suffixes()) { + // Consider this suffix option on top of the scopes + string candidate_suffix = suffix_base.str() + suffix; + + auto found = used_scoped_suffixes.find(candidate_suffix); + if (found == used_scoped_suffixes.end()) { + // This is a fresh one. Use it. 
+ used_scoped_suffixes.emplace_hint(found, candidate_suffix); + // We only remember the suffix itself; we reconstruct the + // actual name later, with chunk info inserted. + // TODO: Instead of building an essentially fake string key + // should we do something else? + plan.assigned_suffix[index_name] = suffix; + break; + } + } + } + } + // Now remove the input data from the plan plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { return all_finished(recipe_choice.first); @@ -5996,7 +6068,11 @@ string IndexRegistry::to_dot(const vector& targets) const { return strm.str(); } -IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : identifier(identifier), suffix(suffix) { +IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : IndexFile(identifier, vector{suffix}) { + // Nothing to do! +} + +IndexFile::IndexFile(const IndexName& identifier, const vector& suffixes) : identifier(identifier), suffixes(suffixes) { // nothing more to do } @@ -6004,8 +6080,8 @@ const IndexName& IndexFile::get_identifier() const { return identifier; } -const string& IndexFile::get_suffix() const { - return suffix; +const vector& IndexFile::get_suffixes() const { + return suffixes; } const vector& IndexFile::get_filenames() const { @@ -6022,10 +6098,6 @@ void IndexFile::provide(const vector& filenames) { } void IndexFile::assign_constructed(const vector& filenames) { - for (auto& f : filenames) { - std::cerr << " " << f; - } - std::cerr << std::endl; this->filenames = filenames; provided_directly = false; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 29bdafd481..bdc08c8b1f 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -215,6 +215,9 @@ class IndexingPlan { /// applies only to a particular sample, it will be scoped to the sample /// name). Stored deduplicated and in the order added. 
map> scopes; + /// The unique (when combined with scopes) suffix for each index in the + /// plan. + map assigned_suffix; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -266,6 +269,11 @@ class IndexRegistry { /// Register an index containing the given identifier void register_index(const IndexName& identifier, const string& suffix); + + /// Register an index containing the given identifier, with multiple possible suffixes. + /// The first suffix not already occupied will be used (allowing scopes to + /// disambiguate between unscoped and scoped indexes with the same suffix). + void register_index(const IndexName& identifier, const vector& suffixes); /// Register a recipe to produce an index using other indexes /// or input files. Recipes registered earlier will have higher priority. @@ -281,12 +289,20 @@ class IndexRegistry { /// optionally with a scope that propagates to descendant files. /// /// The empty string represents no scope. + /// + /// TODO: If scopes contain ".", we can run into problems with combinations + /// of different scopes producing the same final string. Right now we only + /// use one kind of scope, which avoids this. void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); /// Indicate a list of serialized files that contains some identified index, /// optionally with a scope that propagates to descendant files. /// /// The empty string represents no scope. + /// + /// TODO: If scopes contain ".", we can run into problems with combinations + /// of different scopes producing the same final string. Right now we only + /// use one kind of scope, which avoids this. 
void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); /// Remove a provided index @@ -408,12 +424,15 @@ class IndexFile { /// Create a new IndexFile with a unique identifier IndexFile(const IndexName& identifier, const string& suffix); + + /// Create a new IndexFile with a unique identifier, which may use one of several suffixes. + IndexFile(const IndexName& identifier, const vector& suffixes); /// Get the globally unique identifier for this index const IndexName& get_identifier() const; - /// Returns the suffix to be used for this index - const string& get_suffix() const; + /// Returns the suffixes that can be used for this index + const vector& get_suffixes() const; /// Get the filename(s) that contain this index const vector& get_filenames() const; @@ -450,8 +469,8 @@ class IndexFile { // the global identifier for the IndexName identifier; - // the suffix it adds to output files - const string suffix; + // the suffixes it can add to output files + const vector suffixes; // the filename(s) associated with the index vector filenames; diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index fc16be62fd..5179424407 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -390,6 +390,15 @@ int main_autoindex(int argc, char** argv) { if (!registry.available(target)) { vector inferred_file_names = registry.get_possible_filenames(target); for (const string& filename : inferred_file_names) { + if (target == "Giraffe GBZ" && !ends_with(filename, ".giraffe.gbz")) { + // TODO: Giraffe GBZ indexes can be saved as ..gbz or .giraffe.gbz. + // But we can't pick up ..gbz automatically without possibly mistaking a plain GBZ for a Giraffe one. + // And we don't handle haplotype sampling in autoindex yet anyway. + // So only find Giraffe GBZs. + // TODO: Allow the suffixes to have Snakemake-style + // wildcards that populate scopes on the files. 
+ continue; + } if (file_exists(filename)) { logger.info() << "Guessing that " << filename << " is " << target << endl; registry.provide(target, filename); From 888d65c4a814d1b7ac36b11d5168cd4b71f84f6f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 16:36:40 -0400 Subject: [PATCH 13/22] Add missing dependency --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2a1514a81e..1fbc47e12e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -y install \ samtools curl unzip redland-utils librdf-dev cmake pkg-config wget gtk-doc-tools \ raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev \ libcairo2-dev libpixman-1-dev libffi-dev libcairo-dev libprotobuf-dev libboost-all-dev \ - tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev + tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev kmc ###DEPS_END### FROM packages AS build From a4869042eb053686adb00254a5a1856beec5dce4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 17:34:33 -0400 Subject: [PATCH 14/22] Hook up CLI for non-diploid sampling and controlling haplotype count --- src/index_registry.cpp | 27 +++++++----- src/index_registry.hpp | 12 ++++-- src/subcommand/giraffe_main.cpp | 69 ++++++++++++++++++------------- test/haplotype-sampling/README.md | 2 +- test/t/54_vg_haplotypes.t | 18 +++++++- 5 files changed, 83 insertions(+), 45 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index a2feedb04e..fdf79cbb38 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -130,9 +130,11 @@ int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAUL double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = 
IndexingParameters::Basic; -std::unordered_set IndexingParameters::haplotype_reference_samples = {}; -int IndexingParameters::haplotype_minimizer_k = 29; -int IndexingParameters::haplotype_minimizer_w = 11; +std::unordered_set IndexingParameters::haplotype_sampling_reference_samples = {}; +size_t IndexingParameters::haplotype_sampling_num_haplotypes = Recombinator::NUM_CANDIDATES; +bool IndexingParameters::haplotype_sampling_diploid = true; +int IndexingParameters::haplotype_sampling_minimizer_k = 29; +int IndexingParameters::haplotype_sampling_minimizer_w = 11; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -4215,14 +4217,14 @@ IndexRegistry VGIndexes::get_vg_index_registry() { load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); // Override reference samples if requested. - if (!IndexingParameters::haplotype_reference_samples.empty()) { + if (!IndexingParameters::haplotype_sampling_reference_samples.empty()) { if (IndexingParameters::verbosity != IndexingParameters::None) { info(context) << "Updating reference samples." << endl; } - size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_reference_samples); - if (present != IndexingParameters::haplotype_reference_samples.size()) { + size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_sampling_reference_samples); + if (present != IndexingParameters::haplotype_sampling_reference_samples.size()) { warn(context) << "Only " << present << " out of " - << IndexingParameters::haplotype_reference_samples.size() + << IndexingParameters::haplotype_sampling_reference_samples.size() << " reference samples are present." << endl; } } @@ -4239,7 +4241,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { ? 
Haplotypes::verbosity_basic : Haplotypes::verbosity_silent); Recombinator recombinator(gbz, haplotypes, hap_verbosity); - Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); + Recombinator::Parameters parameters(Recombinator::Parameters::preset_default); + parameters.num_haplotypes = IndexingParameters::haplotype_sampling_num_haplotypes; + parameters.diploid_sampling = IndexingParameters::haplotype_sampling_diploid; + parameters.include_reference = true; gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); // Build GBZ. @@ -4356,8 +4361,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Build minimizer index without payload. MinimizerIndexParameters minimizer_params; - minimizer_params.minimizers(IndexingParameters::haplotype_minimizer_k, - IndexingParameters::haplotype_minimizer_w) + minimizer_params.minimizers(IndexingParameters::haplotype_sampling_minimizer_k, + IndexingParameters::haplotype_sampling_minimizer_w) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); HaplotypePartitioner::minimizer_index_type minimizer_index = build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); @@ -4432,7 +4437,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Run kmc: count k-mers in KFF format with canonical k-mers. // Use fork()+execvp() instead of system() to avoid shell quoting issues. 
string reads_arg = "@" + list_file; - string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_minimizer_k); + string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_sampling_minimizer_k); const char* kmc_argv[] = {"kmc", kmc_k_arg.c_str(), "-m128", "-okff", "-hp", reads_arg.c_str(), kmc_prefix.c_str(), kmc_tmp_dir.c_str(), nullptr}; diff --git a/src/index_registry.hpp b/src/index_registry.hpp index bdc08c8b1f..2e20291423 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -131,12 +131,16 @@ struct IndexingParameters { static double thread_chunk_inflation_factor; // whether indexing algorithms will log progress (if available) [Basic] static Verbosity verbosity; - // reference samples to override in GBZ during haplotype sampling [{}] - static std::unordered_set haplotype_reference_samples; + // Set of samples to make references and retain during haplotype sampling [{}] + static std::unordered_set haplotype_sampling_reference_samples; + // Number of haplotypes to sample during haplotype sampling [Recombinator::NUM_CANDIDATES] + static size_t haplotype_sampling_num_haplotypes; + // Whether to do diploid sampling during haplotype sampling [true] + static bool haplotype_sampling_diploid; // length of k-mer used in haplotype index and kmer counting [29] - static int haplotype_minimizer_k; + static int haplotype_sampling_minimizer_k; // length of window used in haplotype index [11] - static int haplotype_minimizer_w; + static int haplotype_sampling_minimizer_w; }; /** diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 8846c28530..50095f9e24 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -659,17 +659,21 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map(optarg); + if (IndexingParameters::haplotype_sampling_num_haplotypes == 0) { + logger.error() << "number of haplotypes cannot be 0" << std::endl; + } + break; + + case 
OPT_NO_DIPLOID_SAMPLING: + IndexingParameters::haplotype_sampling_diploid = false; break; case 'G': @@ -1263,10 +1289,6 @@ int main_giraffe(int argc, char** argv) { output_basename = optarg; break; - case OPT_SET_REFERENCE: - reference_samples.insert(optarg); - break; - case OPT_REPORT_NAME: report_name = ensure_writable(logger, optarg); break; @@ -1502,15 +1524,16 @@ int main_giraffe(int argc, char** argv) { << " that sample name is " << sample_scope << std::endl; } } else { - sample_scope = "sample"; + logger.error() << "Unable to determine a sample name for haplotype sampling; provide --sample" << endl; } } if (sample_scope == "giraffe") { logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } - // Pass parameters to the recipes through IndexingParameters. - IndexingParameters::haplotype_reference_samples = reference_samples; + // Pass parameters not directly provided in option parsers to the + // recipes through IndexingParameters. + IndexingParameters::haplotype_sampling_reference_samples = reference_samples; if (kff_filename.empty() && fastq_filename_1.empty()) { // TODO: Eventually learn to kmer count from GAM @@ -1631,16 +1654,6 @@ int main_giraffe(int argc, char** argv) { } } - if (haplotype_sampling && !registry.available("Giraffe GBZ") && !registry.available("GBZ")) { - // When doing haplotype sampling, we need to name our sampled GBZ just - // .gbz on top of the basename we put the sample name in, because the - // tests require it. This means we can't build a base .gbz and a - // sampled .gbz in the same indexing run, or they will fight over the - // name. So make sure we don't try. - logger.error() << "Cannot do haplotype sampling without a GBZ available" << std::endl; - } - // TODO: Actually force the index registry to use the name we want for the sampled GBZ even though it's not the one it would generate. 
- //If we're making new zipcodes, we should rebuild the minimizers too if (!(indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index e188cf8a9c..3acbb64aca 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -151,5 +151,5 @@ You may also want to read about the [best practices](https://github.com/vgteam/v Remember to clean up after trying all the examples: ``` -rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam +rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam integration.* ``` diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index 421d61f89e..500788b926 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 40 +plan tests 44 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph: # - GRCh38#chr6:31498145-31511124 (micb) @@ -118,6 +118,20 @@ vg giraffe -Z full.gbz --haplotype-sampling \ is $? 
0 "Giraffe does fully automatic haplotype sampling" is $(vg gbwt -H -Z auto_all.HG003.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" +# Giraffe integration, non-diploid +vg giraffe -Z full.gbz --haplotype-sampling --no-diploid-sampling --num-haplotypes 3 \ + --index-basename auto_nondip -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam 2> /dev/null +is $? 0 "Giraffe does non-diploid haplotype sampling" +is $(vg gbwt -H -Z auto_nondip.HG003.gbz) 5 "non-diploid haplotype sampling sampling produces 3 requested + 2 reference haplotypes" + +# Giraffe integration, single reference +vg giraffe -Z full.gbz --haplotype-sampling --set-reference GRCh38 \ + --index-basename auto_oneref -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam 2> /dev/null +is $? 0 "Giraffe does haplotype sampling when setting reference" +is $(vg gbwt -H -Z auto_oneref.HG003.gbz) 3 "setting reference produces 2 diploid + 1 reference haplotypes" + # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt is $? 
1 "sampling with mismatched GBZ and haplotype information fails" @@ -133,4 +147,6 @@ rm -f GRCh38.HG003.* HG003_GRCh38.gam rm -f auto_hapl.HG003.* auto_hapl.gam rm -f auto_kff.HG003.* auto_kff.gam rm -f auto_all.HG003.* auto_all.gam +rm -f auto_nondip.HG003.* auto_nondip.gam +rm -f auto_oneref.HG003.* auto_oneref.gam rm -f log.txt From 14950bd21140366845d7f2e5e11b6e50ec402844 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 18:22:32 -0400 Subject: [PATCH 15/22] Wrap long help --- src/subcommand/giraffe_main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 50095f9e24..910bfeab40 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -661,7 +661,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map Date: Tue, 10 Mar 2026 18:50:18 -0400 Subject: [PATCH 16/22] Fix CLI code to pass linting --- src/subcommand/giraffe_main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 910bfeab40..c336cee7f3 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -667,8 +667,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map Date: Wed, 11 Mar 2026 11:44:50 -0400 Subject: [PATCH 17/22] Implement Snakemake-y wildcards to try and let scopes control what suffix we use --- src/index_registry.cpp | 229 ++++++++++++++++++++++---------- src/index_registry.hpp | 67 ++++++---- src/subcommand/giraffe_main.cpp | 10 +- 3 files changed, 202 insertions(+), 104 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index fdf79cbb38..ce780fea5d 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -628,7 +628,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); - 
registry.register_index("Giraffe GBZ", std::vector{"gbz", "giraffe.gbz"}); + registry.register_index("Giraffe GBZ", std::vector{"{sample}.gbz", "giraffe.gbz"}); registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); registry.register_index("Top Level Chain Distance Index", "tcdist"); registry.register_index("r Index", "ri"); @@ -4605,15 +4605,20 @@ bool IndexingPlan::is_intermediate(const IndexName& identifier) const { return !targets.count(identifier); } -const vector IndexingPlan::get_scopes(const IndexName& identifier) const { +const map IndexingPlan::get_scopes(const IndexName& identifier) const { auto found = scopes.find(identifier); if (found == scopes.end()) { return {}; } - // TODO: we can't really return an empty vector reference so we are stuck doing a copy here. + // We can't refer into a new slot so we need to copy return found->second; } +const map& IndexingPlan::get_scopes(const IndexName& identifier) { + // We can create the entry if not present + return scopes[identifier]; +} + int64_t IndexingPlan::target_memory_usage() const { return IndexingParameters::max_memory_proportion * registry->get_target_memory_usage(); } @@ -4638,18 +4643,58 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, // we're not saving this file, make it temporary filepath << registry->get_work_dir() + "/" + sha1sum(identifier); } - for (auto& scope : get_scopes(identifier)) { - // This index belongs to just a sample or something. - filepath << "." << scope; + + // We need the planned scopes, not the scopes for a completed file. 
+ auto planned_scopes = get_scopes(identifier); + string substituted_suffix; + for (auto& suffix : registry->get_index(identifier)->get_suffixes()) { + // Find the first suffix where all its wildcards are filled in by scopes + auto needed_keys = IndexRegistry::get_wildcards(suffix); + bool missing_keys = false; + for (auto& key : needed_keys) { + if (!planned_scopes.count(key)) { + missing_keys = true; + break; + } + } + if (missing_keys) { + continue; + } + // Now we know we want this suffix + + // Substitute the suffix + substituted_suffix = IndexRegistry::substitute_wildcards(suffix, planned_scopes); + + // Make the set of scopes not accounted for in the suffix. + // TODO: Merge with substitution? + // TODO: Just make all scope-able files say so? + set unused_scopes; + for (auto& kv : planned_scopes) { + unused_scopes.insert(kv.first); + } + for (auto& used : needed_keys) { + unused_scopes.erase(used); + } + + for (auto& key : unused_scopes) { + // Put the extra scopes in the filename + filepath << "." << planned_scopes.at(key); + } + + break; + } + if (substituted_suffix.empty()) { + // We really should find a suffix to use + throw std::runtime_error("No suffix for " + identifier + " can be used with available scopes."); } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes - // that start with digits) + // that start with digits). + // TODO: What if a scope starts with digits? Put the numbers somewhere else? filepath << "." << to_string(chunk); } - // Use the suffix we've assigned, which is the first one we have that's - // unique within the scopes. - filepath << "." << assigned_suffix.at(identifier); + // Use the suffix we've substituted. + filepath << "." 
<< substituted_suffix; return filepath.str(); } @@ -4690,14 +4735,13 @@ set IndexingPlan::dependents(const IndexName& identifier) const { return dependent_steps; } -void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { +void IndexingPlan::add_scope(const IndexName& identifier, const string& key, const string& value) { // Get a mutable reference to the scopes we need to add to. auto& index_scopes = scopes[identifier]; - // TODO: Change the data structure here if we anticipate an appreciable number of scopes. - auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); + auto found = index_scopes.find(key); if (found == index_scopes.end()) { - index_scopes.push_back(scope); + index_scopes.emplace_hint(found, key, value); } } @@ -4780,9 +4824,9 @@ void IndexRegistry::make_indexes(const vector& identifiers) { if (!index->was_provided_directly()) { // and assign the new (or first) ones index->assign_constructed(results); - for (auto& scope : plan.get_scopes(*it)) { + for (auto& scope_kv : plan.get_scopes(*it)) { // and keep their scopes (in case we ever need them) - index->add_scope(scope); + index->add_scope(scope_kv.first, scope_kv.second); } } ++it; @@ -4915,12 +4959,96 @@ void IndexRegistry::register_index(const IndexName& identifier, const vector IndexRegistry::get_wildcards(const string& pattern) { + set result; + size_t wildcard_start = std::numeric_limits::max(); + for (size_t i = 0; i < pattern.size(); i++) { + // Scan the string and find the wildcards + auto& c = pattern[i]; + if (wildcard_start == std::numeric_limits::max()) { + // Need an open brace + if (c == '{') { + wildcard_start = i; + } else if (c == '}') { + throw std::runtime_error("Unmatched closing brace in " + pattern); + } + } else { + // Need a close brace + if (c == '}') { + if (i - wildcard_start < 2) { + // There's no name here + throw std::runtime_error("Empty wildcard in " + pattern); + } + string name = pattern.substr(wildcard_start + 1, i - 
wildcard_start - 2); + auto found = result.find(name); + if (found != result.end()) { + throw std::runtime_error("Duplicate wildcard in " + pattern); + } + // Save the new wildcard + result.emplace_hint(found, name); + // Go back to not-in-a-wildcard state + wildcard_start = std::numeric_limits::max(); + } + } + } + if (wildcard_start != std::numeric_limits::max()) { + // We ended while reading a wildcard + throw std::runtime_error("Unmatched opening brace in " + pattern); + } + return result; +} + +string IndexRegistry::substitute_wildcards(const string& pattern, const map values) { + // TODO: Deduplicate code with get_wildcards somehow + + std::stringstream result; + + size_t wildcard_start = std::numeric_limits::max(); + for (size_t i = 0; i < pattern.size(); i++) { + // Scan the string and find the wildcards + auto& c = pattern[i]; + if (wildcard_start == std::numeric_limits::max()) { + // Need an open brace + if (c == '{') { + wildcard_start = i; + } else if (c == '}') { + throw std::runtime_error("Unmatched closing brace in " + pattern); + } else { + result << c; + } + } else { + // Need a close brace + if (c == '}') { + if (i - wildcard_start < 2) { + // There's no name here + throw std::runtime_error("Empty wildcard in " + pattern); + } + string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 2); + auto found = values.find(name); + if (found == values.end()) { + throw std::runtime_error("Undefined wildcard " + name + " in " + pattern); + } + // Output the wildcard value + result << found->second; + // Go back to not-in-a-wildcard state + wildcard_start = std::numeric_limits::max(); + } + } + } + if (wildcard_start != std::numeric_limits::max()) { + // We ended while reading a wildcard + throw std::runtime_error("Unmatched opening brace in " + pattern); + } + + return result.str(); +} + -void IndexRegistry::provide(const IndexName& identifier, const string& filename, const std::string& scope) { - provide(identifier, vector(1, filename), 
scope); +void IndexRegistry::provide(const IndexName& identifier, const string& filename, const map& scopes) { + provide(identifier, vector(1, filename), scopes); } -void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const std::string& scope) { +void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const map& scopes) { if (IndexingParameters::verbosity >= IndexingParameters::Debug) { info(context) << "Provided: " << identifier << endl; } @@ -4934,8 +5062,8 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f } auto index = get_index(identifier); index->provide(filenames); - if (!scope.empty()) { - index->add_scope(scope); + for (auto& scope_kv : scopes) { + index->add_scope(scope_kv.first, scope_kv.second); } } @@ -5870,52 +5998,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { for (const IndexName& dependent_index_name : dependent_recipe.first) { // For each index the recipe generates - for (auto& scope : provided_scopes) { + for (auto& scope_kv : provided_scopes) { // Make sure that index is scoped with all scopes on the input. - plan.add_scope(dependent_index_name, scope); + plan.add_scope(dependent_index_name, scope_kv.first, scope_kv.second); } - - // Note that all outputs will get their scopes from the - // inputs added in the same order (the order that the - // inputs appear in the plan). - } - } - } - } - - // Assign each index a unique scoped suffix, so we can use ..gbz - // for the Giraffe GBZ and .gbz for the plain one. 
- unordered_set used_scoped_suffixes; - for (auto& step : plan.steps) { - for (const IndexName& index_name : step.first) { - auto index = get_index(index_name); - // If scopes were provided, this is an input, and just use those - vector scopes = index->get_scopes(); - if (scopes.empty()) { - // Otherwise see if we have planned scopes - scopes = plan.get_scopes(index_name); - } - - // Build the part that is any scopes (either provided or planned) - std::stringstream suffix_base; - for (auto& scope : scopes) { - suffix_base << scope << "."; - } - - for (auto& suffix : index->get_suffixes()) { - // Condider this suffix option on top of the scopes - string candidate_suffix = suffix_base.str() + suffix; - - auto found = used_scoped_suffixes.find(candidate_suffix); - if (found == used_scoped_suffixes.end()) { - // This is a fresh one. Use it. - used_scoped_suffixes.emplace_hint(found, candidate_suffix); - // We only remember the suffix itself; we reconstruct the - // actual name later, with chunk info inserted. - // TODO: Instead of building an essentially fake string key - // should we do something else? - plan.assigned_suffix[index_name] = suffix; - break; } } } @@ -6111,23 +6197,22 @@ bool IndexFile::was_provided_directly() const { return provided_directly; } -void IndexFile::add_scope(const string& scope) { +void IndexFile::add_scope(const string& key, const string& value) { if (!is_finished()) { // Scopes for nonexistent files belong to the plan, not us. - throw std::logic_error("Cannot assign " + scope + " scope to unfinished " + get_identifier() + " index"); + throw std::logic_error("Cannot assign " + key + " = " + value + " scope to unfinished " + get_identifier() + " index"); } - // TODO: Change the data structure here if we anticipate an appreciable number of scopes. 
- auto found = std::find(scopes.begin(), scopes.end(), scope); + auto found = scopes.find(key); if (found == scopes.end()) { - scopes.push_back(scope); + scopes.emplace_hint(found, key, value); #ifdef debug_index_registry - std::cerr << "Added scope " << scope << " to " << get_identifier() << std::endl; + std::cerr << "Added scope " << key << " = " + value + " to " << get_identifier() << std::endl; #endif } } -const vector& IndexFile::get_scopes() const { +const map& IndexFile::get_scopes() const { return scopes; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 2e20291423..87dbb3d222 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -180,6 +180,13 @@ class IndexingPlan { string output_filepath(const IndexName& identifier) const; /// Get the suffix with which to save the given index's files. + /// + /// The path will substitute scopes into any {wildcards} in the index's + /// suffix, and scopes not used in the suffix will appear in the + /// filename in key order. + /// + /// The suffix used will be the first one for which all wildcards can be + /// substituted. string output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const; /// Ge the steps of the plan @@ -189,10 +196,15 @@ class IndexingPlan { /// plan, and false if it is to be preserved. bool is_intermediate(const IndexName& identifier) const; - /// Get the deduplicated scopes, in addition order, that the given index - /// will be qualified with when generated. Only works for indexes that will - /// be generated by the plan, not for inputs. - const vector get_scopes(const IndexName& identifier) const; + /// Get the scopes that the given index will be qualified with when + /// generated. Only works for indexes that will be generated by the plan, + /// not for inputs. + const map get_scopes(const IndexName& identifier) const; + + /// Get the scopes that the given index will be qualified with when + /// generated. 
Only works for indexes that will be generated by the plan, + /// not for inputs. + const map& get_scopes(const IndexName& identifier); // TODO: is this where this function wants to live? /// The memory limit, with a little slosh for prediction inaccuracy @@ -208,7 +220,7 @@ class IndexingPlan { /// Add a scope to the scopes the given index will be qualified with when /// generated, if not present already. - void add_scope(const IndexName& identifier, const string& scope); + void add_scope(const IndexName& identifier, const string& key, const string& value); /// The steps to be invoked in the plan. May be empty before the plan is /// actually planned. @@ -217,11 +229,9 @@ class IndexingPlan { set targets; /// The scopes qualifying each index in the plan (for example, if an index /// applies only to a particular sample, it will be scoped to the sample - /// name). Stored deduplicated and in the order added. - map> scopes; - /// The unique (when combined with scopes) suffix for each index in the - /// plan. - map assigned_suffix; + /// name). Stored using an ordered map to ensure a consistent iteration + /// order. + map> scopes; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -275,9 +285,15 @@ class IndexRegistry { void register_index(const IndexName& identifier, const string& suffix); /// Register an index containing the given identifier, with multiple possible suffixes. - /// The first suffix not already occupied will be used (allowing scopes to - /// disambiguate between unscoped and scoped indexes with the same suffix). + /// The first suffix where all {wildcards} can be substituted with scopes will be used. void register_index(const IndexName& identifier, const vector& suffixes); + + /// Get the names of all brace-enclosed {wildcards} in the given pattern + static set get_wildcards(const string& pattern); + + /// Substitute wildcards into the given pattern. All wildcards must have + /// values assigned. 
Extra values not used are allowed. + static string substitute_wildcards(const string& pattern, const map values); /// Register a recipe to produce an index using other indexes /// or input files. Recipes registered earlier will have higher priority. @@ -288,26 +304,22 @@ class IndexRegistry { /// Indicate one recipe is a broadened version of another. The indexes consumed and produced /// by the generalization must be semantically identical to those of the generalizee void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee); - + /// Indicate a serialized file that contains some identified index, - /// optionally with a scope that propagates to descendant files. - /// - /// The empty string represents no scope. + /// optionally with scopes that propagate to descendant files. /// /// TODO: If scopes contain ".", we can run into problems with combinations /// of different scopes producing the same final string. Right now we only /// use one kind of scope, which avoids this. - void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); + void provide(const IndexName& identifier, const string& filename, const map& scopes = {}); /// Indicate a list of serialized files that contains some identified index, - /// optionally with a scope that propagates to descendant files. - /// - /// The empty string represents no scope. + /// optionally with scopes that propagate to descendant files. /// /// TODO: If scopes contain ".", we can run into problems with combinations /// of different scopes producing the same final string. Right now we only /// use one kind of scope, which avoids this. 
- void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); + void provide(const IndexName& identifier, const vector& filenames, const map& scopes = {}); /// Remove a provided index void reset(const IndexName& identifier); @@ -447,17 +459,16 @@ class IndexFile { /// Assign constructed filenames to this index void assign_constructed(const vector& filenames); - /// Append a new scope to qualify the files in the index, if the scope is + /// Add a new scope to qualify the files in the index, if the scope is /// not already on the index. /// /// Should only be used on files that are actually filled in already; /// scopes for files that don't exist yet are the responsibility of the /// IndexingPlan. - void add_scope(const string& scope); + void add_scope(const string& key, const string& value); /// Get all scopes qualifying the index (such as a sample name). - /// Scopes are deduplicated and produced in the order added. - const vector& get_scopes() const; + const map& get_scopes() const; /// Returns true if the index has already been built or provided bool is_finished() const; @@ -479,10 +490,8 @@ class IndexFile { // the filename(s) associated with the index vector filenames; - // The dedplicated ordered scopes qualifying the index. - // TODO: Use an ordered set for good algorithmics for deduplicating scopes. - // Right now we have exactly one kind of scope so this doesn't matter. - vector scopes; + // The scopes qualifying the index. 
+ map scopes; // keep track of whether the index was provided directly bool provided_directly = false; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c336cee7f3..522b15f41d 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1542,7 +1542,7 @@ int main_giraffe(int argc, char** argv) { } if (!kff_filename.empty()) { - registry.provide("KFF Kmer Counts", kff_filename, sample_scope); + registry.provide("KFF Kmer Counts", kff_filename, {{"sample", sample_scope}}); } if (!fastq_filename_1.empty()) { // If FASTQs are available, send those and we will do haplotype sampling. @@ -1552,7 +1552,7 @@ int main_giraffe(int argc, char** argv) { } // Provide the FASTQs to the index registry, but mark everything // derived from them as being scoped to the particular sample. - registry.provide("FASTQ", fastqs, sample_scope); + registry.provide("FASTQ", fastqs, {{"sample", sample_scope}}); } @@ -1640,7 +1640,11 @@ int main_giraffe(int argc, char** argv) { // Report it because this may not be desired behavior. logger.info() << "Guessing that " << inferred_filename << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename, haplotype_sampling ? 
sample_scope : ""); + std::map scopes; + if (haplotype_sampling) { + scopes.emplace("sample", sample_scope); + } + registry.provide(index_and_extensions->first, inferred_filename, scopes); found = true; // Skip other extension options for the index break; From 9272a72c26f354add2fe6e15e4a544663f822cf7 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 11 Mar 2026 11:49:15 -0400 Subject: [PATCH 18/22] Don't show wildcards in get_possible_filenames() --- src/index_registry.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index ce780fea5d..4ab5276b32 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -5096,9 +5096,25 @@ vector IndexRegistry::get_possible_filenames(const IndexName& identifier error(context) << "cannot require unregistered index: " << identifier << endl; } const IndexFile* index = get_index(identifier); + auto& index_scopes = index->get_scopes(); vector filenames; for (auto& suffix : index->get_suffixes()) { - // Try all the possible suffixes + // Find all suffixes where all wildcards are filled in by scopes. + // TODO: We might want to really match the wildcards against filenames, + // Snakemake-style. + auto needed_keys = IndexRegistry::get_wildcards(suffix); + bool missing_keys = false; + for (auto& key : needed_keys) { + if (!index_scopes.count(key)) { + missing_keys = true; + break; + } + } + if (missing_keys) { + continue; + } + + // Try all those suffixes filenames.push_back(get_prefix() + "." 
+ suffix); } return filenames; From 2171d4a30d6270a11223d28950e77bfa9e87e691 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 11 Mar 2026 12:18:05 -0400 Subject: [PATCH 19/22] Fix wildcard parsing --- src/index_registry.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 4ab5276b32..4368fd5e5e 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4654,6 +4654,9 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, for (auto& key : needed_keys) { if (!planned_scopes.count(key)) { missing_keys = true; +#ifdef debug_index_registry + std::cerr << "Pattern " << suffix << " can't be used because " << key << " scope is not available for " << identifier << std::endl; +#endif break; } } @@ -4676,6 +4679,10 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, unused_scopes.erase(used); } +#ifdef debug_index_registry + std::cerr << "Pattern " << suffix << " substituted as " << substituted_suffix << " for " << identifier << "; " << unused_scopes.size() << " extra scopes" << std::endl; +#endif + for (auto& key : unused_scopes) { // Put the extra scopes in the filename filepath << "." 
<< planned_scopes.at(key); @@ -4979,7 +4986,7 @@ set IndexRegistry::get_wildcards(const string& pattern) { // There's no name here throw std::runtime_error("Empty wildcard in " + pattern); } - string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 2); + string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 1); auto found = result.find(name); if (found != result.end()) { throw std::runtime_error("Duplicate wildcard in " + pattern); @@ -5023,7 +5030,7 @@ string IndexRegistry::substitute_wildcards(const string& pattern, const map Date: Wed, 11 Mar 2026 12:28:18 -0400 Subject: [PATCH 20/22] Add tests to keep wildcard system working --- src/index_registry.cpp | 4 ++ src/unittest/index_registry.cpp | 65 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 4368fd5e5e..b58748d5b2 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4995,6 +4995,8 @@ set IndexRegistry::get_wildcards(const string& pattern) { result.emplace_hint(found, name); // Go back to not-in-a-wildcard state wildcard_start = std::numeric_limits::max(); + } else if (c == '{') { + throw std::runtime_error("Opening brace inside wildcard " + pattern); } } } @@ -5039,6 +5041,8 @@ string IndexRegistry::substitute_wildcards(const string& pattern, const mapsecond; // Go back to not-in-a-wildcard state wildcard_start = std::numeric_limits::max(); + } else if (c == '{') { + throw std::runtime_error("Opening brace inside wildcard " + pattern); } } } diff --git a/src/unittest/index_registry.cpp b/src/unittest/index_registry.cpp index 6c71fd8587..babbc377ac 100644 --- a/src/unittest/index_registry.cpp +++ b/src/unittest/index_registry.cpp @@ -279,5 +279,70 @@ TEST_CASE("IndexRegistry can make plans on a dummy recipe graph", "[indexregistr // } } +TEST_CASE("IndexRegistry wildcard parsing works", "[indexregistry]") { + + SECTION("empty string") { + auto wildcards = 
IndexRegistry::get_wildcards(""); + REQUIRE(wildcards.empty()); + } + + SECTION("string with no wildcards") { + auto wildcards = IndexRegistry::get_wildcards("no wildcards here"); + REQUIRE(wildcards.empty()); + } + + SECTION("string with one wildcard") { + auto wildcards = IndexRegistry::get_wildcards("some {wildcards} here"); + REQUIRE(wildcards.size() == 1); + REQUIRE(wildcards.count("wildcards")); + } + + SECTION("string with several wildcards") { + auto wildcards = IndexRegistry::get_wildcards("{some} {wildcards} {here}"); + REQUIRE(wildcards.size() == 3); + REQUIRE(wildcards.count("some")); + REQUIRE(wildcards.count("wildcards")); + REQUIRE(wildcards.count("here")); + + // Make sure they iterate in alphabetical order + std::vector found; + for (auto& w : wildcards) { + found.push_back(w); + } + REQUIRE(found.at(0) == "here"); + REQUIRE(found.at(1) == "some"); + REQUIRE(found.at(2) == "wildcards"); + } +} + +TEST_CASE("IndexRegistry wildcard substitution works", "[indexregistry]") { + + SECTION("empty string, empty valuation") { + auto result = IndexRegistry::substitute_wildcards("", {}); + REQUIRE(result == ""); + } + + SECTION("empty string, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == ""); + } + + SECTION("nonempty string, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("stuff", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stuff"); + } + + SECTION("nonempty string with some wildcards, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("st{a}ff", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stthingff"); + } + + SECTION("nonempty string with all wildcards, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("st{a}ff{b}", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stthingffanother thing"); + } + +} + } } From 
89b6bc6581f90c9af0e7b38f35dd23291cd24da3 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 16 Mar 2026 16:26:00 -0400 Subject: [PATCH 21/22] Add missing filesystem include --- src/index_registry.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 7bdeb97bbd..0eb26fedbb 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include From 62dfb3ccf2cc9dd1e2c5aa924da8e8c33b283306 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 17 Mar 2026 14:15:20 -0400 Subject: [PATCH 22/22] Save Giraffe logs and clean up any wayward old outputs --- test/t/54_vg_haplotypes.t | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index 500788b926..b94d4d12af 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -70,24 +70,27 @@ is $(vg gbwt -C -Z diploid3.gbz) 2 "2 contigs" is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" # Giraffe integration, guessed output name +rm -f full.HG003.* default.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ - -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > default.gam is $? 0 "Giraffe integration with a guessed output name" cmp diploid.gbz full.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name +rm -f sampled.003HG.* specified.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ --index-basename sampled -N 003HG \ - -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > specified.gam is $? 0 "Giraffe integration with a specified output name" cmp full.HG003.gbz sampled.003HG.gbz is $? 
0 "the sampled graphs are identical" # Giraffe integration, specified reference sample +rm -f GRCh38.HG003.* HG003_GRCh38.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ - -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam is $? 0 "Giraffe integration with a specified reference sample" cmp diploid3.gbz GRCh38.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" @@ -95,42 +98,47 @@ is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, haplotype index built automatically from the GBZ. # Providing --kff-name without --haplotype-name implies haplotype sampling; # the haplotype index (.hapl) is built by the IndexRegistry from the GBZ. +rm -f auto_hapl.HG003.* auto_hapl.gam vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ --index-basename auto_hapl -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam is $? 0 "Giraffe builds haplotype index automatically" -is $(vg gbwt -H -Z auto_hapl.HG003.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_hapl.HG003.gbz)" 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" # Giraffe integration, kmer counting done automatically from reads. # Providing --haplotype-name without --kff-name implies haplotype sampling; # the kmer counts (.kff) are built by the IndexRegistry from the reads using kmc. +rm -f auto_kff.HG003.* auto_kff.gam vg giraffe -Z full.gbz --haplotype-name full.hapl \ --index-basename auto_kff -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_kff.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_kff.gam is $? 
0 "Giraffe counts kmers from reads automatically" -is $(vg gbwt -H -Z auto_kff.HG003.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_kff.HG003.gbz)" 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" # Giraffe integration, fully automatic: both hapl and kff are built by the # IndexRegistry. Triggered by --haplotype-sampling without either input file. +rm -f auto_all.HG003.* auto_all.gam vg giraffe -Z full.gbz --haplotype-sampling \ --index-basename auto_all -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_all.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_all.gam is $? 0 "Giraffe does fully automatic haplotype sampling" -is $(vg gbwt -H -Z auto_all.HG003.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_all.HG003.gbz)" 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" # Giraffe integration, non-diploid +rm -f auto_nondip.HG003.* auto_nondip.gam vg giraffe -Z full.gbz --haplotype-sampling --no-diploid-sampling --num-haplotypes 3 \ --index-basename auto_nondip -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam is $? 0 "Giraffe does non-diploid haplotype sampling" -is $(vg gbwt -H -Z auto_nondip.HG003.gbz) 5 "non-diploid haplotype sampling sampling produces 3 requested + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_nondip.HG003.gbz)" 5 "non-diploid haplotype sampling sampling produces 3 requested + 2 reference haplotypes" # Giraffe integration, single reference +rm -f auto_oneref.HG003.* auto_oneref.gam vg giraffe -Z full.gbz --haplotype-sampling --set-reference GRCh38 \ --index-basename auto_oneref -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam is $? 
0 "Giraffe does haplotype sampling when setting reference" -is $(vg gbwt -H -Z auto_oneref.HG003.gbz) 3 "setting reference produces 2 diploid + 1 reference haplotypes" +is "$(vg gbwt -H -Z auto_oneref.HG003.gbz)" 3 "setting reference produces 2 diploid + 1 reference haplotypes" # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt