From ccef5a609bc6de1b81c6d1addf759afff72cabba Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 16:10:06 -0500 Subject: [PATCH 01/22] Synthesize terrible code for one-command hap indexing and sampling --- Brewfile | 2 + src/index_registry.cpp | 258 +++++++++++++++++++++++++++++++- src/index_registry.hpp | 6 + src/subcommand/giraffe_main.cpp | 177 +++++++++------------- test/t/54_vg_haplotypes.t | 37 ++++- 5 files changed, 371 insertions(+), 109 deletions(-) diff --git a/Brewfile b/Brewfile index 2869f99db4..9a7376ad58 100644 --- a/Brewfile +++ b/Brewfile @@ -22,3 +22,5 @@ brew "boost" brew "pybind11" brew "pandoc" brew "openssl" +tap "brewsci/bio" +brew "brewsci/bio/kmc" diff --git a/src/index_registry.cpp b/src/index_registry.cpp index db3aafc896..6304650405 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include "gfa.hpp" #include "job_schedule.hpp" #include "path.hpp" +#include "recombinator.hpp" #include "zip_code.hpp" #include "io/save_handle_graph.hpp" @@ -125,6 +127,9 @@ int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAUL double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; +std::unordered_set IndexingParameters::haplotype_reference_samples = {}; +std::string IndexingParameters::haplotype_fastq_1 = ""; +std::string IndexingParameters::haplotype_fastq_2 = ""; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -619,6 +624,9 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); registry.register_index("Giraffe GBZ", "giraffe.gbz"); + registry.register_index("Unsampled Giraffe GBZ", "unsampled.gbz"); + 
registry.register_index("Haplotype Index", "hapl"); + registry.register_index("KFF Kmer Counts", "kff"); registry.register_index("Short Read Minimizers", "shortread.withzip.min"); registry.register_index("Short Read Zipcodes", "shortread.zipcodes"); @@ -3957,7 +3965,255 @@ IndexRegistry VGIndexes::get_vg_index_registry() { //////////////////////////////////// // GBZ Recipes //////////////////////////////////// - + + // Haplotype sampling: produce a personalized "Giraffe GBZ" by sampling + // haplotypes from a full-pangenome "Unsampled Giraffe GBZ" given + // haplotype information and kmer counts from reads. + registry.register_recipe({"Giraffe GBZ"}, {"Haplotype Index", "KFF Kmer Counts", "Unsampled Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Sampling haplotypes." << endl; + } + + assert(inputs.size() == 3); + auto hapl_filenames = inputs[0]->get_filenames(); + auto kff_filenames = inputs[1]->get_filenames(); + auto gbz_filenames = inputs[2]->get_filenames(); + assert(gbz_filenames.size() == 1); + assert(hapl_filenames.size() == 1); + assert(kff_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + auto hapl_filename = hapl_filenames.front(); + auto kff_filename = kff_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + // Load GBZ. + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Override reference samples if requested. + if (!IndexingParameters::haplotype_reference_samples.empty()) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Updating reference samples." 
<< endl; + } + size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_reference_samples); + if (present != IndexingParameters::haplotype_reference_samples.size()) { + warn(context) << "Only " << present << " out of " + << IndexingParameters::haplotype_reference_samples.size() + << " reference samples are present." << endl; + } + } + + // Load haplotype information. + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Loading haplotype information from " << hapl_filename << "." << endl; + } + Haplotypes haplotypes; + haplotypes.load_from(hapl_filename); + + // Sample haplotypes. + Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None + ? Haplotypes::verbosity_basic + : Haplotypes::verbosity_silent); + Recombinator recombinator(gbz, haplotypes, hap_verbosity); + Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); + gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); + + // Build GBWTGraph and save GBZ. + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building GBWTGraph." << endl; + } + gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); + string output_name = plan->output_filepath(gbz_output); + save_gbz(sampled_graph, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Allow "Unsampled Giraffe GBZ" to be built from GBWT + GBWTGraph, + // so users who provide those two files can still do haplotype sampling. + registry.register_recipe({"Unsampled Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Combining Giraffe GBWT and GBWTGraph into unsampled GBZ." 
<< endl; + } + + assert(inputs.size() == 2); + auto gg_filenames = inputs[0]->get_filenames(); + auto gbwt_filenames = inputs[1]->get_filenames(); + assert(gg_filenames.size() == 1); + assert(gbwt_filenames.size() == 1); + auto gg_filename = gg_filenames.front(); + auto gbwt_filename = gbwt_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "Haplotype Index" (.hapl) from an unsampled GBZ. + // The distance index is found at .dist or built from the GBZ. + // The r-index is constructed inline from the GBZ's GBWT. + // Using only "Unsampled Giraffe GBZ" as input avoids the cycle: + // Haplotype Index → Giraffe Distance Index → Giraffe GBZ → Haplotype Index + registry.register_recipe({"Haplotype Index"}, {"Unsampled Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Generating haplotype information." << endl; + } + + assert(inputs.size() == 1); + auto gbz_filenames = inputs[0]->get_filenames(); + assert(gbz_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto hapl_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + // Load GBZ. 
+ gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Find the distance index by stripping known GBZ suffixes from the filename. + SnarlDistanceIndex distance_index; + string dist_candidate = gbz_filename; + for (const string& suffix : {".unsampled.gbz", ".giraffe.gbz", ".gbz"}) { + if (dist_candidate.size() > suffix.size() && + dist_candidate.substr(dist_candidate.size() - suffix.size()) == suffix) { + dist_candidate = dist_candidate.substr(0, dist_candidate.size() - suffix.size()) + ".dist"; + break; + } + } + if (ifstream(dist_candidate).is_open()) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Loading distance index from " << dist_candidate << "." << endl; + } + distance_index.deserialize(dist_candidate); + } else { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building distance index." << endl; + } + IntegratedSnarlFinder snarl_finder(gbz.graph); + fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder); + } + + // Build minimizer index without payload, using short-read parameters. + MinimizerIndexParameters minimizer_params; + minimizer_params.minimizers(IndexingParameters::short_read_minimizer_k, + IndexingParameters::short_read_minimizer_w) + .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); + HaplotypePartitioner::minimizer_index_type minimizer_index = + build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); + + // Build r-index inline from the GBZ's GBWT. + gbwt::FastLocate r_index(gbz.index); + r_index.setGBWT(gbz.index); + + // Partition the haplotypes. + Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None + ? 
Haplotypes::verbosity_basic + : Haplotypes::verbosity_silent); + HaplotypePartitioner partitioner(gbz, r_index, distance_index, minimizer_index, hap_verbosity); + HaplotypePartitioner::Parameters partitioner_params; + Haplotypes haplotypes = partitioner.partition_haplotypes(partitioner_params); + + // Save the haplotype information. + string output_name = plan->output_filepath(hapl_output); + haplotypes.serialize_to(output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "KFF Kmer Counts" (.kff) by counting k-mers from reads using kmc. + // FASTQ paths are passed via IndexingParameters::haplotype_fastq_1/2. + registry.register_recipe({"KFF Kmer Counts"}, {}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Counting k-mers from reads for haplotype sampling." << endl; + } + + const string& fastq1 = IndexingParameters::haplotype_fastq_1; + const string& fastq2 = IndexingParameters::haplotype_fastq_2; + if (fastq1.empty()) { + error(context) << "No reads provided for k-mer counting. " + << "Use -f to specify input reads." << endl; + } + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto kff_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(kff_output); + + // Create a temp directory for kmc intermediate files. + string tmp_dir = temp_file::create_directory(); + + // Write a file listing the input reads. + string list_file = tmp_dir + "/reads.txt"; + { + ofstream list_out(list_file); + list_out << fastq1 << "\n"; + if (!fastq2.empty()) { + list_out << fastq2 << "\n"; + } + } + + // kmc produces output_prefix.kff; strip the ".kff" extension to get the prefix. 
+ string kmc_prefix = output_name; + if (kmc_prefix.size() >= 4 && kmc_prefix.substr(kmc_prefix.size() - 4) == ".kff") { + kmc_prefix = kmc_prefix.substr(0, kmc_prefix.size() - 4); + } + + string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; + mkdir(kmc_tmp_dir.c_str(), 0700); + + // Run kmc: count 29-mers in KFF format with canonical k-mers. + string cmd = "kmc -k29 -m128 -okff -hp @" + list_file + + " " + kmc_prefix + " " + kmc_tmp_dir; + if (IndexingParameters::verbosity == IndexingParameters::None) { + cmd += " > /dev/null 2>&1"; + } + int ret = system(cmd.c_str()); + if (ret != 0) { + error(context) << "kmc failed with exit code " << ret + << ". Make sure kmc is in your PATH." << endl; + } + + output_names.push_back(output_name); + return all_outputs; + }); + registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 1cb84a4d80..c5f7a03296 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -128,6 +128,12 @@ struct IndexingParameters { static double thread_chunk_inflation_factor; // whether indexing algorithms will log progress (if available) [Basic] static Verbosity verbosity; + // reference samples to override in GBZ during haplotype sampling [{}] + static std::unordered_set haplotype_reference_samples; + // first FASTQ file to use for k-mer counting in haplotype sampling [""] + static std::string haplotype_fastq_1; + // second FASTQ file to use for k-mer counting in haplotype sampling [""] + static std::string haplotype_fastq_2; }; /** diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c68b938db5..7dc401fa38 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -31,7 +31,6 @@ #include #include "../gbwtgraph_helper.hpp" -#include "../recombinator.hpp" #include #include @@ -617,15 +616,6 @@ std::string strip_suffixes(std::string filename, const std::vector& return filename; } -// 
Returns the name of the sampled GBZ. -std::string sample_haplotypes( - const Logger& logger, - const std::vector>& indexes, - const std::unordered_set& reference_samples, - const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, - bool progress -); - //---------------------------------------------------------------------------- void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map& presets, bool full_help) { @@ -668,6 +658,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map reference_samples; string output_basename; @@ -1071,6 +1065,7 @@ int main_giraffe(int argc, char** argv) { {"zipcode-name", required_argument, 0, 'z'}, {"dist-name", required_argument, 0, 'd'}, {"progress", no_argument, 0, 'p'}, + {"haplotype-sampling", no_argument, 0, OPT_HAPLOTYPE_SAMPLING}, {"haplotype-name", required_argument, 0, OPT_HAPLOTYPE_NAME}, {"kff-name", required_argument, 0, OPT_KFF_NAME}, {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, @@ -1186,6 +1181,9 @@ int main_giraffe(int argc, char** argv) { show_progress = true; break; + case OPT_HAPLOTYPE_SAMPLING: + haplotype_sampling_flag = true; + break; case OPT_HAPLOTYPE_NAME: haplotype_name = optarg; break; @@ -1405,6 +1403,12 @@ int main_giraffe(int argc, char** argv) { for (const auto& filename : provided_indexes) { require_exists(logger, filename.second); } + if (!haplotype_name.empty()) { + require_exists(logger, haplotype_name); + } + if (!kff_name.empty()) { + require_exists(logger, kff_name); + } // Decide if we are outputting to an htslib format bool hts_output = (output_format == "SAM" || output_format == "BAM" || output_format == "CRAM"); @@ -1459,21 +1463,72 @@ int main_giraffe(int argc, char** argv) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." 
<< endl; } - bool haplotype_sampling = !haplotype_name.empty() & !kff_name.empty(); + // Haplotype sampling is active when explicitly requested (--haplotype-sampling) + // or when either --haplotype-name or --kff-name is given. + bool haplotype_sampling = haplotype_sampling_flag || !haplotype_name.empty() || !kff_name.empty(); + // Save the GBZ-derived basename before any override, so we can find + // dist and other indexes co-located with the source GBZ during sampling. + string original_basename = index_basename; if (!index_basename_override.empty()) { index_basename = index_basename_override; } if (haplotype_sampling) { - // If we do haplotype sampling, we get a new GBZ and later build indexes for it. - string gbz_name = sample_haplotypes(logger, provided_indexes, reference_samples, index_basename, - sample_name, haplotype_name, kff_name, show_progress); - registry.provide("Giraffe GBZ", gbz_name); - index_basename = split_ext(gbz_name).first; + + // Determine sample name early so we can incorporate it into the + // index basename; downstream indexes (dist, min) will then be + // co-located with the sampled GBZ. + string sample = sample_name; + if (sample.empty()) { + if (!kff_name.empty()) { + sample = file_base_name(kff_name); + if (show_progress) { + logger.info() << "Guessing from " << kff_name + << " that sample name is " << sample << std::endl; + } + } else if (!fastq_filename_1.empty()) { + // Strip .gz then use file_base_name to strip .fq/.fastq + sample = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); + if (show_progress) { + logger.info() << "Guessing from " << fastq_filename_1 + << " that sample name is " << sample << std::endl; + } + } else { + sample = "sample"; + } + } + if (sample == "giraffe") { + logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; + } + index_basename = index_basename + "." + sample; + + // Pass parameters to the recipes through IndexingParameters. 
+ IndexingParameters::haplotype_reference_samples = reference_samples; + IndexingParameters::haplotype_fastq_1 = fastq_filename_1; + IndexingParameters::haplotype_fastq_2 = fastq_filename_2; + + // Provide inputs to the registry. The full-pangenome GBZ is typed + // as "Unsampled Giraffe GBZ" so the sampling recipe can distinguish + // it from the personalized "Giraffe GBZ" it will produce. + for (auto& index : provided_indexes) { + if (index.first == "Giraffe GBZ") { + registry.provide("Unsampled Giraffe GBZ", index.second); + } else { + registry.provide(index.first, index.second); + } + } + + // Provide haplotype index and kff only if explicitly given. + if (!haplotype_name.empty()) { + registry.provide("Haplotype Index", haplotype_name); + } + if (!kff_name.empty()) { + registry.provide("KFF Kmer Counts", kff_name); + } + } else { - // Otherwise we use the provided indexes. + // Otherwise use the provided indexes directly. for (auto& index : provided_indexes) { registry.provide(index.first, index.second); - } } registry.set_prefix(index_basename); @@ -2296,91 +2351,5 @@ int main_giraffe(int argc, char** argv) { //---------------------------------------------------------------------------- -std::string sample_haplotypes( - const Logger& logger, - const std::vector>& indexes, - const std::unordered_set& reference_samples, - const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, - bool progress -) { - if (progress) { - logger.info() << "Sampling haplotypes" << std::endl; - } - - // Sanity checks. - if (haplotype_file.empty() || kff_file.empty()) { - logger.error() << "Haplotype sampling requires --haplotype-name and --kff-name." << std::endl; - } - - // Determine output name. 
- std::string sample = sample_name; - if (sample.empty()) { - sample = file_base_name(kff_file); - if (progress) { - logger.info() << "Guessing from " << kff_file - << " that sample name is " << sample << std::endl; - } - } - if (sample == "giraffe") { - logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; - } - std::string output_name = basename + "." + sample + ".gbz"; - - // Load GBZ. - gbwtgraph::GBZ gbz; - if (indexes.size() == 1 && indexes[0].first == "Giraffe GBZ") { - load_gbz(gbz, indexes[0].second, progress); - } else if (indexes.size() == 2 && indexes[0].first == "Giraffe GBWT" && indexes[1].first == "GBWTGraph") { - load_gbz(gbz, indexes[0].second, indexes[1].second, progress); - } else if (indexes.size() == 2 && indexes[0].first == "GBWTGraph" && indexes[1].first == "Giraffe GBWT") { - load_gbz(gbz, indexes[1].second, indexes[0].second, progress); - } else { - logger.error() << "Haplotype sampling requires either -Z " - << "or (-g and -H) with no other indexes." << std::endl; - } - - // Override reference samples if necessary. - if (!reference_samples.empty()) { - if (progress) { - logger.info() << "Updating reference samples" << std::endl; - } - size_t present = gbz.set_reference_samples(reference_samples); - if (present != reference_samples.size()) { - logger.warn() << "Only " << present << " out of " << reference_samples.size() - << " reference samples are present" << std::endl; - } - } - - // Load haplotype information. - if (progress) { - logger.info() << "Loading haplotype information from " - << haplotype_file << std::endl; - } - Haplotypes haplotypes; - haplotypes.load_from(haplotype_file); - - // Sample haplotypes. - Haplotypes::Verbosity verbosity = (progress ? 
Haplotypes::verbosity_basic : Haplotypes::verbosity_silent); - Recombinator recombinator(gbz, haplotypes, verbosity); - Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); - gbwt::GBWT sampled_gbwt; - try { - sampled_gbwt = recombinator.generate_haplotypes(kff_file, parameters); - } catch (const std::runtime_error& e) { - logger.error() << "Haplotype sampling failed: " << e.what() << std::endl; - } - - // Create GBWTGraph and save GBZ. - if (progress) { - logger.info() << "Building GBWTGraph" << std::endl; - } - gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); - save_gbz(sampled_graph, output_name, progress); - - return output_name; -} - -//---------------------------------------------------------------------------- - // Register subcommand static Subcommand vg_giraffe("giraffe", "fast haplotype-aware read alignment", PIPELINE, 6, main_giraffe); diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index b951283800..eb82e1da08 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 34 +plan tests 40 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph: # - GRCh38#chr6:31498145-31511124 (micb) @@ -73,7 +73,7 @@ is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null is $? 0 "Giraffe integration with a guessed output name" -cmp diploid.gbz full.HG003.gbz +cmp diploid.gbz full.HG003.giraffe.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name @@ -81,7 +81,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename sampled -N 003HG \ -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null is $? 
0 "Giraffe integration with a specified output name" -cmp full.HG003.gbz sampled.003HG.gbz +cmp full.HG003.giraffe.gbz sampled.003HG.giraffe.gbz is $? 0 "the sampled graphs are identical" # Giraffe integration, specified reference sample @@ -89,9 +89,35 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null is $? 0 "Giraffe integration with a specified reference sample" -cmp diploid3.gbz GRCh38.HG003.gbz +cmp diploid3.gbz GRCh38.HG003.giraffe.gbz is $? 0 "the sampled graph is identical to a manually sampled one" +# Giraffe integration, haplotype index built automatically from the GBZ. +# Providing --kff-name without --haplotype-name implies haplotype sampling; +# the haplotype index (.hapl) is built by the IndexRegistry from the GBZ. +vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ + --index-basename auto_hapl -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam 2> /dev/null +is $? 0 "Giraffe builds haplotype index automatically" +is $(vg gbwt -H -Z auto_hapl.HG003.giraffe.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" + +# Giraffe integration, kmer counting done automatically from reads. +# Providing --haplotype-name without --kff-name implies haplotype sampling; +# the kmer counts (.kff) are built by the IndexRegistry from the reads using kmc. +vg giraffe -Z full.gbz --haplotype-name full.hapl \ + --index-basename auto_kff -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_kff.gam 2> /dev/null +is $? 0 "Giraffe counts kmers from reads automatically" +is $(vg gbwt -H -Z auto_kff.HG003.giraffe.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" + +# Giraffe integration, fully automatic: both hapl and kff are built by the +# IndexRegistry. Triggered by --haplotype-sampling without either input file. 
+vg giraffe -Z full.gbz --haplotype-sampling \ + --index-basename auto_all -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_all.gam 2> /dev/null +is $? 0 "Giraffe does fully automatic haplotype sampling" +is $(vg gbwt -H -Z auto_all.HG003.giraffe.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" + # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt is $? 1 "sampling with mismatched GBZ and haplotype information fails" @@ -104,4 +130,7 @@ rm -f diploid.gbz diploid2.gbz diploid3.gbz rm -f full.HG003.* default.gam rm -f sampled.003HG.* specified.gam rm -f GRCh38.HG003.* HG003_GRCh38.gam +rm -f auto_hapl.HG003.* auto_hapl.gam +rm -f auto_kff.HG003.* auto_kff.gam +rm -f auto_all.HG003.* auto_all.gam rm -f log.txt \ No newline at end of file From 57c06969b75aad76734035bf80f461263227724b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 16:37:48 -0500 Subject: [PATCH 02/22] Synthesize slightly better code that models more things in the graph --- src/index_registry.cpp | 161 +++++++++++++++++++++----------- src/index_registry.hpp | 4 - src/subcommand/giraffe_main.cpp | 27 +++++- 3 files changed, 134 insertions(+), 58 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 6304650405..da0284e574 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -128,8 +128,6 @@ double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; std::unordered_set IndexingParameters::haplotype_reference_samples = {}; -std::string IndexingParameters::haplotype_fastq_1 = ""; -std::string IndexingParameters::haplotype_fastq_2 = ""; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -625,6 +623,10 @@ IndexRegistry 
VGIndexes::get_vg_index_registry() { registry.register_index("GBZ", "gbz"); registry.register_index("Giraffe GBZ", "giraffe.gbz"); registry.register_index("Unsampled Giraffe GBZ", "unsampled.gbz"); + registry.register_index("Unsampled Giraffe GBZ Distance Index", "unsampled.dist"); + registry.register_index("Unsampled Giraffe GBZ Top Level Chain Distance Index", "tcdist"); + registry.register_index("Unsampled Giraffe GBZ r index", "unsampled.ri"); + registry.register_index("Haplotype FASTQs", "hap.fastq"); registry.register_index("Haplotype Index", "hapl"); registry.register_index("KFF Kmer Counts", "kff"); @@ -4072,57 +4074,119 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Build "Haplotype Index" (.hapl) from an unsampled GBZ. - // The distance index is found at .dist or built from the GBZ. - // The r-index is constructed inline from the GBZ's GBWT. - // Using only "Unsampled Giraffe GBZ" as input avoids the cycle: - // Haplotype Index → Giraffe Distance Index → Giraffe GBZ → Haplotype Index - registry.register_recipe({"Haplotype Index"}, {"Unsampled Giraffe GBZ"}, + // Build the r-index for an unsampled GBZ from its GBWT. + registry.register_recipe({"Unsampled Giraffe GBZ r index"}, {"Unsampled Giraffe GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Generating haplotype information." << endl; + info(context) << "Building r-index for unsampled GBZ." << endl; } assert(inputs.size() == 1); - auto gbz_filenames = inputs[0]->get_filenames(); - assert(gbz_filenames.size() == 1); - auto gbz_filename = gbz_filenames.front(); + auto gbz_filename = inputs[0]->get_filenames().front(); assert(constructing.size() == 1); vector> all_outputs(constructing.size()); - auto hapl_output = *constructing.begin(); auto& output_names = all_outputs[0]; - // Load GBZ. 
gbwtgraph::GBZ gbz; load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - // Find the distance index by stripping known GBZ suffixes from the filename. - SnarlDistanceIndex distance_index; - string dist_candidate = gbz_filename; - for (const string& suffix : {".unsampled.gbz", ".giraffe.gbz", ".gbz"}) { - if (dist_candidate.size() > suffix.size() && - dist_candidate.substr(dist_candidate.size() - suffix.size()) == suffix) { - dist_candidate = dist_candidate.substr(0, dist_candidate.size() - suffix.size()) + ".dist"; - break; - } + gbwt::FastLocate r_index(gbz.index); + string output_name = plan->output_filepath(*constructing.begin()); + save_r_index(r_index, output_name, IndexingParameters::verbosity != IndexingParameters::None); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Top-level-chain distance index: alias from a full unsampled distance index + // (higher priority — no extra computation needed). + registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, + {"Unsampled Giraffe GBZ Distance Index"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + // Top-level-chain distance index: build cheaply from the GBZ with + // only_top_level_chain_distances=true (lower priority). + registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, + {"Unsampled Giraffe GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building top-level-chain distance index for unsampled GBZ." 
<< endl; } - if (ifstream(dist_candidate).is_open()) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Loading distance index from " << dist_candidate << "." << endl; - } - distance_index.deserialize(dist_candidate); - } else { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building distance index." << endl; - } - IntegratedSnarlFinder snarl_finder(gbz.graph); - fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder); + + assert(inputs.size() == 1); + auto gbz_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(*constructing.begin()); + SnarlDistanceIndex distance_index; + IntegratedSnarlFinder snarl_finder(gbz.graph); + fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder, + /*size_limit=*/50000, /*only_top_level_chain_distances=*/true); + distance_index.serialize(output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "Haplotype Index" (.hapl) from the unsampled GBZ, its top-level-chain + // distance index, and its r-index. + // Alphabetical input order: GBZ < Top Level Chain Distance Index < r index + registry.register_recipe({"Haplotype Index"}, + {"Unsampled Giraffe GBZ", + "Unsampled Giraffe GBZ Top Level Chain Distance Index", + "Unsampled Giraffe GBZ r index"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Generating haplotype information." 
<< endl; } + assert(inputs.size() == 3); + // inputs[0] = Unsampled Giraffe GBZ + // inputs[1] = Unsampled Giraffe GBZ Top Level Chain Distance Index + // inputs[2] = Unsampled Giraffe GBZ r index + auto gbz_filename = inputs[0]->get_filenames().front(); + auto dist_filename = inputs[1]->get_filenames().front(); + auto ri_filename = inputs[2]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + // Load GBZ. + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Load distance index. + SnarlDistanceIndex distance_index; + distance_index.deserialize(dist_filename); + + // Load r-index and restore the GBWT pointer. + gbwt::FastLocate r_index; + load_r_index(r_index, ri_filename, IndexingParameters::verbosity != IndexingParameters::None); + r_index.setGBWT(gbz.index); + // Build minimizer index without payload, using short-read parameters. MinimizerIndexParameters minimizer_params; minimizer_params.minimizers(IndexingParameters::short_read_minimizer_k, @@ -4131,10 +4195,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { HaplotypePartitioner::minimizer_index_type minimizer_index = build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); - // Build r-index inline from the GBZ's GBWT. - gbwt::FastLocate r_index(gbz.index); - r_index.setGBWT(gbz.index); - // Partition the haplotypes. Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None ? Haplotypes::verbosity_basic @@ -4144,16 +4204,16 @@ IndexRegistry VGIndexes::get_vg_index_registry() { Haplotypes haplotypes = partitioner.partition_haplotypes(partitioner_params); // Save the haplotype information. 
- string output_name = plan->output_filepath(hapl_output); + string output_name = plan->output_filepath(*constructing.begin()); haplotypes.serialize_to(output_name); output_names.push_back(output_name); return all_outputs; }); - // Build "KFF Kmer Counts" (.kff) by counting k-mers from reads using kmc. - // FASTQ paths are passed via IndexingParameters::haplotype_fastq_1/2. - registry.register_recipe({"KFF Kmer Counts"}, {}, + // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads + // using kmc. The reads are registered as "Haplotype FASTQs". + registry.register_recipe({"KFF Kmer Counts"}, {"Haplotype FASTQs"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4162,19 +4222,17 @@ IndexRegistry VGIndexes::get_vg_index_registry() { info(context) << "Counting k-mers from reads for haplotype sampling." << endl; } - const string& fastq1 = IndexingParameters::haplotype_fastq_1; - const string& fastq2 = IndexingParameters::haplotype_fastq_2; - if (fastq1.empty()) { - error(context) << "No reads provided for k-mer counting. " - << "Use -f to specify input reads." << endl; + assert(inputs.size() == 1); + auto fastq_filenames = inputs[0]->get_filenames(); + if (fastq_filenames.empty()) { + error(context) << "No reads provided for k-mer counting." << endl; } assert(constructing.size() == 1); vector> all_outputs(constructing.size()); - auto kff_output = *constructing.begin(); auto& output_names = all_outputs[0]; - string output_name = plan->output_filepath(kff_output); + string output_name = plan->output_filepath(*constructing.begin()); // Create a temp directory for kmc intermediate files. 
string tmp_dir = temp_file::create_directory(); @@ -4183,9 +4241,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string list_file = tmp_dir + "/reads.txt"; { ofstream list_out(list_file); - list_out << fastq1 << "\n"; - if (!fastq2.empty()) { - list_out << fastq2 << "\n"; + for (const string& fq : fastq_filenames) { + list_out << fq << "\n"; } } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index c5f7a03296..c11d283469 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -130,10 +130,6 @@ struct IndexingParameters { static Verbosity verbosity; // reference samples to override in GBZ during haplotype sampling [{}] static std::unordered_set haplotype_reference_samples; - // first FASTQ file to use for k-mer counting in haplotype sampling [""] - static std::string haplotype_fastq_1; - // second FASTQ file to use for k-mer counting in haplotype sampling [""] - static std::string haplotype_fastq_2; }; /** diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 7dc401fa38..7fb032bb83 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -1503,15 +1504,18 @@ int main_giraffe(int argc, char** argv) { // Pass parameters to the recipes through IndexingParameters. IndexingParameters::haplotype_reference_samples = reference_samples; - IndexingParameters::haplotype_fastq_1 = fastq_filename_1; - IndexingParameters::haplotype_fastq_2 = fastq_filename_2; // Provide inputs to the registry. The full-pangenome GBZ is typed // as "Unsampled Giraffe GBZ" so the sampling recipe can distinguish // it from the personalized "Giraffe GBZ" it will produce. + // Likewise, a provided "Giraffe Distance Index" is re-typed as + // "Unsampled Giraffe GBZ Distance Index" since it belongs to the + // unsampled graph. 
for (auto& index : provided_indexes) { if (index.first == "Giraffe GBZ") { registry.provide("Unsampled Giraffe GBZ", index.second); + } else if (index.first == "Giraffe Distance Index") { + registry.provide("Unsampled Giraffe GBZ Distance Index", index.second); } else { registry.provide(index.first, index.second); } @@ -1525,6 +1529,25 @@ int main_giraffe(int argc, char** argv) { registry.provide("KFF Kmer Counts", kff_name); } + // Provide FASTQs as a registry node so the KFF kmer counting recipe + // can discover them without going through IndexingParameters. + if (!fastq_filename_1.empty()) { + vector fastqs = {fastq_filename_1}; + if (!fastq_filename_2.empty()) { + fastqs.push_back(fastq_filename_2); + } + registry.provide("Haplotype FASTQs", fastqs); + } + + // Auto-provide the unsampled distance index from the original GBZ + // basename if a .dist file exists there and we don't already have one. + if (!registry.available("Unsampled Giraffe GBZ Distance Index")) { + string dist_candidate = original_basename + ".dist"; + if (ifstream(dist_candidate).good()) { + registry.provide("Unsampled Giraffe GBZ Distance Index", dist_candidate); + } + } + } else { // Otherwise use the provided indexes directly. 
for (auto& index : provided_indexes) { From c5d06dcb614bbf7dcca710b60f38794c1f578b01 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 17:08:44 -0500 Subject: [PATCH 03/22] Synthesize code to avoid shell --- README.md | 2 +- src/index_registry.cpp | 63 ++++++++++++++++++++++++++------- src/subcommand/giraffe_main.cpp | 13 ++++--- 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 592270d463..a3e1d5e4cd 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ On other distros, or if you do not have root access, you will need to perform th automake gettext autopoint libtool jq bsdmainutils bc rs parallel \ npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \ liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ - libzstd-dev pybind11-dev python3-pybind11 libssl-dev + libzstd-dev pybind11-dev python3-pybind11 libssl-dev kmc At present, you will need GCC version 9 or greater, with support for C++17, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. 
diff --git a/src/index_registry.cpp b/src/index_registry.cpp index da0284e574..e16f6d0b44 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #include #include @@ -626,7 +628,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("Unsampled Giraffe GBZ Distance Index", "unsampled.dist"); registry.register_index("Unsampled Giraffe GBZ Top Level Chain Distance Index", "tcdist"); registry.register_index("Unsampled Giraffe GBZ r index", "unsampled.ri"); - registry.register_index("Haplotype FASTQs", "hap.fastq"); + registry.register_index("Sample FASTQ", "sample.fastq"); registry.register_index("Haplotype Index", "hapl"); registry.register_index("KFF Kmer Counts", "kff"); @@ -4212,8 +4214,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads - // using kmc. The reads are registered as "Haplotype FASTQs". - registry.register_recipe({"KFF Kmer Counts"}, {"Haplotype FASTQs"}, + // using kmc. The reads are registered as "Sample FASTQ". + registry.register_recipe({"KFF Kmer Counts"}, {"Sample FASTQ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4256,15 +4258,52 @@ IndexRegistry VGIndexes::get_vg_index_registry() { mkdir(kmc_tmp_dir.c_str(), 0700); // Run kmc: count 29-mers in KFF format with canonical k-mers. - string cmd = "kmc -k29 -m128 -okff -hp @" + list_file - + " " + kmc_prefix + " " + kmc_tmp_dir; - if (IndexingParameters::verbosity == IndexingParameters::None) { - cmd += " > /dev/null 2>&1"; - } - int ret = system(cmd.c_str()); - if (ret != 0) { - error(context) << "kmc failed with exit code " << ret - << ". Make sure kmc is in your PATH." << endl; + // Use fork()+execvp() instead of system() to avoid shell quoting issues. 
+ string reads_arg = "@" + list_file; + { + // Flush all stdio streams before forking so the child doesn't + // inherit buffered data that would be flushed twice. + fflush(nullptr); + cout.flush(); + cerr.flush(); + + pid_t pid = fork(); + if (pid == -1) { + error(context) << "fork() failed for kmc: " << strerror(errno) << endl; + } else if (pid == 0) { + // Child: redirect stdout/stderr to /dev/null when not verbose. + if (IndexingParameters::verbosity == IndexingParameters::None) { + int devnull = open("/dev/null", O_WRONLY); + if (devnull != -1) { + dup2(devnull, STDOUT_FILENO); + dup2(devnull, STDERR_FILENO); + close(devnull); + } + } + char* kmc_argv[] = { + const_cast("kmc"), + const_cast("-k29"), + const_cast("-m128"), + const_cast("-okff"), + const_cast("-hp"), + const_cast(reads_arg.c_str()), + const_cast(kmc_prefix.c_str()), + const_cast(kmc_tmp_dir.c_str()), + nullptr + }; + execvp("kmc", kmc_argv); + // execvp only returns on failure. + _exit(127); + } else { + int child_stat; + waitpid(pid, &child_stat, 0); + int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; + if (ret == 127) { + error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." << endl; + } else if (ret != 0) { + error(context) << "kmc failed with exit code " << ret << "." 
<< endl; + } + } } output_names.push_back(output_name); diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 7fb032bb83..956733aa92 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -27,6 +27,7 @@ #include "../hts_alignment_emitter.hpp" #include "../minimizer_mapper.hpp" #include "../index_registry.hpp" +#include "../utility.hpp" #include "../watchdog.hpp" #include "../crash.hpp" #include @@ -1536,14 +1537,18 @@ int main_giraffe(int argc, char** argv) { if (!fastq_filename_2.empty()) { fastqs.push_back(fastq_filename_2); } - registry.provide("Haplotype FASTQs", fastqs); + registry.provide("Sample FASTQ", fastqs); } // Auto-provide the unsampled distance index from the original GBZ - // basename if a .dist file exists there and we don't already have one. + // basename if a dist file exists there and we don't already have one. if (!registry.available("Unsampled Giraffe GBZ Distance Index")) { - string dist_candidate = original_basename + ".dist"; - if (ifstream(dist_candidate).good()) { + // Prefer a top-level-chain-only .tcdist, then fall back to full .dist. 
+ string tcdist_candidate = original_basename + ".tcdist"; + string dist_candidate = original_basename + ".dist"; + if (file_exists(tcdist_candidate)) { + registry.provide("Unsampled Giraffe GBZ Top Level Chain Distance Index", tcdist_candidate); + } else if (file_exists(dist_candidate)) { registry.provide("Unsampled Giraffe GBZ Distance Index", dist_candidate); } } From 171c89347df6752aeb3c2e41673751b809323a2f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 17:17:43 -0500 Subject: [PATCH 04/22] Automatically split haplotype indexing kmer defaults from short read ones --- src/index_registry.cpp | 11 +++++++---- src/index_registry.hpp | 4 ++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index e16f6d0b44..9d17496e58 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -130,6 +130,8 @@ double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; std::unordered_set IndexingParameters::haplotype_reference_samples = {}; +int IndexingParameters::haplotype_minimizer_k = 29; +int IndexingParameters::haplotype_minimizer_w = 11; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -4191,8 +4193,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Build minimizer index without payload, using short-read parameters. 
MinimizerIndexParameters minimizer_params; - minimizer_params.minimizers(IndexingParameters::short_read_minimizer_k, - IndexingParameters::short_read_minimizer_w) + minimizer_params.minimizers(IndexingParameters::haplotype_minimizer_k, + IndexingParameters::haplotype_minimizer_w) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); HaplotypePartitioner::minimizer_index_type minimizer_index = build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); @@ -4257,9 +4259,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; mkdir(kmc_tmp_dir.c_str(), 0700); - // Run kmc: count 29-mers in KFF format with canonical k-mers. + // Run kmc: count k-mers in KFF format with canonical k-mers. // Use fork()+execvp() instead of system() to avoid shell quoting issues. string reads_arg = "@" + list_file; + string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_minimizer_k); { // Flush all stdio streams before forking so the child doesn't // inherit buffered data that would be flushed twice. 
@@ -4282,7 +4285,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } char* kmc_argv[] = { const_cast("kmc"), - const_cast("-k29"), + const_cast(kmc_k_arg.c_str()), const_cast("-m128"), const_cast("-okff"), const_cast("-hp"), diff --git a/src/index_registry.hpp b/src/index_registry.hpp index c11d283469..82ea89a2c1 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -130,6 +130,10 @@ struct IndexingParameters { static Verbosity verbosity; // reference samples to override in GBZ during haplotype sampling [{}] static std::unordered_set haplotype_reference_samples; + // length of k-mer used in haplotype index and kmer counting [29] + static int haplotype_minimizer_k; + // length of window used in haplotype index [11] + static int haplotype_minimizer_w; }; /** From 7c49f5ee00d6ac6f94b003f0c0ea3bf192f555a8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 17:34:23 -0500 Subject: [PATCH 05/22] Simplify casting around child process --- src/index_registry.cpp | 84 ++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 45 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 9d17496e58..2a3cc43893 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4251,9 +4251,11 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } // kmc produces output_prefix.kff; strip the ".kff" extension to get the prefix. 
- string kmc_prefix = output_name; - if (kmc_prefix.size() >= 4 && kmc_prefix.substr(kmc_prefix.size() - 4) == ".kff") { - kmc_prefix = kmc_prefix.substr(0, kmc_prefix.size() - 4); + string kmc_prefix; + if (output_name.size() >= 4 && output_name.substr(output_name.size() - 4) == ".kff") { + kmc_prefix = output_name.substr(0, output_name.size() - 4); + } else { + error(context) << "KFF file has wrong extension: " << output_name << endl; } string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; @@ -4263,50 +4265,42 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Use fork()+execvp() instead of system() to avoid shell quoting issues. string reads_arg = "@" + list_file; string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_minimizer_k); - { - // Flush all stdio streams before forking so the child doesn't - // inherit buffered data that would be flushed twice. - fflush(nullptr); - cout.flush(); - cerr.flush(); - - pid_t pid = fork(); - if (pid == -1) { - error(context) << "fork() failed for kmc: " << strerror(errno) << endl; - } else if (pid == 0) { - // Child: redirect stdout/stderr to /dev/null when not verbose. - if (IndexingParameters::verbosity == IndexingParameters::None) { - int devnull = open("/dev/null", O_WRONLY); - if (devnull != -1) { - dup2(devnull, STDOUT_FILENO); - dup2(devnull, STDERR_FILENO); - close(devnull); - } - } - char* kmc_argv[] = { - const_cast("kmc"), - const_cast(kmc_k_arg.c_str()), - const_cast("-m128"), - const_cast("-okff"), - const_cast("-hp"), - const_cast(reads_arg.c_str()), - const_cast(kmc_prefix.c_str()), - const_cast(kmc_tmp_dir.c_str()), - nullptr - }; - execvp("kmc", kmc_argv); - // execvp only returns on failure. - _exit(127); - } else { - int child_stat; - waitpid(pid, &child_stat, 0); - int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; - if (ret == 127) { - error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." 
<< endl; - } else if (ret != 0) { - error(context) << "kmc failed with exit code " << ret << "." << endl; + } } + // execvp promises never to modify its arguments but can't say + that in the C++ type system because it causes problems in the C + type system. See https://stackoverflow.com/a/19505361 + execvp("kmc", (char**)kmc_argv); + // execvp only returns on failure. + _exit(127); + } else { + int child_stat; + waitpid(pid, &child_stat, 0); + int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; + if (ret == 127) { + error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." << endl; + } else if (ret != 0) { + error(context) << "kmc failed with exit code " << ret << "." 
<< endl; + } } output_names.push_back(output_name); From d62309fb8865a7ac4a2f792b217a9378e156c5bc Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 6 Mar 2026 19:32:41 -0500 Subject: [PATCH 06/22] Rationalize index graph and try and fail to get sampled GBZ to use the documented name --- doc/test-docs.sh | 3 + src/index_registry.cpp | 510 ++++++++++++++---------------- src/subcommand/giraffe_main.cpp | 153 +++++---- test/haplotype-sampling/README.md | 26 +- test/t/54_vg_haplotypes.t | 14 +- 5 files changed, 346 insertions(+), 360 deletions(-) diff --git a/doc/test-docs.sh b/doc/test-docs.sh index 246e7c07aa..c9f0e6f9b0 100755 --- a/doc/test-docs.sh +++ b/doc/test-docs.sh @@ -16,6 +16,9 @@ cd "${HERE}/../test" echo txm --jobs 1 "${HERE}/../README.md" txm --jobs 1 "${HERE}/../README.md" +# Test extra test READMEs +(cd haplotype-sampling && txm --jobs 1 "README.md") + # Run all the wiki tests find "${HERE}/wiki" -name "*.md" | xargs -n 1 -t txm --jobs 1 diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 2a3cc43893..021567bdec 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -626,11 +626,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); registry.register_index("Giraffe GBZ", "giraffe.gbz"); - registry.register_index("Unsampled Giraffe GBZ", "unsampled.gbz"); - registry.register_index("Unsampled Giraffe GBZ Distance Index", "unsampled.dist"); - registry.register_index("Unsampled Giraffe GBZ Top Level Chain Distance Index", "tcdist"); - registry.register_index("Unsampled Giraffe GBZ r index", "unsampled.ri"); - registry.register_index("Sample FASTQ", "sample.fastq"); + registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); + registry.register_index("Top Level Chain Distance Index", "tcdist"); + registry.register_index("r Index", "ri"); + registry.register_index("FASTQ", "fastq"); registry.register_index("Haplotype Index", "hapl"); 
registry.register_index("KFF Kmer Counts", "kff"); @@ -3969,13 +3968,223 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); //////////////////////////////////// - // GBZ Recipes + // Giraffe GBZ Recipes //////////////////////////////////// - // Haplotype sampling: produce a personalized "Giraffe GBZ" by sampling - // haplotypes from a full-pangenome "Unsampled Giraffe GBZ" given - // haplotype information and kmer counts from reads. - registry.register_recipe({"Giraffe GBZ"}, {"Haplotype Index", "KFF Kmer Counts", "Unsampled Giraffe GBZ"}, + // Top priority: If we have the inputs to do haplotype sampling, use a + haplotype-sampled GBZ. + registry.register_recipe({"Giraffe GBZ"}, {"Haplotype-Sampled GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + // Next priority: if we were handed a GBZ, use it as the Giraffe GBZ + registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); + }); + + // After that, start trying the old GBWT-based workflows + registry.register_recipe({"Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Combining Giraffe GBWT and GBWTGraph into GBZ." 
<< endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[1]->get_filenames(); + auto gg_filenames = inputs[0]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(gg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto gg_filename = gg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + // These used to be a GBWTGraph recipe, but we don't want to produce GBWTGraphs anymore. + + registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "NamedNodeBackTranslation", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing GBZ using NamedNodeBackTranslation." 
<< endl; + } + + assert(inputs.size() == 3); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto translation_filenames = inputs[1]->get_filenames(); + auto xg_filenames = inputs[2]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(translation_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto translation_filename = translation_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + ifstream infile_translation; + init_in(infile_translation, translation_filename); + // There's only one implementation we can use here at the moment, so + // don't bother with the normal loader/saver system. + FlatFileBackTranslation translation(infile_translation); + + gbwt::GBWT index; + load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + gbwtgraph::GBZ gbz(std::move(index), *xg_index, &translation); + + // We need to compute pggname manually, because a generic HandleGraph + // does not contain the name of the parent graph. And we use nullptr, + // because we currently cannot determine the name from another source. + gbz.compute_pggname(nullptr); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "XG"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing GBZ." 
<< endl; + } + + assert(inputs.size() == 2); + auto gbwt_filenames = inputs[0]->get_filenames(); + auto xg_filenames = inputs[1]->get_filenames(); + assert(gbwt_filenames.size() == 1); + assert(xg_filenames.size() == 1); + auto gbwt_filename = gbwt_filenames.front(); + auto xg_filename = xg_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + ifstream infile_xg; + init_in(infile_xg, xg_filename); + auto xg_index = vg::io::VPKG::load_one(infile_xg); + + gbwt::GBWT index; + load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + gbwtgraph::GBZ gbz(std::move(index), *xg_index, algorithms::find_translation(xg_index.get())); + + // We need to compute pggname manually, because a generic HandleGraph + // does not contain the name of the parent graph. And we use nullptr, + // because we currently cannot determine the name from another source. + gbz.compute_pggname(nullptr); + + string output_name = plan->output_filepath(gbz_output); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // General GBZ Recipes + //////////////////////////////////// + + registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing a GBZ from GFA input." << endl; + } + + assert(inputs.size() == 1); + if (inputs[0]->get_filenames().size() != 1) { + error(context) << "Graph construction does not support multiple GFAs at this time." 
<< endl; + } + auto gfa_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(gbz_output); + + gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); + // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, + // but it would be kind of a pain to update it like we do the global param + params.batch_size = IndexingParameters::gbwt_insert_batch_size; + params.sample_interval = IndexingParameters::gbwt_sampling_interval; + params.max_node_length = IndexingParameters::max_node_size; + // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, + // we could safely launch more jobs. Using 600 as the divisor would be a better + // estimate of the number of segments, but we chop long segments to 32 bp nodes. 
+ params.parallel_jobs = guess_parallel_gbwt_jobs( + std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), + 100, + plan->target_memory_usage(), + params.batch_size + ); + params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + info(context) << "Running " << params.parallel_jobs << " jobs in parallel" << std::endl; + } + + // jointly generate the GBWT and record sequences + unique_ptr gbwt_index; + unique_ptr seq_source; + tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); + + // convert sequences into GBZ and save + gbwtgraph::GBZ gbz(gbwt_index, seq_source); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // Haplotype Sampling Recipes + //////////////////////////////////// + + // Haplotype sampling: produce a personalized "Haplotype-Sampled GBZ" + registry.register_recipe({"Haplotype-Sampled GBZ"}, {"GBZ", "Haplotype Index", "KFF Kmer Counts"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -3985,9 +4194,9 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } assert(inputs.size() == 3); - auto hapl_filenames = inputs[0]->get_filenames(); - auto kff_filenames = inputs[1]->get_filenames(); - auto gbz_filenames = inputs[2]->get_filenames(); + auto gbz_filenames = inputs[0]->get_filenames(); + auto hapl_filenames = inputs[1]->get_filenames(); + auto kff_filenames = inputs[2]->get_filenames(); assert(gbz_filenames.size() == 1); assert(hapl_filenames.size() == 1); assert(kff_filenames.size() == 1); @@ -4044,48 +4253,14 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Allow "Unsampled Giraffe GBZ" to be built from GBWT + GBWTGraph, - // so users who provide those two files can still do haplotype sampling. 
- registry.register_recipe({"Unsampled Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, + // Build the r-index for a GBZ from its GBWT. + registry.register_recipe({"r Index"}, {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Combining Giraffe GBWT and GBWTGraph into unsampled GBZ." << endl; - } - - assert(inputs.size() == 2); - auto gg_filenames = inputs[0]->get_filenames(); - auto gbwt_filenames = inputs[1]->get_filenames(); - assert(gg_filenames.size() == 1); - assert(gbwt_filenames.size() == 1); - auto gg_filename = gg_filenames.front(); - auto gbwt_filename = gbwt_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - gbwtgraph::GBZ gbz; - load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - // Build the r-index for an unsampled GBZ from its GBWT. - registry.register_recipe({"Unsampled Giraffe GBZ r index"}, {"Unsampled Giraffe GBZ"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building r-index for unsampled GBZ." << endl; + info(context) << "Building r-index for GBZ." << endl; } assert(inputs.size() == 1); @@ -4106,28 +4281,14 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Top-level-chain distance index: alias from a full unsampled distance index - // (higher priority — no extra computation needed). 
- registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, - {"Unsampled Giraffe GBZ Distance Index"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - alias_graph.register_alias(*constructing.begin(), inputs[0]); - return vector>(1, inputs.front()->get_filenames()); - }); - - // Top-level-chain distance index: build cheaply from the GBZ with - // only_top_level_chain_distances=true (lower priority). - registry.register_recipe({"Unsampled Giraffe GBZ Top Level Chain Distance Index"}, - {"Unsampled Giraffe GBZ"}, + registry.register_recipe({"Top Level Chain Distance Index"}, + {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building top-level-chain distance index for unsampled GBZ." << endl; + info(context) << "Building top-level-chain distance index for GBZ." << endl; } assert(inputs.size() == 1); @@ -4143,21 +4304,21 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string output_name = plan->output_filepath(*constructing.begin()); SnarlDistanceIndex distance_index; IntegratedSnarlFinder snarl_finder(gbz.graph); + // Only do top level chains fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder, - /*size_limit=*/50000, /*only_top_level_chain_distances=*/true); + 50000, true); distance_index.serialize(output_name); output_names.push_back(output_name); return all_outputs; }); - // Build "Haplotype Index" (.hapl) from the unsampled GBZ, its top-level-chain + // Build "Haplotype Index" (.hapl) from the GBZ, its top-level-chain // distance index, and its r-index. 
- // Alphabetical input order: GBZ < Top Level Chain Distance Index < r index registry.register_recipe({"Haplotype Index"}, - {"Unsampled Giraffe GBZ", - "Unsampled Giraffe GBZ Top Level Chain Distance Index", - "Unsampled Giraffe GBZ r index"}, + {"GBZ", + "Top Level Chain Distance Index", + "r Index"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4167,9 +4328,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { } assert(inputs.size() == 3); - // inputs[0] = Unsampled Giraffe GBZ - // inputs[1] = Unsampled Giraffe GBZ Top Level Chain Distance Index - // inputs[2] = Unsampled Giraffe GBZ r index auto gbz_filename = inputs[0]->get_filenames().front(); auto dist_filename = inputs[1]->get_filenames().front(); auto ri_filename = inputs[2]->get_filenames().front(); @@ -4191,7 +4349,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { load_r_index(r_index, ri_filename, IndexingParameters::verbosity != IndexingParameters::None); r_index.setGBWT(gbz.index); - // Build minimizer index without payload, using short-read parameters. + // Build minimizer index without payload. MinimizerIndexParameters minimizer_params; minimizer_params.minimizers(IndexingParameters::haplotype_minimizer_k, IndexingParameters::haplotype_minimizer_w) @@ -4216,8 +4374,12 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads - // using kmc. The reads are registered as "Sample FASTQ". - registry.register_recipe({"KFF Kmer Counts"}, {"Sample FASTQ"}, + // using kmc. + // + // Having the FASTQ available to the indexing logic is the limiting reagent + // that should determine if we do haplotype sampling or not, when given a + // GBZ. 
+ registry.register_recipe({"KFF Kmer Counts"}, {"FASTQ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -4307,196 +4469,6 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - alias_graph.register_alias(*constructing.begin(), inputs[0]); - return vector>(1, inputs.front()->get_filenames()); - }); - - registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing a GBZ from GFA input." << endl; - } - - assert(inputs.size() == 1); - if (inputs[0]->get_filenames().size() != 1) { - error(context) << "Graph construction does not support multiple GFAs at this time." << endl; - } - auto gfa_filename = inputs[0]->get_filenames().front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - string output_name = plan->output_filepath(gbz_output); - - gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); - // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, - // but it would be kind of a pain to update it like we do the global param - params.batch_size = IndexingParameters::gbwt_insert_batch_size; - params.sample_interval = IndexingParameters::gbwt_sampling_interval; - params.max_node_length = IndexingParameters::max_node_size; - // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, - // we could safely launch more jobs. 
Using 600 as the divisor would be a better - // estimate of the number of segments, but we chop long segments to 32 bp nodes. - params.parallel_jobs = guess_parallel_gbwt_jobs( - std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), - 100, - plan->target_memory_usage(), - params.batch_size - ); - params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; - if (IndexingParameters::verbosity >= IndexingParameters::Debug) { - info(context) << "Running " << params.parallel_jobs << " jobs in parallel" << std::endl; - } - - // jointly generate the GBWT and record sequences - unique_ptr gbwt_index; - unique_ptr seq_source; - tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); - - // convert sequences into GBZ and save - gbwtgraph::GBZ gbz(gbwt_index, seq_source); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - registry.register_recipe({"Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Combining Giraffe GBWT and GBWTGraph into GBZ." 
<< endl; - } - - assert(inputs.size() == 2); - auto gbwt_filenames = inputs[1]->get_filenames(); - auto gg_filenames = inputs[0]->get_filenames(); - assert(gbwt_filenames.size() == 1); - assert(gg_filenames.size() == 1); - auto gbwt_filename = gbwt_filenames.front(); - auto gg_filename = gg_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - gbwtgraph::GBZ gbz; - load_gbz(gbz, gbwt_filename, gg_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - // Thses used to be a GBWTGraph recipe, but we don't want to produce GBWTGraphs anymore. - - registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "NamedNodeBackTranslation", "XG"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing GBZ using NamedNodeBackTranslation." 
<< endl; - } - - assert(inputs.size() == 3); - auto gbwt_filenames = inputs[0]->get_filenames(); - auto translation_filenames = inputs[1]->get_filenames(); - auto xg_filenames = inputs[2]->get_filenames(); - assert(gbwt_filenames.size() == 1); - assert(translation_filenames.size() == 1); - assert(xg_filenames.size() == 1); - auto gbwt_filename = gbwt_filenames.front(); - auto translation_filename = translation_filenames.front(); - auto xg_filename = xg_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - ifstream infile_xg; - init_in(infile_xg, xg_filename); - auto xg_index = vg::io::VPKG::load_one(infile_xg); - - ifstream infile_translation; - init_in(infile_translation, translation_filename); - // There's only one implementation we can use here at the moment, so - // don't bother with the normal loader/saver system. - FlatFileBackTranslation translation(infile_translation); - - gbwt::GBWT index; - load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - gbwtgraph::GBZ gbz(std::move(index), *xg_index, &translation); - - // We need to compute pggname manually, because a generic HandleGraph - // does not contain the name of the parent graph. And we use nullptr, - // because we currently cannot determine the name from another souce. - gbz.compute_pggname(nullptr); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); - - registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "XG"}, - [](const vector& inputs, - const IndexingPlan* plan, - AliasGraph& alias_graph, - const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing GBZ." 
<< endl; - } - - assert(inputs.size() == 2); - auto gbwt_filenames = inputs[0]->get_filenames(); - auto xg_filenames = inputs[1]->get_filenames(); - assert(gbwt_filenames.size() == 1); - assert(xg_filenames.size() == 1); - auto gbwt_filename = gbwt_filenames.front(); - auto xg_filename = xg_filenames.front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - ifstream infile_xg; - init_in(infile_xg, xg_filename); - auto xg_index = vg::io::VPKG::load_one(infile_xg); - - gbwt::GBWT index; - load_gbwt(index, gbwt_filename, IndexingParameters::verbosity == IndexingParameters::Debug); - gbwtgraph::GBZ gbz(std::move(index), *xg_index, algorithms::find_translation(xg_index.get())); - - // We need to compute pggname manually, because a generic HandleGraph - // does not contain the name of the parent graph. And we use nullptr, - // because we currently cannot determine the name from another souce. - gbz.compute_pggname(nullptr); - - string output_name = plan->output_filepath(gbz_output); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; - }); //////////////////////////////////// // Minimizers Recipes diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 956733aa92..079947185d 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -764,8 +764,10 @@ int main_giraffe(int argc, char** argv) { IndexRegistry registry = VGIndexes::get_vg_index_registry(); // Indexes provided to IndexRegistry in the arguments. We do not apply them - // immediately, because we may want to do haplotype sampling. - vector> provided_indexes; + // immediately, because we may want to do haplotype sampling. We need them + // indexed for lookup. We only support one file per index (FASTQs get + // handled separately). 
+ std::unordered_map provided_indexes; string index_basename, index_basename_override; // For haplotype sampling. @@ -1134,7 +1136,7 @@ int main_giraffe(int argc, char** argv) { case 'Z': // All provided_indexes are later validated // with require_exists() - provided_indexes.emplace_back("Giraffe GBZ", optarg); + provided_indexes.emplace("GBZ", optarg); // If we have a GBZ we probably want to use its name as the base name. // But see -g. @@ -1143,7 +1145,7 @@ int main_giraffe(int argc, char** argv) { break; case 'x': - provided_indexes.emplace_back("XG", optarg); + provided_indexes.emplace("XG", optarg); // If we have an XG we probably want to use its name as the base name. // But see -g. @@ -1152,7 +1154,7 @@ int main_giraffe(int argc, char** argv) { break; case 'g': - provided_indexes.emplace_back("GBWTGraph", optarg); + provided_indexes.emplace("GBWTGraph", optarg); // But if we have a GBWTGraph we probably want to use *its* name as the base name. // Whichever is specified last will win, unless we also have a FASTA input name. 
@@ -1161,22 +1163,22 @@ int main_giraffe(int argc, char** argv) { break; case 'H': - provided_indexes.emplace_back("Giraffe GBWT", optarg); + provided_indexes.emplace("Giraffe GBWT", optarg); break; case 'm': - provided_indexes.emplace_back("Long Read Minimizers", optarg); - provided_indexes.emplace_back("Short Read Minimizers", optarg); - provided_indexes.emplace_back("Long Read PathMinimizers", optarg); + provided_indexes.emplace("Long Read Minimizers", optarg); + provided_indexes.emplace("Short Read Minimizers", optarg); + provided_indexes.emplace("Long Read PathMinimizers", optarg); break; case 'z': - provided_indexes.emplace_back("Long Read Zipcodes", optarg); - provided_indexes.emplace_back("Short Read Zipcodes", optarg); - provided_indexes.emplace_back("Long Read PathZipcodes", optarg); + provided_indexes.emplace("Long Read Zipcodes", optarg); + provided_indexes.emplace("Short Read Zipcodes", optarg); + provided_indexes.emplace("Long Read PathZipcodes", optarg); break; case 'd': - provided_indexes.emplace_back("Giraffe Distance Index", optarg); + provided_indexes.emplace("Giraffe Distance Index", optarg); break; case 'p': @@ -1187,10 +1189,10 @@ int main_giraffe(int argc, char** argv) { haplotype_sampling_flag = true; break; case OPT_HAPLOTYPE_NAME: - haplotype_name = optarg; + provided_indexes.emplace("Haplotype Index", optarg); break; case OPT_KFF_NAME: - kff_name = optarg; + provided_indexes.emplace("KFF Kmer Counts", optarg); break; case OPT_INDEX_BASENAME: index_basename_override = optarg; @@ -1366,7 +1368,7 @@ int main_giraffe(int argc, char** argv) { << " is not named like a FASTA" << std::endl; } - provided_indexes.emplace_back("Reference FASTA", fasta_filename); + provided_indexes.emplace("Reference FASTA", fasta_filename); // Everything else should be named like the FASTA by default index_basename = fasta_parts.first; @@ -1389,7 +1391,7 @@ int main_giraffe(int argc, char** argv) { string file_type = IndexRegistry::vcf_is_phased(vcf_filename) ? 
"VCF w/ Phasing" : "VCF"; // Feed it to the index registry to maybe use - provided_indexes.emplace_back(file_type, vcf_filename); + provided_indexes.emplace(file_type, vcf_filename); } } @@ -1402,14 +1404,8 @@ int main_giraffe(int argc, char** argv) { // Now all the arguments are parsed, so see if they make sense - for (const auto& filename : provided_indexes) { - require_exists(logger, filename.second); - } - if (!haplotype_name.empty()) { - require_exists(logger, haplotype_name); - } - if (!kff_name.empty()) { - require_exists(logger, kff_name); + for (const auto& type_and_filename : provided_indexes) { + require_exists(logger, type_and_filename.second); } // Decide if we are outputting to an htslib format @@ -1465,9 +1461,15 @@ int main_giraffe(int argc, char** argv) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." << endl; } + + // Use all the provided indexes + for (auto& index : provided_indexes) { + registry.provide(index.first, index.second); + } + // Haplotype sampling is active when explicitly requested (--haplotype-sampling) // or when either --haplotype-name or --kff-name is given. - bool haplotype_sampling = haplotype_sampling_flag || !haplotype_name.empty() || !kff_name.empty(); + bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || provided_indexes.count("KFF Kmer Counts"); // Save the GBZ-derived basename before any override, so we can find // dist and other indexes co-located with the source GBZ during sampling. string original_basename = index_basename; @@ -1488,7 +1490,8 @@ int main_giraffe(int argc, char** argv) { << " that sample name is " << sample << std::endl; } } else if (!fastq_filename_1.empty()) { - // Strip .gz then use file_base_name to strip .fq/.fastq + // Strip .gz then use file_base_name to strip .fq/.fastq, since + // strip_suffixes stops at the first one that's not there. 
sample = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); if (show_progress) { logger.info() << "Guessing from " << fastq_filename_1 @@ -1506,59 +1509,50 @@ int main_giraffe(int argc, char** argv) { // Pass parameters to the recipes through IndexingParameters. IndexingParameters::haplotype_reference_samples = reference_samples; - // Provide inputs to the registry. The full-pangenome GBZ is typed - // as "Unsampled Giraffe GBZ" so the sampling recipe can distinguish - // it from the personalized "Giraffe GBZ" it will produce. - // Likewise, a provided "Giraffe Distance Index" is re-typed as - // "Unsampled Giraffe GBZ Distance Index" since it belongs to the - // unsampled graph. - for (auto& index : provided_indexes) { - if (index.first == "Giraffe GBZ") { - registry.provide("Unsampled Giraffe GBZ", index.second); - } else if (index.first == "Giraffe Distance Index") { - registry.provide("Unsampled Giraffe GBZ Distance Index", index.second); - } else { - registry.provide(index.first, index.second); - } - } - - // Provide haplotype index and kff only if explicitly given. - if (!haplotype_name.empty()) { - registry.provide("Haplotype Index", haplotype_name); - } - if (!kff_name.empty()) { - registry.provide("KFF Kmer Counts", kff_name); - } - - // Provide FASTQs as a registry node so the KFF kmer counting recipe - // can discover them without going through IndexingParameters. if (!fastq_filename_1.empty()) { + // If FASTQs are available, send those and we will do haplotype sampling. 
vector fastqs = {fastq_filename_1}; if (!fastq_filename_2.empty()) { fastqs.push_back(fastq_filename_2); } - registry.provide("Sample FASTQ", fastqs); + registry.provide("FASTQ", fastqs); + } else if (!provided_indexes.count("KFF Kmer Counts")) { + // TODO: Eventually learn to kmer count from GAM + logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; } - - // Auto-provide the unsampled distance index from the original GBZ - // basename if a dist file exists there and we don't already have one. - if (!registry.available("Unsampled Giraffe GBZ Distance Index")) { - // Prefer a top-level-chain-only .tcdist, then fall back to full .dist. + // Otherwise, we know we're sending kmer counts and will still do haplotype sampling + + if (!registry.available("Top Level Chain Distance Index")) { + // Look for base GBZ distance index string tcdist_candidate = original_basename + ".tcdist"; - string dist_candidate = original_basename + ".dist"; + string dist_candidate = original_basename + ".dist"; if (file_exists(tcdist_candidate)) { - registry.provide("Unsampled Giraffe GBZ Top Level Chain Distance Index", tcdist_candidate); + registry.provide("Top Level Chain Distance Index", tcdist_candidate); } else if (file_exists(dist_candidate)) { - registry.provide("Unsampled Giraffe GBZ Distance Index", dist_candidate); + // Provide it as a top level chain index even though it has more stuff + registry.provide("Top Level Chain Distance Index", dist_candidate); + } + } + + if (!registry.available("Haplotype Index")) { + // Look for hapl index + string hapl_candidate = original_basename + ".hapl"; + if (file_exists(hapl_candidate)) { + registry.provide("Haplotype Index", hapl_candidate); } } - } else { - // Otherwise use the provided indexes directly. 
- for (auto& index : provided_indexes) { - registry.provide(index.first, index.second); + if (!registry.available("r Index")) { + // Look for r index + string r_candidate = original_basename + ".ri"; + if (file_exists(r_candidate)) { + registry.provide("r Index", r_candidate); + } } + } + + // Use the basename we've determined for generated indexes registry.set_prefix(index_basename); // The IndexRegistry doesn't try to infer index files based on the @@ -1595,18 +1589,13 @@ int main_giraffe(int argc, char** argv) { for (auto& extension : index_and_extensions->second) { // For each extension in priority order string inferred_filename = registry.get_prefix() + "." + extension; - if (ifstream(inferred_filename).is_open()) { + if (file_exists(inferred_filename)) { // A file with the appropriate name exists and we can read it. - if (haplotype_sampling) { - // If we did haplotype sampling, we are going to overwrite existing indexes. - logger.warn() << inferred_filename << " exists and will be overwritten" << endl; - } else { - // Report it because this may not be desired behavior. - logger.info() << "Guessing that " << inferred_filename - << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename); - found = true; - } + // Report it because this may not be desired behavior. + logger.info() << "Guessing that " << inferred_filename + << " is " << index_and_extensions->first << endl; + registry.provide(index_and_extensions->first, inferred_filename); + found = true; // Skip other extension options for the index break; } @@ -1620,6 +1609,16 @@ int main_giraffe(int argc, char** argv) { } } + if (haplotype_sampling && !registry.available("Giraffe GBZ") && !registry.available("GBZ")) { + // When doing haplotype sampling, we need to name our sampled GBZ just + // .gbz on top of the basename we put the sample name in, because the + // tests require it. 
This means we can't build a base .gbz and a + // sampled .gbz in the same indexing run, or they will fight over the + // name. So make sure we don't try. + logger.error() << "Cannot do haplotype sampling without a GBZ available" << std::endl; + } + // TODO: Actually force the index registry to use the name we want for the sampled GBZ even though it's not the one it would generate. + //If we're making new zipcodes, we should rebuild the minimizers too if (!(indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index 89e0d3e6b6..406e9e9b82 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -1,3 +1,4 @@ + # Haplotype sampling example This directory contains a small test case for haplotype sampling using the `vg haplotypes` subcommand. @@ -17,6 +18,7 @@ File `HG003.kff` contains 29-mer counts for the reads computed using KMC. 
First we need to build the indexes used by Giraffe: + ``` vg autoindex --workflow giraffe --prefix micb-kir3dl1 -g micb-kir3dl1.gfa ``` @@ -25,6 +27,7 @@ This converts the graph into GBZ format (`micb-kir3dl1.giraffe.gbz`) and builds Then we can map the reads, specifying the GBZ file, the fastq file, and the output file: + ``` vg giraffe -Z micb-kir3dl1.giraffe.gbz -f HG003.fq.gz -p > to-original.gam ``` @@ -35,6 +38,7 @@ However, because the graph contains two sets of reference paths (GRCh38 and CHM1 We can use the `vg stats` subcommand to compute statistics on the aligned reads: + ``` vg stats -a to-original.gam micb-kir3dl1.giraffe.gbz ``` @@ -46,6 +50,7 @@ vg stats -a to-original.gam micb-kir3dl1.giraffe.gbz In order to build the haplotype information used for haplotype sampling, we need a GBZ graph, a distance index, and an r-index. We build them using the following commands: + ``` vg gbwt -G micb-kir3dl1.gfa --max-node 32 --gbz-format -g micb-kir3dl1.gbz -r micb-kir3dl1.ri vg index -j micb-kir3dl1.dist micb-kir3dl1.gbz @@ -56,6 +61,7 @@ This makes the resulting GBZ graph (and hence the distance index) indentical to Now we can build the haplotype information file `micb-kir3dl1.hapl`: + ``` vg haplotypes -v 2 --subchain-length 300 -H micb-kir3dl1.hapl micb-kir3dl1.gbz ``` @@ -67,6 +73,7 @@ The latter makes the example more interesting than the default 10000 bp. We can now sample the haplotypes using the haplotype information, the k-mer counts, and the original graph: + ``` vg haplotypes -v 2 -i micb-kir3dl1.hapl -k HG003.kff \ --include-reference --diploid-sampling \ @@ -79,9 +86,10 @@ Now we have the sampled graph `HG003.gbz`. We could build the indexes for Giraffe manually. 
But because the graph is a temporary object intended only for mapping a specific set of reads, we can let Giraffe handle index construction: + ``` # Set the temporary directory as you wish -export TMPDIR=/scratch/tmp +# export TMPDIR=/scratch/tmp vg giraffe -Z HG003.gbz -f HG003.fq.gz -p --index-basename ${TMPDIR}/HG003 > to-sampled.gam ``` @@ -90,40 +98,44 @@ Here we specify that the indexes go to the temporary directory with base name `H We can again use `vg stats` to compute statistics: + ``` vg stats -a to-sampled.gam HG003.gbz ``` ### Giraffe integration -Instead of sampling the haplotypes manually, we can let Giraffe handle everything: +Instead of sampling the haplotypes manually, we can let Giraffe handle the sampling: + ``` # Set the temporary directory as you wish -export TMPDIR=/scratch/tmp +# export TMPDIR=/scratch/tmp vg giraffe -Z micb-kir3dl1.gbz --haplotype-name micb-kir3dl1.hapl --kff-name HG003.kff \ - -f HG003.fq.gz -p --index-basename ${TMPDIR}/integration > giraffe-integration.gam + -f HG003.fq.gz -p --index-basename integration > giraffe-integration.gam ``` This uses the best practices for haplotype sampling, which currently means including the reference paths and using diploid sampling. We put the sampled graph and the automatically built indexes into the temporary directory. -The naming scheme (e.g. `${TMPDIR}/integration.HG003.gbz`) consists of three parts: +The naming scheme (e.g. `integration.HG003.gbz`) consists of three parts: -* Base name `${TMPDIR}/integration` specified with option `--index-basename`. +* Base name `integration` specified with option `--index-basename`. * Sample name `HG003` guessed from the name of the KFF file or specified with option `--sample`. * Extension (e.g. `gbz`) that depends on the file. 
The statistics should be similar to those in the previous section: + ``` -vg stats -a giraffe-integration.gam ${TMPDIR}/integration.HG003.gbz +vg stats -a giraffe-integration.gam integration.HG003.gbz ``` ## Mapping reads to a linear reference Given a GBZ file, we can extract the paths corresponding to a given sample in fasta format: + ``` vg paths -x micb-kir3dl1.gbz --extract-fasta --sample GRCh38 > reference.fa ``` diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index eb82e1da08..421d61f89e 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -73,7 +73,7 @@ is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null is $? 0 "Giraffe integration with a guessed output name" -cmp diploid.gbz full.HG003.giraffe.gbz +cmp diploid.gbz full.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name @@ -81,7 +81,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename sampled -N 003HG \ -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null is $? 0 "Giraffe integration with a specified output name" -cmp full.HG003.giraffe.gbz sampled.003HG.giraffe.gbz +cmp full.HG003.gbz sampled.003HG.gbz is $? 0 "the sampled graphs are identical" # Giraffe integration, specified reference sample @@ -89,7 +89,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null is $? 0 "Giraffe integration with a specified reference sample" -cmp diploid3.gbz GRCh38.HG003.giraffe.gbz +cmp diploid3.gbz GRCh38.HG003.gbz is $? 
0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, haplotype index built automatically from the GBZ. @@ -99,7 +99,7 @@ vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ --index-basename auto_hapl -N HG003 \ -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam 2> /dev/null is $? 0 "Giraffe builds haplotype index automatically" -is $(vg gbwt -H -Z auto_hapl.HG003.giraffe.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" +is $(vg gbwt -H -Z auto_hapl.HG003.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" # Giraffe integration, kmer counting done automatically from reads. # Providing --haplotype-name without --kff-name implies haplotype sampling; @@ -108,7 +108,7 @@ vg giraffe -Z full.gbz --haplotype-name full.hapl \ --index-basename auto_kff -N HG003 \ -f haplotype-sampling/HG003.fq.gz > auto_kff.gam 2> /dev/null is $? 0 "Giraffe counts kmers from reads automatically" -is $(vg gbwt -H -Z auto_kff.HG003.giraffe.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" +is $(vg gbwt -H -Z auto_kff.HG003.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" # Giraffe integration, fully automatic: both hapl and kff are built by the # IndexRegistry. Triggered by --haplotype-sampling without either input file. @@ -116,7 +116,7 @@ vg giraffe -Z full.gbz --haplotype-sampling \ --index-basename auto_all -N HG003 \ -f haplotype-sampling/HG003.fq.gz > auto_all.gam 2> /dev/null is $? 
0 "Giraffe does fully automatic haplotype sampling" -is $(vg gbwt -H -Z auto_all.HG003.giraffe.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" +is $(vg gbwt -H -Z auto_all.HG003.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt @@ -133,4 +133,4 @@ rm -f GRCh38.HG003.* HG003_GRCh38.gam rm -f auto_hapl.HG003.* auto_hapl.gam rm -f auto_kff.HG003.* auto_kff.gam rm -f auto_all.HG003.* auto_all.gam -rm -f log.txt \ No newline at end of file +rm -f log.txt From f54a4a6041e90bf6673ac049a854cda80fbe1487 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 9 Mar 2026 18:02:11 -0400 Subject: [PATCH 07/22] Get partway into a scope applying system that is bad --- src/index_registry.cpp | 65 ++++++++++++++++++++++++++----- src/index_registry.hpp | 17 +++++--- src/subcommand/autoindex_main.cpp | 2 +- 3 files changed, 68 insertions(+), 16 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 021567bdec..276433f310 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4598,23 +4598,30 @@ string IndexingPlan::output_filepath(const IndexName& identifier) const { string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const { - string filepath; + std::stringstream filepath; if (registry->keep_intermediates || (!is_intermediate(identifier) && !registry->get_index(identifier)->was_provided_directly())) { // we're saving this file, put it at the output prefix - filepath = registry->output_prefix; + filepath << registry->output_prefix; } else { // we're not saving this file, make it temporary - filepath = registry->get_work_dir() + "/" + sha1sum(identifier); + filepath << registry->get_work_dir() + "/" + sha1sum(identifier); + } + auto found = scopes.find(identifier); + if (found != scopes.end()) 
{ + for (auto& scope : found->second) { + // This index belongs to just a sample or something. + filepath << "." << scope; + } } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes // that start with digits) - filepath += "." + to_string(chunk); + filepath << "." << to_string(chunk); } - filepath += "." + registry->get_index(identifier)->get_suffix(); - return filepath; + filepath << "." << registry->get_index(identifier)->get_suffix(); + return filepath.str(); } const vector& IndexingPlan::get_steps() const { @@ -4842,11 +4849,11 @@ void IndexRegistry::register_index(const IndexName& identifier, const string& su } -void IndexRegistry::provide(const IndexName& identifier, const string& filename) { - provide(identifier, vector(1, filename)); +void IndexRegistry::provide(const IndexName& identifier, const string& filename, const std::string& scope) { + provide(identifier, vector(1, filename), scope); } -void IndexRegistry::provide(const IndexName& identifier, const vector& filenames) { +void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const std::string& scope) { if (IndexingParameters::verbosity >= IndexingParameters::Debug) { info(context) << "Provided: " << identifier << endl; } @@ -5737,11 +5744,49 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } #endif + // Figure out if any of the input files apply a sample scope to descendant files in the plan. + // This holds sets of later recipes and the scopes to give them when we get there. + // TODO: This is complicated and the queueing is not needed; we can just apply the scopes when we find the descendants. 
+    std::vector<std::pair<std::set<IndexGroup>, std::vector<std::string>>> scope_queue;
+    for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) {
+        // For each step in the plan in order
+        
+        for (auto& names_and_scopes : scope_queue) {
+            if (names_and_scopes.first.count(plan_it->first)) {
+                // We have scopes from a previous step to apply to the indexes
+                // generated by this step.
+                //
+                // TODO: We assume a single scope only ever comes from one
+                // input file, and we don't enforce a particular scope order
+                // other than rule order.
+                for (const IndexName& index_name : plan_it->first) {
+                    // Append all the new scopes to any existing scopes on this index.
+                    std::copy(names_and_scopes.second.begin(), names_and_scopes.second.end(), std::back_inserter(scopes[index_name]));
+                }
+            }
+        }
+        
+        for (const IndexName& index_name : plan_it->first) {
+            // For each index that step would generate
+            const IndexFile* index = get_index(index_name);
+            if (!index->was_provided_directly()) {
+                // Only directly provided indexes introduce new scopes
+                continue;
+            }
+            auto& provided_scopes = index->provided_scopes();
+            if (!provided_scopes.empty()) {
+                // We need to attach these scopes to anything based on this index.
+                std::set<RecipeName> dependent_recipes = dependents(index_name);
+            }
+        }
+    }
+    
+    // Now remove the input data from the plan
     plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) {
         return all_finished(recipe_choice.first);
     }) - plan.steps.begin());
-    
+
     // The plan has methods that can come back and modify the registry.
     // We're not going to call any of them, but we have to hand off a non-const
     // pointer to ourselves so the plan can modify us later.
diff --git a/src/index_registry.hpp b/src/index_registry.hpp
index 82ea89a2c1..b1f4afa991 100644
--- a/src/index_registry.hpp
+++ b/src/index_registry.hpp
@@ -185,7 +185,7 @@ class IndexingPlan {
     // TODO: is this where this function wants to live?
/// The memory limit, with a little slosh for prediction inaccuracy int64_t target_memory_usage() const; - /// The mmeory limit with no slosh + /// The memory limit with no slosh int64_t literal_target_memory_usage() const; /// Returns the recipes in the plan that depend on this index, including the one in which @@ -199,6 +199,10 @@ class IndexingPlan { vector steps; /// The indexes to create as outputs. set targets; + /// The scope qualifying each index in the plan (for example, if an index + /// applies only to a particular sample, it will be scoped to the sample + /// name). + map> scopes; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -261,11 +265,13 @@ class IndexRegistry { /// by the generalization must be semantically identical to those of the generalizee void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee); - /// Indicate a serialized file that contains some identified index - void provide(const IndexName& identifier, const string& filename); + /// Indicate a serialized file that contains some identified index, + /// optionally with a scope that propagates to descendant files. + void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); - /// Indicate a list of serialized files that contains some identified index - void provide(const IndexName& identifier, const vector& filenames); + /// Indicate a list of serialized files that contains some identified index, + /// optionally with a scope that propagates to descendant files. + void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); /// Remove a provided index void reset(const IndexName& identifier); @@ -275,6 +281,7 @@ class IndexRegistry { bool available(const IndexName& identifier) const; /// Get the possible filename(s) associated with the given index with the given prefix. + /// TODO: Get this to account for sample-scoped indexes. 
vector get_possible_filenames(const IndexName& identifier) const; /// Get the filename(s) associated with the given index. Aborts if the diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index 31777f0d11..fc16be62fd 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -390,7 +390,7 @@ int main_autoindex(int argc, char** argv) { if (!registry.available(target)) { vector inferred_file_names = registry.get_possible_filenames(target); for (const string& filename : inferred_file_names) { - if (ifstream(filename).is_open()) { + if (file_exists(filename)) { logger.info() << "Guessing that " << filename << " is " << target << endl; registry.provide(target, filename); break; From 6df0ca341e5a2a43435fb05e4eeb92c8af8644b2 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 13:25:45 -0400 Subject: [PATCH 08/22] Implement scopes and fix storing new paths for aliases --- src/index_registry.cpp | 165 ++++++++++++++++++++---------- src/index_registry.hpp | 37 ++++++- src/subcommand/giraffe_main.cpp | 94 +++++++++-------- test/haplotype-sampling/README.md | 6 ++ 4 files changed, 202 insertions(+), 100 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 276433f310..8a476ede15 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -67,9 +67,9 @@ #include "algorithms/component.hpp" #include "algorithms/find_translation.hpp" -//#define debug_index_registry +#define debug_index_registry //#define debug_index_registry_setup -//#define debug_index_registry_recipes +#define debug_index_registry_recipes //#define debug_index_registry_path_state namespace std { @@ -4241,13 +4241,17 @@ IndexRegistry VGIndexes::get_vg_index_registry() { Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); - // Build GBWTGraph and save GBZ. + // Build GBZ. 
if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Building GBWTGraph." << endl; + info(context) << "Building GBZ." << endl; } gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); string output_name = plan->output_filepath(gbz_output); save_gbz(sampled_graph, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Saved GBZ to " << output_name << endl; + } output_names.push_back(output_name); return all_outputs; @@ -4584,6 +4588,15 @@ bool IndexingPlan::is_intermediate(const IndexName& identifier) const { return !targets.count(identifier); } +const vector IndexingPlan::get_scopes(const IndexName& identifier) const { + auto found = scopes.find(identifier); + if (found == scopes.end()) { + return {}; + } + // TODO: we can't really return an empty vector reference so we are stuck doing a copy here. + return found->second; +} + int64_t IndexingPlan::target_memory_usage() const { return IndexingParameters::max_memory_proportion * registry->get_target_memory_usage(); } @@ -4608,12 +4621,9 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, // we're not saving this file, make it temporary filepath << registry->get_work_dir() + "/" + sha1sum(identifier); } - auto found = scopes.find(identifier); - if (found != scopes.end()) { - for (auto& scope : found->second) { - // This index belongs to just a sample or something. - filepath << "." << scope; - } + for (auto& scope : get_scopes(identifier)) { + // This index belongs to just a sample or something. + filepath << "." << scope; } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes @@ -4621,6 +4631,7 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, filepath << "." << to_string(chunk); } filepath << "." 
<< registry->get_index(identifier)->get_suffix(); + std::cerr << "Considering output file path " << filepath.str() << std::endl; return filepath.str(); } @@ -4657,6 +4668,16 @@ set IndexingPlan::dependents(const IndexName& identifier) const { return dependent_steps; } +void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { + auto index_scopes = scopes[identifier]; + + // TODO: Change the data structure here if we anticipate an appreciable number of scopes. + auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); + if (found == index_scopes.end()) { + index_scopes.push_back(scope); + } +} + IndexRegistry::~IndexRegistry() { if (!work_dir.empty()) { // Clean up our work directory with its temporary indexes. @@ -4736,6 +4757,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { if (!index->was_provided_directly()) { // and assign the new (or first) ones index->assign_constructed(results); + for (auto& scope : plan.get_scopes(*it)) { + // and keep their scopes (in case we ever need them) + index->add_scope(scope); + } } ++it; } @@ -4792,7 +4817,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { auto f = find(aliasors.begin(), aliasors.end(), aliasee); bool can_move = f == aliasors.end() && !get_index(aliasee)->was_provided_directly(); if (!can_move) { - // just remove the "alias" so we don't need to deal with it + // We need to copy. + // Just remove the "alias" for the index itself so that from here + // on we only need to wory about things that *do* need to be + // created. 
std::swap(*f, aliasors.back()); aliasors.pop_back(); } @@ -4801,15 +4829,21 @@ void IndexRegistry::make_indexes(const vector& identifiers) { // copy aliases for any that we need to (start past index 0 if we can move it) for (size_t i = can_move; i < aliasors.size(); ++i) { + // We want the aliasor to point at the new paths + vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { auto copy_filename = plan.output_filepath(aliasors[i], j, aliasee_filenames.size()); copy_file(aliasee_filenames[j], copy_filename); + new_aliasor_files.push_back(copy_filename); } + // Point the aliasor index at the new paths + get_index(aliasors[i])->assign_constructed(new_aliasor_files); } // if we can move the aliasee (i.e. it is intermediate), then make - // one index by moving instead of copying + // one index (the 0th one) by moving instead of copying if (can_move) { + vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { auto move_filename = plan.output_filepath(aliasors[0], j, aliasee_filenames.size()); int code = rename(aliasee_filenames[j].c_str(), move_filename.c_str()); @@ -4817,7 +4851,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { // moving failed (maybe because the files on separate drives?) fall back on copying copy_file(aliasee_filenames[j], move_filename); } + new_aliasor_files.push_back(move_filename); } + // Point the aliasor index at the new paths + get_index(aliasors[0])->assign_constructed(new_aliasor_files); } } @@ -5714,6 +5751,14 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // Now fill in the plan struct that the recipes need to know how to run. IndexingPlan plan; + + // The plan has methods that need to interact with the registry, including + // some we need to use here. + // + // Some of them modify the registry. We're not going to use any of those, + // but we have to hand off a non-const pointer to ourselves so the plan can + // modify us later. 
+ plan.registry = const_cast(this); // Copy over the end products std::copy(end_products.begin(), end_products.end(), std::inserter(plan.targets, plan.targets.begin())); @@ -5744,54 +5789,48 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } #endif - // Figure out if any of the input files apply a sample scope to descendant files in the plan. - // This holds sets of later recipes and the scopes to give them when we get there. - // TODO: This is complicated and the queueing is not needed; we can just apply the scopes when we find the descendants. - std::vector, std::vector> scope_queue; + // Now remove the input data from the plan + plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { + return all_finished(recipe_choice.first); + }) - plan.steps.begin()); + + // Now we're allowed to use dependents() + + // Propagate scopes from input files for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) { // For each step in ther plan in order - for (auto& names_and_scopes : scope_queue) { - if (names_and_scopes.first.count(plan_it->first)) { - // We have scopes from a previous step to apply to the indexes - // generated by this step. - // - // TODO: We assume a single scope only ever comes from one - // input file, and we don't enforce a particular scope order - // other than rule order. - for (const IndexName& index_name : plan_it->first) { - // Append all the new scopes to any existing scopes on this index. 
- std::copy(names_and_scopes.second().begin(), names_and_scopes.second().end(), std::back_inserter(scopes[index_name])); - } - } - } - for (const IndexName& index_name : plan_it->first) { // For each index that step would generate const IndexFile* index = get_index(index_name); - if (!index->was_provided_directly()) { - // Only directly provided indexes introduce new scopes + if (!index->is_finished()) { + // Only finished indexes bring in new scopes continue; } - auto& provided_scopes = index->provided_scopes(); - if (!provided_scopes.empty()) { - // We need to attach these scopes to anything based on this index. - set dependent_recipes = dependents(index_name); + + // Get the scopes that came in with this input + auto& provided_scopes = index->get_scopes(); + + // We need to attach these scopes to anything based on this index. + // TODO: Should we just propagate scopes step by step as we scan the plan instead? + for (const RecipeName& dependent_recipe : plan.dependents(index_name)) { + // For each recipe transitively depending on this input + for (const IndexName& dependent_index_name : dependent_recipe.first) { + // For each index the recipe generates + + for (auto& scope : provided_scopes) { + // Make sure that index is scoped with all scopes on the input. + plan.add_scope(dependent_index_name, scope); + } + + // Note that all outputs will get their scopes from the + // inputs added in the same order (the order that the + // inputs appear in the plan). + } } } } - - // Now remove the input data from the plan - plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { - return all_finished(recipe_choice.first); - }) - plan.steps.begin()); - - // The plan has methods that can come back and modify the registry. - // We're not going to call any of them, but we have to hand off a non-const - // pointer to ourselves so the plan can modify us later. 
- plan.registry = const_cast(this); - return plan; } @@ -5932,10 +5971,6 @@ IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : identi // nothing more to do } -bool IndexFile::is_finished() const { - return !filenames.empty(); -} - const IndexName& IndexFile::get_identifier() const { return identifier; } @@ -5958,6 +5993,11 @@ void IndexFile::provide(const vector& filenames) { } void IndexFile::assign_constructed(const vector& filenames) { + std::cerr << "Set " << get_identifier() << " to be at:"; + for (auto& f : filenames) { + std::cerr << " " << f; + } + std::cerr << std::endl; this->filenames = filenames; provided_directly = false; } @@ -5966,6 +6006,27 @@ bool IndexFile::was_provided_directly() const { return provided_directly; } +void IndexFile::add_scope(const string& scope) { + if (!is_finished()) { + // Scopes for nonexistent files belong to the plan, not us. + throw std::logic_error("Cannot assign " + scope + " scope to unfinished " + get_identifier() + " index"); + } + + // TODO: Change the data structure here if we anticipate an appreciable number of scopes. + auto found = std::find(scopes.begin(), scopes.end(), scope); + if (found == scopes.end()) { + scopes.push_back(scope); + } +} + +const vector& IndexFile::get_scopes() const { + return scopes; +} + +bool IndexFile::is_finished() const { + return !filenames.empty(); +} + void IndexFile::reset() { filenames.clear(); provided_directly = false; diff --git a/src/index_registry.hpp b/src/index_registry.hpp index b1f4afa991..6726a419d0 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -181,6 +181,11 @@ class IndexingPlan { /// Returns true if the given index is to be intermediate under the given /// plan, and false if it is to be preserved. bool is_intermediate(const IndexName& identifier) const; + + /// Get the deduplicated scopes, in addition order, that the given index + /// will be qualified with when generated. 
Only works for indexes that will + /// be generated by the plan, not for inputs. + const vector get_scopes(const IndexName& identifier) const; // TODO: is this where this function wants to live? /// The memory limit, with a little slosh for prediction inaccuracy @@ -189,19 +194,26 @@ class IndexingPlan { int64_t literal_target_memory_usage() const; /// Returns the recipes in the plan that depend on this index, including the one in which - /// it was created (if any) + /// it was created (if any). + /// + /// Requires that all indexes in the plan be generated by a recipe; inputs + /// must not be in the plan as steps in order to call this. set dependents(const IndexName& identifier) const; protected: + /// Add a scope to the scopes the given index will be qualified with when + /// generated, if not present already. + void add_scope(const IndexName& identifier, const string& scope); + /// The steps to be invoked in the plan. May be empty before the plan is /// actually planned. vector steps; /// The indexes to create as outputs. set targets; - /// The scope qualifying each index in the plan (for example, if an index + /// The scopes qualifying each index in the plan (for example, if an index /// applies only to a particular sample, it will be scoped to the sample - /// name). + /// name). Stored deduplicated and in the order added. map> scopes; /// The registry that the plan is using. @@ -399,7 +411,19 @@ class IndexFile { /// Assign constructed filenames to this index void assign_constructed(const vector& filenames); - + + /// Append a new scope to qualify the files in the index, if the scope is + /// not already on the index. + /// + /// Should only be used on files that are actually filled in already; + /// scopes for files that don't exist yet are the responsibility of the + /// IndexingPlan. + void add_scope(const string& scope); + + /// Get all scopes qualifying the index (such as a sample name). + /// Scopes are deduplicated and produced in the order added. 
+ const vector& get_scopes() const; + /// Returns true if the index has already been built or provided bool is_finished() const; @@ -419,6 +443,11 @@ class IndexFile { // the filename(s) associated with the index vector filenames; + + // The dedplicated ordered scopes qualifying the index. + // TODO: Use an ordered set for good algorithmics for deduplicating scopes. + // Right now we have exactly one kind of scope so this doesn't matter. + vector scopes; // keep track of whether the index was provided directly bool provided_directly = false; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 079947185d..c83d4930be 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -60,6 +60,8 @@ static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu } #endif +#define debug + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -768,7 +770,7 @@ int main_giraffe(int argc, char** argv) { // indexed for lookup. We only support one file per index (FASTQs get // handled separately). std::unordered_map provided_indexes; - string index_basename, index_basename_override; + string index_basename_guess, index_basename; // For haplotype sampling. string haplotype_name, kff_name; @@ -1139,8 +1141,7 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("GBZ", optarg); // If we have a GBZ we probably want to use its name as the base name. - // But see -g. - index_basename = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); + index_basename_guess = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); break; @@ -1148,17 +1149,15 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("XG", optarg); // If we have an XG we probably want to use its name as the base name. - // But see -g. 
- index_basename = split_ext(optarg).first; + index_basename_guess = split_ext(optarg).first; break; case 'g': provided_indexes.emplace("GBWTGraph", optarg); - // But if we have a GBWTGraph we probably want to use *its* name as the base name. - // Whichever is specified last will win, unless we also have a FASTA input name. - index_basename = split_ext(optarg).first; + // If we have a GBWTGraph we probably want to use its name as the base name. + index_basename_guess = split_ext(optarg).first; break; @@ -1195,7 +1194,7 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("KFF Kmer Counts", optarg); break; case OPT_INDEX_BASENAME: - index_basename_override = optarg; + index_basename = optarg; break; case 'G': @@ -1369,8 +1368,9 @@ int main_giraffe(int argc, char** argv) { } provided_indexes.emplace("Reference FASTA", fasta_filename); - // Everything else should be named like the FASTA by default - index_basename = fasta_parts.first; + // Everything else should be named like the FASTA by default. This is a + // better guess than the other siurces. + index_basename_guess = fasta_parts.first; if (have_input_file(optind, argc, argv)) { // Next one must be VCF, but check. @@ -1395,6 +1395,11 @@ int main_giraffe(int argc, char** argv) { } } + if (index_basename.empty()) { + // Try to pick an index basename now + index_basename = index_basename_guess; + } + // If we don't want rescue, let the user see we don't try it. if (parser->get_option_value("rescue-attempts") == 0 || rescue_algorithm == MinimizerMapper::rescue_none) { // Replace any parsed values @@ -1470,12 +1475,7 @@ int main_giraffe(int argc, char** argv) { // Haplotype sampling is active when explicitly requested (--haplotype-sampling) // or when either --haplotype-name or --kff-name is given. 
bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || provided_indexes.count("KFF Kmer Counts"); - // Save the GBZ-derived basename before any override, so we can find - // dist and other indexes co-located with the source GBZ during sampling. - string original_basename = index_basename; - if (!index_basename_override.empty()) { - index_basename = index_basename_override; - } + if (haplotype_sampling) { // Determine sample name early so we can incorporate it into the @@ -1504,7 +1504,6 @@ int main_giraffe(int argc, char** argv) { if (sample == "giraffe") { logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } - index_basename = index_basename + "." + sample; // Pass parameters to the recipes through IndexingParameters. IndexingParameters::haplotype_reference_samples = reference_samples; @@ -1515,45 +1514,52 @@ int main_giraffe(int argc, char** argv) { if (!fastq_filename_2.empty()) { fastqs.push_back(fastq_filename_2); } - registry.provide("FASTQ", fastqs); + // Provide the FASTQs to the index registry, but mark everything + // derived from them as being scoped to the particular sample. 
+ registry.provide("FASTQ", fastqs, sample); } else if (!provided_indexes.count("KFF Kmer Counts")) { // TODO: Eventually learn to kmer count from GAM logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; } // Otherwise, we know we're sending kmer counts and will still do haplotype sampling - if (!registry.available("Top Level Chain Distance Index")) { - // Look for base GBZ distance index - string tcdist_candidate = original_basename + ".tcdist"; - string dist_candidate = original_basename + ".dist"; - if (file_exists(tcdist_candidate)) { - registry.provide("Top Level Chain Distance Index", tcdist_candidate); - } else if (file_exists(dist_candidate)) { - // Provide it as a top level chain index even though it has more stuff - registry.provide("Top Level Chain Distance Index", dist_candidate); + if (!index_basename.empty()) { + // We have some idea where indexes might be, so guess them. + if (!registry.available("Top Level Chain Distance Index")) { + // Look for base GBZ distance index + string tcdist_candidate = index_basename + ".tcdist"; + string dist_candidate = index_basename + ".dist"; + if (file_exists(tcdist_candidate)) { + registry.provide("Top Level Chain Distance Index", tcdist_candidate); + } else if (file_exists(dist_candidate)) { + // Provide it as a top level chain index even though it has more stuff + registry.provide("Top Level Chain Distance Index", dist_candidate); + } } - } - - if (!registry.available("Haplotype Index")) { - // Look for hapl index - string hapl_candidate = original_basename + ".hapl"; - if (file_exists(hapl_candidate)) { - registry.provide("Haplotype Index", hapl_candidate); + + if (!registry.available("Haplotype Index")) { + // Look for hapl index + string hapl_candidate = index_basename + ".hapl"; + if (file_exists(hapl_candidate)) { + registry.provide("Haplotype Index", hapl_candidate); + } } - } - if (!registry.available("r Index")) { - // Look for r 
index - string r_candidate = original_basename + ".ri"; - if (file_exists(r_candidate)) { - registry.provide("r Index", r_candidate); + if (!registry.available("r Index")) { + // Look for r index + string r_candidate = index_basename + ".ri"; + if (file_exists(r_candidate)) { + registry.provide("r Index", r_candidate); + } } } } - - // Use the basename we've determined for generated indexes - registry.set_prefix(index_basename); + + if (!index_basename.empty()) { + // Use the basename we've determined for generated indexes + registry.set_prefix(index_basename); + } // The IndexRegistry doesn't try to infer index files based on the // basename, so do that here. We can have multiple extension options that diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index 406e9e9b82..e188cf8a9c 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -147,3 +147,9 @@ The resulting fasta file can be used as the reference with any linear aligner. Documentation for the `vg haplotypes` subcommand can be found in the [vg wiki](https://github.com/vgteam/vg/wiki/Haplotype-Sampling). You may also want to read about the [best practices](https://github.com/vgteam/vg/wiki/Giraffe-best-practices) for mapping reads with Giraffe. 
+ +Remember to clean up after trying all the examples: + +``` +rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam +``` From 79a5672f93e925e427787cc859c2fe6ac4819763 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 13:33:48 -0400 Subject: [PATCH 09/22] Simplify alias fulfillment --- src/index_registry.cpp | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 8a476ede15..c574640f4c 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4826,35 +4826,27 @@ void IndexRegistry::make_indexes(const vector& identifiers) { } const auto& aliasee_filenames = get_index(aliasee)->get_filenames(); - - // copy aliases for any that we need to (start past index 0 if we can move it) - for (size_t i = can_move; i < aliasors.size(); ++i) { - // We want the aliasor to point at the new paths - vector new_aliasor_files; - for (size_t j = 0; j < aliasee_filenames.size(); ++j) { - - auto copy_filename = plan.output_filepath(aliasors[i], j, aliasee_filenames.size()); - copy_file(aliasee_filenames[j], copy_filename); - new_aliasor_files.push_back(copy_filename); - } - // Point the aliasor index at the new paths - get_index(aliasors[i])->assign_constructed(new_aliasor_files); - } - // if we can move the aliasee (i.e. it is intermediate), then make - // one index (the 0th one) by moving instead of copying - if (can_move) { + + for (auto it = aliasors.rbegin(); it != aliasors.rend(); ++it) { + // Fulfill the aliasor from the aliasee (in reverse order, so the + // 0th aliasor gets to have the moved files). 
vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { - auto move_filename = plan.output_filepath(aliasors[0], j, aliasee_filenames.size()); - int code = rename(aliasee_filenames[j].c_str(), move_filename.c_str()); - if (code) { - // moving failed (maybe because the files on separate drives?) fall back on copying - copy_file(aliasee_filenames[j], move_filename); + auto dest_filename = plan.output_filepath(*it, j, aliasee_filenames.size()); + bool moved = false; + if (can_move && (it + 1 == aliasors.rend())) { + // We can move and this is the last aliasor we will do, so try moving + moved = rename(aliasee_filenames[j].c_str(), dest_filename.c_str()) == 0; } - new_aliasor_files.push_back(move_filename); + if (!moved) { + // moving failed or is not allowed, fall back on copying + copy_file(aliasee_filenames[j], dest_filename); + } + + new_aliasor_files.push_back(dest_filename); } // Point the aliasor index at the new paths - get_index(aliasors[0])->assign_constructed(new_aliasor_files); + get_index(*it)->assign_constructed(new_aliasor_files); } } From ebd38aeacdb52dbd91c4eb2e11c0f401ba72c4fc Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 14:31:31 -0400 Subject: [PATCH 10/22] Make plan really understand provided indexes and actually attach scopes to inputs --- src/index_registry.cpp | 63 +++++++++++++++++++++++++-------- src/index_registry.hpp | 18 ++++++++-- src/subcommand/giraffe_main.cpp | 60 +++++++++++++++++++------------ 3 files changed, 102 insertions(+), 39 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index c574640f4c..b34c093a6a 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4647,6 +4647,10 @@ set IndexingPlan::dependents(const IndexName& identifier) const { IndexGroup successor_indexes{identifier}; for (const auto& step : steps) { + if (!registry->has_recipe(step)) { + // This step is just a provided input and not a real recipe step. 
+ continue; + } // TODO: should this behavior change if some of the inputs were provided directly? @@ -4669,12 +4673,14 @@ set IndexingPlan::dependents(const IndexName& identifier) const { } void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { - auto index_scopes = scopes[identifier]; + // Get a mutable reference to the scopes we need to add to. + auto& index_scopes = scopes[identifier]; // TODO: Change the data structure here if we anticipate an appreciable number of scopes. auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); if (found == index_scopes.end()) { index_scopes.push_back(scope); + std::cerr << "Plan scope " << scope << " for " << identifier << std::endl; } } @@ -4894,7 +4900,9 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f require_exists(context, filename); } } - get_index(identifier)->provide(filenames); + auto index = get_index(identifier); + index->provide(filenames); + index->add_scope(scope); } void IndexRegistry::reset(const IndexName& identifier) { @@ -5503,6 +5511,8 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { IndexGroup product_group{product}; // records of (identifier, requesters, ordinal index of recipe selected) + // A max-value sentinel index means no recipe is used and the index is + // provided. 
vector, size_t>> plan_path; // map dependency priority to requesters @@ -5643,6 +5653,7 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // get the latest file in the dependency order that we have left to build auto it = queue.begin(); + // Imagine starting with the first recipe for it plan_path.emplace_back(it->first, it->second, 0); #ifdef debug_index_registry @@ -5666,6 +5677,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "index has been provided as input" << endl; #endif + + // Don't point to a recipe for it that might exist; point to a + // nonexistent sentinel recipe. + get<2>(plan_path.back()) = std::numeric_limits::max(); continue; } else if (recipe_registry.count(index_group)) { @@ -5728,7 +5743,13 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "final plan path for index " << product << ":" << endl; for (auto path_elem : plan_path) { - cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]) << ", recipe " << get<2>(path_elem) << ", from:" << endl; + cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]); + if (get<2>(path_elem) == std::numeric_limits::max()) { + cerr << ", provided"; + } else { + cerr << ", recipe " << get<2>(path_elem); + } + cerr << ", from:" << endl; for (auto d : get<1>(path_elem)) { cerr << "\t\t" << to_string(identifier_order[d]) << endl; } @@ -5764,7 +5785,7 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "plan before applying generalizations:" << endl; for (auto plan_elem : plan.steps) { - cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + cerr << "\t" << to_string(plan_elem.first) << " " << (plan_elem.second == std::numeric_limits::max() ? 
"Provided" : std::to_string(plan_elem.second)) << endl; } #endif @@ -5777,17 +5798,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "full plan including provided files:" << endl; for (auto plan_elem : plan.steps) { - cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + cerr << "\t" << to_string(plan_elem.first) << " " << (plan_elem.second == std::numeric_limits::max() ? "Provided" : std::to_string(plan_elem.second)) << endl; } #endif - // Now remove the input data from the plan - plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { - return all_finished(recipe_choice.first); - }) - plan.steps.begin()); - - // Now we're allowed to use dependents() - // Propagate scopes from input files for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) { // For each step in ther plan in order @@ -5797,15 +5811,18 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { const IndexFile* index = get_index(index_name); if (!index->is_finished()) { // Only finished indexes bring in new scopes + std::cerr << "Index " << index_name << " isn't finished and can't bring in scopes" << std::endl; continue; } // Get the scopes that came in with this input auto& provided_scopes = index->get_scopes(); - + // We need to attach these scopes to anything based on this index. // TODO: Should we just propagate scopes step by step as we scan the plan instead? 
-            for (const RecipeName& dependent_recipe : plan.dependents(index_name)) { +            auto dependents = plan.dependents(index_name); +            std::cerr << "Index " << index_name << " adds " << provided_scopes.size() << " scopes to " << dependents.size() << " dependents" << std::endl; +            for (const RecipeName& dependent_recipe : dependents) { // For each recipe transitively depending on this input for (const IndexName& dependent_index_name : dependent_recipe.first) { // For each index the recipe generates @@ -5823,9 +5840,25 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } } + // Now remove the input data from the plan + plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { + return all_finished(recipe_choice.first); + }) - plan.steps.begin()); + return plan; } +bool IndexRegistry::has_recipe(const RecipeName& recipe_name) const { + auto found = recipe_registry.find(recipe_name.first); + if (found == recipe_registry.end()) { + // No recipes at all for this + return false; + } + // Determine if this recipe index is in range. + // Sentinel max-value names are always out of range. 
+ return recipe_name.second < found->second.size(); +} + const IndexRecipe& IndexRegistry::get_recipe(const RecipeName& recipe_name) const { const auto& recipes = recipe_registry.at(recipe_name.first); assert(recipe_name.second < recipes.size()); @@ -6008,6 +6041,7 @@ void IndexFile::add_scope(const string& scope) { auto found = std::find(scopes.begin(), scopes.end(), scope); if (found == scopes.end()) { scopes.push_back(scope); + std::cerr << "Added scope " << scope << " to " << get_identifier() << std::endl; } } @@ -6022,6 +6056,7 @@ bool IndexFile::is_finished() const { void IndexFile::reset() { filenames.clear(); provided_directly = false; + scopes.clear(); } IndexRecipe::IndexRecipe(const vector& inputs, diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 6726a419d0..29bdafd481 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -36,6 +36,9 @@ using IndexGroup = set; /** * Names a recipe in the collection of registered recipes. + * + * A max-value number means the indexes are provided and not generated by any + * recipe. */ using RecipeName = pair; @@ -195,9 +198,6 @@ class IndexingPlan { /// Returns the recipes in the plan that depend on this index, including the one in which /// it was created (if any). - /// - /// Requires that all indexes in the plan be generated by a recipe; inputs - /// must not be in the plan as steps in order to call this. set dependents(const IndexName& identifier) const; protected: @@ -279,10 +279,14 @@ class IndexRegistry { /// Indicate a serialized file that contains some identified index, /// optionally with a scope that propagates to descendant files. + /// + /// The empty string represents no scope. void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); /// Indicate a list of serialized files that contains some identified index, /// optionally with a scope that propagates to descendant files. + /// + /// The empty string represents no scope. 
void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); /// Remove a provided index @@ -342,6 +346,14 @@ class IndexRegistry { /// generate a plan to create the indexes IndexingPlan make_plan(const IndexGroup& end_products) const; + /// Check if a recipe identifier corresponds to a recipe. + /// + /// Recipe identifiers not corresponding to actual recipes are used during + /// planning to represent provided inputs. + /// + /// TODO: Refactor that with some kind of tagged union or optional. + bool has_recipe(const RecipeName& recipe_name) const; + /// use a recipe identifier to get the recipe const IndexRecipe& get_recipe(const RecipeName& recipe_name) const; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c83d4930be..e16efe0c76 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -773,7 +773,7 @@ int main_giraffe(int argc, char** argv) { string index_basename_guess, index_basename; // For haplotype sampling. - string haplotype_name, kff_name; + string kff_filename; bool haplotype_sampling_flag = false; std::unordered_set reference_samples; @@ -1191,7 +1191,9 @@ int main_giraffe(int argc, char** argv) { provided_indexes.emplace("Haplotype Index", optarg); break; case OPT_KFF_NAME: - provided_indexes.emplace("KFF Kmer Counts", optarg); + // We need to provide the KFF with sample scope if we provide + // it, but we're not sure of the sample name yet. + kff_filename = optarg; break; case OPT_INDEX_BASENAME: index_basename = optarg; break; @@ -1474,40 +1476,50 @@ int main_giraffe(int argc, char** argv) { // Haplotype sampling is active when explicitly requested (--haplotype-sampling) // or when either --haplotype-name or --kff-name is given. - bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || provided_indexes.count("KFF Kmer Counts"); + // Otherwise we'd always sample when we had reads available. 
+ bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); + string sample_scope; if (haplotype_sampling) { - // Determine sample name early so we can incorporate it into the + // Determine sample scope so we can incorporate it into the // index basename; downstream indexes (dist, min) will then be - // co-located with the sampled GBZ. - string sample = sample_name; - if (sample.empty()) { - if (!kff_name.empty()) { - sample = file_base_name(kff_name); + // scoped to the sample. + sample_scope = sample_name; + if (sample_scope.empty()) { + if (!kff_filename.empty()) { + sample_scope = file_base_name(kff_filename); if (show_progress) { - logger.info() << "Guessing from " << kff_name - << " that sample name is " << sample << std::endl; + logger.info() << "Guessing from " << kff_filename + << " that sample name is " << sample_scope << std::endl; } } else if (!fastq_filename_1.empty()) { // Strip .gz then use file_base_name to strip .fq/.fastq, since // strip_suffixes stops at the first one that's not there. - sample = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); + sample_scope = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); if (show_progress) { logger.info() << "Guessing from " << fastq_filename_1 - << " that sample name is " << sample << std::endl; + << " that sample name is " << sample_scope << std::endl; } } else { - sample = "sample"; + sample_scope = "sample"; } } - if (sample == "giraffe") { + if (sample_scope == "giraffe") { logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } // Pass parameters to the recipes through IndexingParameters. 
IndexingParameters::haplotype_reference_samples = reference_samples; + + if (kff_filename.empty() && fastq_filename_1.empty()) { + // TODO: Eventually learn to kmer count from GAM + logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; + } + if (!kff_filename.empty()) { + registry.provide("KFF Kmer Counts", kff_filename, sample_scope); + } if (!fastq_filename_1.empty()) { // If FASTQs are available, send those and we will do haplotype sampling. vector fastqs = {fastq_filename_1}; @@ -1516,12 +1528,9 @@ int main_giraffe(int argc, char** argv) { } // Provide the FASTQs to the index registry, but mark everything // derived from them as being scoped to the particular sample. - registry.provide("FASTQ", fastqs, sample); - } else if (!provided_indexes.count("KFF Kmer Counts")) { - // TODO: Eventually learn to kmer count from GAM - logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; + registry.provide("FASTQ", fastqs, sample_scope); } - // Otherwise, we know we're sending kmer counts and will still do haplotype sampling + if (!index_basename.empty()) { // We have some idea where indexes might be, so guess them. @@ -1594,13 +1603,20 @@ int main_giraffe(int argc, char** argv) { bool found = false; for (auto& extension : index_and_extensions->second) { // For each extension in priority order - string inferred_filename = registry.get_prefix() + "." + extension; + string inferred_filename = registry.get_prefix(); + if (haplotype_sampling) { + // All these are sample-scoped if we're haplotype sampling. + // TODO: Let index registry generate the filenames, which it knows how to do. + inferred_filename += "." + sample_scope; + } + + inferred_filename += "." + extension; if (file_exists(inferred_filename)) { // A file with the appropriate name exists and we can read it. // Report it because this may not be desired behavior. 
logger.info() << "Guessing that " << inferred_filename << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename); + registry.provide(index_and_extensions->first, inferred_filename, haplotype_sampling ? sample_scope : ""); found = true; // Skip other extension options for the index break; From 955030861484f872c271394312199fcecf644844 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 14:37:18 -0400 Subject: [PATCH 11/22] Don't add empty scopes and quiet debugging --- src/index_registry.cpp | 17 +++++++++++------ src/subcommand/giraffe_main.cpp | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index b34c093a6a..5b21a8d6ff 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -67,9 +67,9 @@ #include "algorithms/component.hpp" #include "algorithms/find_translation.hpp" -#define debug_index_registry +//#define debug_index_registry //#define debug_index_registry_setup -#define debug_index_registry_recipes +//#define debug_index_registry_recipes //#define debug_index_registry_path_state namespace std { @@ -4631,7 +4631,6 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, filepath << "." << to_string(chunk); } filepath << "." 
<< registry->get_index(identifier)->get_suffix(); - std::cerr << "Considering output file path " << filepath.str() << std::endl; return filepath.str(); } @@ -4680,7 +4679,6 @@ void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); if (found == index_scopes.end()) { index_scopes.push_back(scope); - std::cerr << "Plan scope " << scope << " for " << identifier << std::endl; } } @@ -4902,7 +4900,9 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f } auto index = get_index(identifier); index->provide(filenames); - index->add_scope(scope); + if (!scope.empty()) { + index->add_scope(scope); + } } void IndexRegistry::reset(const IndexName& identifier) { @@ -5811,7 +5811,9 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { const IndexFile* index = get_index(index_name); if (!index->is_finished()) { // Only finished indexes bring in new scopes +#ifdef debug_index_registry std::cerr << "Index " << index_name << " isn't finished and can't bring in scopes" << std::endl; +#endif continue; } @@ -5821,7 +5823,9 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // We need to attach these scopes to anything based on this index. // TODO: Should we just propagate scopes step by step as we scan the plan instead? 
auto dependents = plan.dependents(index_name); +#ifdef debug_index_registry std::cerr << "Index " << index_name << " adds " << provided_scopes.size() << " scopes to " << dependents.size() << " dependents" << std::endl; +#endif for (const RecipeName& dependent_recipe : dependents) { // For each recipe transitively depending on this input for (const IndexName& dependent_index_name : dependent_recipe.first) { @@ -6018,7 +6022,6 @@ void IndexFile::provide(const vector& filenames) { } void IndexFile::assign_constructed(const vector& filenames) { - std::cerr << "Set " << get_identifier() << " to be at:"; for (auto& f : filenames) { std::cerr << " " << f; } @@ -6041,7 +6044,9 @@ void IndexFile::add_scope(const string& scope) { auto found = std::find(scopes.begin(), scopes.end(), scope); if (found == scopes.end()) { scopes.push_back(scope); +#ifdef debug_index_registry std::cerr << "Added scope " << scope << " to " << get_identifier() << std::endl; +#endif } } diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index e16efe0c76..8846c28530 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -60,7 +60,7 @@ static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu } #endif -#define debug +//#define debug using namespace std; using namespace vg; From 5ae95d5e0569310dc7274b50f5b3f077e7072fe1 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 15:59:33 -0400 Subject: [PATCH 12/22] Add a terrible multi-extension system that needs hacky workarounds and give kmc a useful file limit --- src/index_registry.cpp | 116 ++++++++++++++++++++++++------ src/index_registry.hpp | 27 +++++-- src/subcommand/autoindex_main.cpp | 9 +++ 3 files changed, 126 insertions(+), 26 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 5b21a8d6ff..a2feedb04e 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -18,6 +18,7 @@ #include #include #include +#include 
#include #include #include @@ -625,7 +626,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); - registry.register_index("Giraffe GBZ", "giraffe.gbz"); + registry.register_index("Giraffe GBZ", std::vector{"gbz", "giraffe.gbz"}); registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); registry.register_index("Top Level Chain Distance Index", "tcdist"); registry.register_index("r Index", "ri"); @@ -4426,6 +4427,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; mkdir(kmc_tmp_dir.c_str(), 0700); + assert(std::filesystem::exists(kmc_tmp_dir)); // Run kmc: count k-mers in KFF format with canonical k-mers. // Use fork()+execvp() instead of system() to avoid shell quoting issues. @@ -4452,6 +4454,16 @@ IndexRegistry VGIndexes::get_vg_index_registry() { close(devnull); } } + + // Allow enough open files for kmc to work; see + // + // and + // TODO: Probably this isn't allowed after fork but we do it anyway. + struct rlimit new_limit; + new_limit.rlim_cur = 2048; + new_limit.rlim_max = RLIM_INFINITY; + setrlimit(RLIMIT_NOFILE, &new_limit); + // execvp promises never to modify its arguments but casn't say // that in the C++ type system because it causes problems in the C // type system. See https://stackoverflow.com/a/19505361 @@ -4630,7 +4642,9 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, // that start with digits) filepath << "." << to_string(chunk); } - filepath << "." << registry->get_index(identifier)->get_suffix(); + // Use the suffix we've assigned, which is the first one we have that's + // unique within the scopes. + filepath << "." 
<< assigned_suffix.at(identifier); return filepath.str(); } @@ -4859,26 +4873,41 @@ void IndexRegistry::make_indexes(const vector& identifiers) { } void IndexRegistry::register_index(const IndexName& identifier, const string& suffix) { + register_index(identifier, vector{suffix}); +} + +void IndexRegistry::register_index(const IndexName& identifier, const vector& suffixes) { // Add this index to the registry if (identifier.empty()) { error(context) << "indexes must have a non-empty identifier" << endl; } - if (suffix.empty()) { - error(context) << "indexes must have a non-empty suffix" << endl; - } - if (isdigit(suffix.front())) { - // this ensures that we can add numbers to the suffix to create a unique suffix - // for chunked workflows - error(context) << "suffixes cannot start with a digit" << endl; - } if (index_registry.count(identifier)) { error(context) << "index registry contains a duplicated identifier: " << identifier << endl; } - if (registered_suffixes.count(suffix)) { - error(context) << "index registry contains a duplicated suffix: " << suffix << endl; + bool has_unique_suffix = false; + for (auto suffix : suffixes) { + if (suffix.empty()) { + error(context) << "indexes must have a non-empty suffix" << endl; + } + if (isdigit(suffix.front())) { + // this ensures that we can add numbers to the suffix to create a unique suffix + // for chunked workflows + error(context) << "suffixes cannot start with a digit" << endl; + } + if (!registered_suffixes.count(suffix)) { + has_unique_suffix = true; + } + } + if (!has_unique_suffix) { + // We need each index to have at least one new suffix not yet used or + // we might not be able to find a place to put it when making the + // indexes in the order registered. 
+        error(context) << "index registry contains other indexes for all suffixes for " << identifier << endl; + } + index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffixes)); + for (auto suffix : suffixes) { + registered_suffixes.insert(suffix); } - index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffix)); - registered_suffixes.insert(suffix); } @@ -4934,7 +4963,12 @@ vector IndexRegistry::get_possible_filenames(const IndexName& identifier error(context) << "cannot require unregistered index: " << identifier << endl; } const IndexFile* index = get_index(identifier); - return {get_prefix() + "." + index->get_suffix()}; + vector filenames; + for (auto& suffix : index->get_suffixes()) { + // Try all the possible suffixes + filenames.push_back(get_prefix() + "." + suffix); + } + return filenames; } vector IndexRegistry::require(const IndexName& identifier) const { @@ -5844,6 +5878,44 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { } } + // Assign each index a unique scoped suffix, so we can use ..gbz + // for the Giraffe GBZ and .gbz for the plain one. + unordered_set used_scoped_suffixes; + for (auto& step : plan.steps) { + for (const IndexName& index_name : step.first) { + auto index = get_index(index_name); + // If scopes were provided, this is an input, and just use those + vector scopes = index->get_scopes(); + if (scopes.empty()) { + // Otherwise see if we have planned scopes + scopes = plan.get_scopes(index_name); + } + + // Build the part that is any scopes (either provided or planned) + std::stringstream suffix_base; + for (auto& scope : scopes) { + suffix_base << scope << "."; + } + + for (auto& suffix : index->get_suffixes()) { + // Consider this suffix option on top of the scopes + string candidate_suffix = suffix_base.str() + suffix; + + auto found = used_scoped_suffixes.find(candidate_suffix); + if (found == used_scoped_suffixes.end()) { + // This is a fresh one. Use it. 
+ used_scoped_suffixes.emplace_hint(found, candidate_suffix); + // We only remember the suffix itself; we reconstruct the + // actual name later, with chunk info inserted. + // TODO: Instead of building an essentially fake string key + // should we do something else? + plan.assigned_suffix[index_name] = suffix; + break; + } + } + } + } + // Now remove the input data from the plan plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { return all_finished(recipe_choice.first); @@ -5996,7 +6068,11 @@ string IndexRegistry::to_dot(const vector& targets) const { return strm.str(); } -IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : identifier(identifier), suffix(suffix) { +IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : IndexFile(identifier, vector{suffix}) { + // Nothing to do! +} + +IndexFile::IndexFile(const IndexName& identifier, const vector& suffixes) : identifier(identifier), suffixes(suffixes) { // nothing more to do } @@ -6004,8 +6080,8 @@ const IndexName& IndexFile::get_identifier() const { return identifier; } -const string& IndexFile::get_suffix() const { - return suffix; +const vector& IndexFile::get_suffixes() const { + return suffixes; } const vector& IndexFile::get_filenames() const { @@ -6022,10 +6098,6 @@ void IndexFile::provide(const vector& filenames) { } void IndexFile::assign_constructed(const vector& filenames) { - for (auto& f : filenames) { - std::cerr << " " << f; - } - std::cerr << std::endl; this->filenames = filenames; provided_directly = false; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 29bdafd481..bdc08c8b1f 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -215,6 +215,9 @@ class IndexingPlan { /// applies only to a particular sample, it will be scoped to the sample /// name). Stored deduplicated and in the order added. 
map> scopes; + /// The unique (when combined with scopes) suffix for each index in the + /// plan. + map assigned_suffix; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -266,6 +269,11 @@ class IndexRegistry { /// Register an index containing the given identifier void register_index(const IndexName& identifier, const string& suffix); + + /// Register an index containing the given identifier, with multiple possible suffixes. + /// The first suffix not already occupied will be used (allowing scopes to + /// disambiguate between unscoped and scoped indexes with the same suffix). + void register_index(const IndexName& identifier, const vector& suffixes); /// Register a recipe to produce an index using other indexes /// or input files. Recipes registered earlier will have higher priority. @@ -281,12 +289,20 @@ class IndexRegistry { /// optionally with a scope that propagates to descendant files. /// /// The empty string represents no scope. + /// + /// TODO: If scopes contain ".", we can run into problems with combinations + /// of different scopes producing the same final string. Right now we only + /// use one kind of scope, which avoids this. void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); /// Indicate a list of serialized files that contains some identified index, /// optionally with a scope that propagates to descendant files. /// /// The empty string represents no scope. + /// + /// TODO: If scopes contain ".", we can run into problems with combinations + /// of different scopes producing the same final string. Right now we only + /// use one kind of scope, which avoids this. 
void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); /// Remove a provided index @@ -408,12 +424,15 @@ class IndexFile { /// Create a new IndexFile with a unique identifier IndexFile(const IndexName& identifier, const string& suffix); + + /// Create a new IndexFile with a unique identifier, which may use one of several suffixes. + IndexFile(const IndexName& identifier, const vector& suffixes); /// Get the globally unique identifier for this index const IndexName& get_identifier() const; - /// Returns the suffix to be used for this index - const string& get_suffix() const; + /// Returns the suffixes that can be used for this index + const vector& get_suffixes() const; /// Get the filename(s) that contain this index const vector& get_filenames() const; @@ -450,8 +469,8 @@ class IndexFile { // the global identifier for the IndexName identifier; - // the suffix it adds to output files - const string suffix; + // the suffixes it can add to output files + const vector suffixes; // the filename(s) associated with the index vector filenames; diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index fc16be62fd..5179424407 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -390,6 +390,15 @@ int main_autoindex(int argc, char** argv) { if (!registry.available(target)) { vector inferred_file_names = registry.get_possible_filenames(target); for (const string& filename : inferred_file_names) { + if (target == "Giraffe GBZ" && !ends_with(filename, ".giraffe.gbz")) { + // TODO: Giraffe GBZ indexes can be saved as ..gbz or .giraffe.gbz. + // But we can't pick up ..gbz automatically without possibly mistaking a plain GBZ for a Giraffe one. + // And we don't handle haplotype sampling in autoindex yet anyway. + // So only find Giraffe GBZs. + // TODO: Allow the suffixes to have Snakemake-style + // wildcards that populate scopes on the files. 
+ continue; + } if (file_exists(filename)) { logger.info() << "Guessing that " << filename << " is " << target << endl; registry.provide(target, filename); From 888d65c4a814d1b7ac36b11d5168cd4b71f84f6f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 16:36:40 -0400 Subject: [PATCH 13/22] Add missing dependency --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 2a1514a81e..1fbc47e12e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -y install \ samtools curl unzip redland-utils librdf-dev cmake pkg-config wget gtk-doc-tools \ raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev \ libcairo2-dev libpixman-1-dev libffi-dev libcairo-dev libprotobuf-dev libboost-all-dev \ - tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev + tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev kmc ###DEPS_END### FROM packages AS build From a4869042eb053686adb00254a5a1856beec5dce4 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 17:34:33 -0400 Subject: [PATCH 14/22] Hook up CLI for non-diploid sampling and controlling haplotype count --- src/index_registry.cpp | 27 +++++++----- src/index_registry.hpp | 12 ++++-- src/subcommand/giraffe_main.cpp | 69 ++++++++++++++++++------------- test/haplotype-sampling/README.md | 2 +- test/t/54_vg_haplotypes.t | 18 +++++++- 5 files changed, 83 insertions(+), 45 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index a2feedb04e..fdf79cbb38 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -130,9 +130,11 @@ int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAUL double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = 
IndexingParameters::Basic; -std::unordered_set IndexingParameters::haplotype_reference_samples = {}; -int IndexingParameters::haplotype_minimizer_k = 29; -int IndexingParameters::haplotype_minimizer_w = 11; +std::unordered_set IndexingParameters::haplotype_sampling_reference_samples = {}; +size_t IndexingParameters::haplotype_sampling_num_haplotypes = Recombinator::NUM_CANDIDATES; +bool IndexingParameters::haplotype_sampling_diploid = true; +int IndexingParameters::haplotype_sampling_minimizer_k = 29; +int IndexingParameters::haplotype_sampling_minimizer_w = 11; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -4215,14 +4217,14 @@ IndexRegistry VGIndexes::get_vg_index_registry() { load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); // Override reference samples if requested. - if (!IndexingParameters::haplotype_reference_samples.empty()) { + if (!IndexingParameters::haplotype_sampling_reference_samples.empty()) { if (IndexingParameters::verbosity != IndexingParameters::None) { info(context) << "Updating reference samples." << endl; } - size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_reference_samples); - if (present != IndexingParameters::haplotype_reference_samples.size()) { + size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_sampling_reference_samples); + if (present != IndexingParameters::haplotype_sampling_reference_samples.size()) { warn(context) << "Only " << present << " out of " - << IndexingParameters::haplotype_reference_samples.size() + << IndexingParameters::haplotype_sampling_reference_samples.size() << " reference samples are present." << endl; } } @@ -4239,7 +4241,10 @@ IndexRegistry VGIndexes::get_vg_index_registry() { ? 
Haplotypes::verbosity_basic : Haplotypes::verbosity_silent); Recombinator recombinator(gbz, haplotypes, hap_verbosity); - Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); + Recombinator::Parameters parameters(Recombinator::Parameters::preset_default); + parameters.num_haplotypes = IndexingParameters::haplotype_sampling_num_haplotypes; + parameters.diploid_sampling = IndexingParameters::haplotype_sampling_diploid; + parameters.include_reference = true; gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); // Build GBZ. @@ -4356,8 +4361,8 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Build minimizer index without payload. MinimizerIndexParameters minimizer_params; - minimizer_params.minimizers(IndexingParameters::haplotype_minimizer_k, - IndexingParameters::haplotype_minimizer_w) + minimizer_params.minimizers(IndexingParameters::haplotype_sampling_minimizer_k, + IndexingParameters::haplotype_sampling_minimizer_w) .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); HaplotypePartitioner::minimizer_index_type minimizer_index = build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); @@ -4432,7 +4437,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { // Run kmc: count k-mers in KFF format with canonical k-mers. // Use fork()+execvp() instead of system() to avoid shell quoting issues. 
string reads_arg = "@" + list_file; - string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_minimizer_k); + string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_sampling_minimizer_k); const char* kmc_argv[] = {"kmc", kmc_k_arg.c_str(), "-m128", "-okff", "-hp", reads_arg.c_str(), kmc_prefix.c_str(), kmc_tmp_dir.c_str(), nullptr}; diff --git a/src/index_registry.hpp b/src/index_registry.hpp index bdc08c8b1f..2e20291423 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -131,12 +131,16 @@ struct IndexingParameters { static double thread_chunk_inflation_factor; // whether indexing algorithms will log progress (if available) [Basic] static Verbosity verbosity; - // reference samples to override in GBZ during haplotype sampling [{}] - static std::unordered_set haplotype_reference_samples; + // Set of samples to make references and retain during haplotype sampling [{}] + static std::unordered_set haplotype_sampling_reference_samples; + // Number of haplotypes to sample during haplotype sampling [Recombinator::NUM_CANDIDATES] + static size_t haplotype_sampling_num_haplotypes; + // Whether to do diploid sampling during haplotype sampling [true] + static bool haplotype_sampling_diploid; // length of k-mer used in haplotype index and kmer counting [29] - static int haplotype_minimizer_k; + static int haplotype_sampling_minimizer_k; // length of window used in haplotype index [11] - static int haplotype_minimizer_w; + static int haplotype_sampling_minimizer_w; }; /** diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 8846c28530..50095f9e24 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -659,17 +659,21 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map(optarg); + if (IndexingParameters::haplotype_sampling_num_haplotypes == 0) { + logger.error() << "number of haplotypes cannot be 0" << std::endl; + } + break; + + case 
OPT_NO_DIPLOID_SAMPLING: + IndexingParameters::haplotype_sampling_diploid = false; break; case 'G': @@ -1263,10 +1289,6 @@ int main_giraffe(int argc, char** argv) { output_basename = optarg; break; - case OPT_SET_REFERENCE: - reference_samples.insert(optarg); - break; - case OPT_REPORT_NAME: report_name = ensure_writable(logger, optarg); break; @@ -1502,15 +1524,16 @@ int main_giraffe(int argc, char** argv) { << " that sample name is " << sample_scope << std::endl; } } else { - sample_scope = "sample"; + logger.error() << "Unable to determine a sample name for haplotype sampling; provide --sample" << endl; } } if (sample_scope == "giraffe") { logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; } - // Pass parameters to the recipes through IndexingParameters. - IndexingParameters::haplotype_reference_samples = reference_samples; + // Pass parameters not directly provided in option parsers to the + // recipes through IndexingParameters. + IndexingParameters::haplotype_sampling_reference_samples = reference_samples; if (kff_filename.empty() && fastq_filename_1.empty()) { // TODO: Eventually learn to kmer count from GAM @@ -1631,16 +1654,6 @@ int main_giraffe(int argc, char** argv) { } } - if (haplotype_sampling && !registry.available("Giraffe GBZ") && !registry.available("GBZ")) { - // When doing haplotype sampling, we need to name our sampled GBZ just - // .gbz on top of the basename we put the sample name in, because the - // tests require it. This means we can't build a base .gbz and a - // sampled .gbz in the same indexing run, or they will fight over the - // name. So make sure we don't try. - logger.error() << "Cannot do haplotype sampling without a GBZ available" << std::endl; - } - // TODO: Actually force the index registry to use the name we want for the sampled GBZ even though it's not the one it would generate. 
- //If we're making new zipcodes, we should rebuild the minimizers too if (!(indexes_and_extensions.count(std::string("Long Read Minimizers")) || indexes_and_extensions.count(std::string("Long Read PathMinimizers"))) && indexes_and_extensions.count(std::string("Long Read Zipcodes"))) { logger.info() << "Rebuilding minimizer index to include zipcodes" << endl; diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index e188cf8a9c..3acbb64aca 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -151,5 +151,5 @@ You may also want to read about the [best practices](https://github.com/vgteam/v Remember to clean up after trying all the examples: ``` -rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam +rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam integration.* ``` diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index 421d61f89e..500788b926 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 40 +plan tests 44 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph: # - GRCh38#chr6:31498145-31511124 (micb) @@ -118,6 +118,20 @@ vg giraffe -Z full.gbz --haplotype-sampling \ is $? 
0 "Giraffe does fully automatic haplotype sampling" is $(vg gbwt -H -Z auto_all.HG003.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" +# Giraffe integration, non-diploid +vg giraffe -Z full.gbz --haplotype-sampling --no-diploid-sampling --num-haplotypes 3 \ + --index-basename auto_nondip -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam 2> /dev/null +is $? 0 "Giraffe does non-diploid haplotype sampling" +is $(vg gbwt -H -Z auto_nondip.HG003.gbz) 5 "non-diploid haplotype sampling sampling produces 3 requested + 2 reference haplotypes" + +# Giraffe integration, single reference +vg giraffe -Z full.gbz --haplotype-sampling --set-reference GRCh38 \ + --index-basename auto_oneref -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam 2> /dev/null +is $? 0 "Giraffe does haplotype sampling when setting reference" +is $(vg gbwt -H -Z auto_oneref.HG003.gbz) 3 "setting reference produces 2 diploid + 1 reference haplotypes" + # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt is $? 
1 "sampling with mismatched GBZ and haplotype information fails" @@ -133,4 +147,6 @@ rm -f GRCh38.HG003.* HG003_GRCh38.gam rm -f auto_hapl.HG003.* auto_hapl.gam rm -f auto_kff.HG003.* auto_kff.gam rm -f auto_all.HG003.* auto_all.gam +rm -f auto_nondip.HG003.* auto_nondip.gam +rm -f auto_oneref.HG003.* auto_oneref.gam rm -f log.txt From 14950bd21140366845d7f2e5e11b6e50ec402844 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 10 Mar 2026 18:22:32 -0400 Subject: [PATCH 15/22] Wrap long help --- src/subcommand/giraffe_main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 50095f9e24..910bfeab40 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -661,7 +661,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map Date: Tue, 10 Mar 2026 18:50:18 -0400 Subject: [PATCH 16/22] Fix CLI code to pass linting --- src/subcommand/giraffe_main.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 910bfeab40..c336cee7f3 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -667,8 +667,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map Date: Wed, 11 Mar 2026 11:44:50 -0400 Subject: [PATCH 17/22] Implement Snakemake-y wildcards to try and let scopes control what suffix we use --- src/index_registry.cpp | 229 ++++++++++++++++++++++---------- src/index_registry.hpp | 67 ++++++---- src/subcommand/giraffe_main.cpp | 10 +- 3 files changed, 202 insertions(+), 104 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index fdf79cbb38..ce780fea5d 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -628,7 +628,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); - 
registry.register_index("Giraffe GBZ", std::vector{"gbz", "giraffe.gbz"}); + registry.register_index("Giraffe GBZ", std::vector{"{sample}.gbz", "giraffe.gbz"}); registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); registry.register_index("Top Level Chain Distance Index", "tcdist"); registry.register_index("r Index", "ri"); @@ -4605,15 +4605,20 @@ bool IndexingPlan::is_intermediate(const IndexName& identifier) const { return !targets.count(identifier); } -const vector IndexingPlan::get_scopes(const IndexName& identifier) const { +const map IndexingPlan::get_scopes(const IndexName& identifier) const { auto found = scopes.find(identifier); if (found == scopes.end()) { return {}; } - // TODO: we can't really return an empty vector reference so we are stuck doing a copy here. + // We can't refer into a new slot so we need to copy return found->second; } +const map& IndexingPlan::get_scopes(const IndexName& identifier) { + // We can create the entry if not present + return scopes[identifier]; +} + int64_t IndexingPlan::target_memory_usage() const { return IndexingParameters::max_memory_proportion * registry->get_target_memory_usage(); } @@ -4638,18 +4643,58 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, // we're not saving this file, make it temporary filepath << registry->get_work_dir() + "/" + sha1sum(identifier); } - for (auto& scope : get_scopes(identifier)) { - // This index belongs to just a sample or something. - filepath << "." << scope; + + // We need the planned scopes, not the scopes for a completed file. 
+ auto planned_scopes = get_scopes(identifier); + string substituted_suffix; + for (auto& suffix : registry->get_index(identifier)->get_suffixes()) { + // Find the first suffix where all its wildcards are filled in by scopes + auto needed_keys = IndexRegistry::get_wildcards(suffix); + bool missing_keys = false; + for (auto& key : needed_keys) { + if (!planned_scopes.count(key)) { + missing_keys = true; + break; + } + } + if (missing_keys) { + continue; + } + // Now we know we want this suffix + + // Substitute the suffix + substituted_suffix = IndexRegistry::substitute_wildcards(suffix, planned_scopes); + + // Make the set of scopes not accounted for in the suffix. + // TODO: Merge with substitution? + // TODO: Just make all scope-able files say so? + set unused_scopes; + for (auto& kv : planned_scopes) { + unused_scopes.insert(kv.first); + } + for (auto& used : needed_keys) { + unused_scopes.erase(used); + } + + for (auto& key : unused_scopes) { + // Put the extra scopes in the filename + filepath << "." << planned_scopes.at(key); + } + + break; + } + if (substituted_suffix.empty()) { + // We really should find a suffix to use + throw std::runtime_error("No suffix for " + identifier + " can be used with available scopes."); } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes - // that start with digits) + // that start with digits). + // TODO: What if a scope starts with digits? Put the numbers somewhere else? filepath << "." << to_string(chunk); } - // Use the suffix we've assigned, which is the first one we have that's - // unique within the scopes. - filepath << "." << assigned_suffix.at(identifier); + // Use the suffix we've substituted. + filepath << "." 
<< substituted_suffix; return filepath.str(); } @@ -4690,14 +4735,13 @@ set IndexingPlan::dependents(const IndexName& identifier) const { return dependent_steps; } -void IndexingPlan::add_scope(const IndexName& identifier, const string& scope) { +void IndexingPlan::add_scope(const IndexName& identifier, const string& key, const string& value) { // Get a mutable reference to the scopes we need to add to. auto& index_scopes = scopes[identifier]; - // TODO: Change the data structure here if we anticipate an appreciable number of scopes. - auto found = std::find(index_scopes.begin(), index_scopes.end(), scope); + auto found = index_scopes.find(key); if (found == index_scopes.end()) { - index_scopes.push_back(scope); + index_scopes.emplace_hint(found, key, value); } } @@ -4780,9 +4824,9 @@ void IndexRegistry::make_indexes(const vector& identifiers) { if (!index->was_provided_directly()) { // and assign the new (or first) ones index->assign_constructed(results); - for (auto& scope : plan.get_scopes(*it)) { + for (auto& scope_kv : plan.get_scopes(*it)) { // and keep their scopes (in case we ever need them) - index->add_scope(scope); + index->add_scope(scope_kv.first, scope_kv.second); } } ++it; @@ -4915,12 +4959,96 @@ void IndexRegistry::register_index(const IndexName& identifier, const vector IndexRegistry::get_wildcards(const string& pattern) { + set result; + size_t wildcard_start = std::numeric_limits::max(); + for (size_t i = 0; i < pattern.size(); i++) { + // Scan the string and find the wildcards + auto& c = pattern[i]; + if (wildcard_start == std::numeric_limits::max()) { + // Need an open brace + if (c == '{') { + wildcard_start = i; + } else if (c == '}') { + throw std::runtime_error("Unmatched closing brace in " + pattern); + } + } else { + // Need a close brace + if (c == '}') { + if (i - wildcard_start < 2) { + // There's no name here + throw std::runtime_error("Empty wildcard in " + pattern); + } + string name = pattern.substr(wildcard_start + 1, i - 
wildcard_start - 2); + auto found = result.find(name); + if (found != result.end()) { + throw std::runtime_error("Duplicate wildcard in " + pattern); + } + // Save the new wildcard + result.emplace_hint(found, name); + // Go back to not-in-a-wildcard state + wildcard_start = std::numeric_limits::max(); + } + } + } + if (wildcard_start != std::numeric_limits::max()) { + // We ended while reading a wildcard + throw std::runtime_error("Unmatched opening brace in " + pattern); + } + return result; +} + +string IndexRegistry::substitute_wildcards(const string& pattern, const map values) { + // TODO: Deduplicate code with get_wildcards somehow + + std::stringstream result; + + size_t wildcard_start = std::numeric_limits::max(); + for (size_t i = 0; i < pattern.size(); i++) { + // Scan the string and find the wildcards + auto& c = pattern[i]; + if (wildcard_start == std::numeric_limits::max()) { + // Need an open brace + if (c == '{') { + wildcard_start = i; + } else if (c == '}') { + throw std::runtime_error("Unmatched closing brace in " + pattern); + } else { + result << c; + } + } else { + // Need a close brace + if (c == '}') { + if (i - wildcard_start < 2) { + // There's no name here + throw std::runtime_error("Empty wildcard in " + pattern); + } + string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 2); + auto found = values.find(name); + if (found == values.end()) { + throw std::runtime_error("Undefined wildcard " + name + " in " + pattern); + } + // Output the wildcard value + result << found->second; + // Go back to not-in-a-wildcard state + wildcard_start = std::numeric_limits::max(); + } + } + } + if (wildcard_start != std::numeric_limits::max()) { + // We ended while reading a wildcard + throw std::runtime_error("Unmatched opening brace in " + pattern); + } + + return result.str(); +} + -void IndexRegistry::provide(const IndexName& identifier, const string& filename, const std::string& scope) { - provide(identifier, vector(1, filename), 
scope); +void IndexRegistry::provide(const IndexName& identifier, const string& filename, const map& scopes) { + provide(identifier, vector(1, filename), scopes); } -void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const std::string& scope) { +void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const map& scopes) { if (IndexingParameters::verbosity >= IndexingParameters::Debug) { info(context) << "Provided: " << identifier << endl; } @@ -4934,8 +5062,8 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f } auto index = get_index(identifier); index->provide(filenames); - if (!scope.empty()) { - index->add_scope(scope); + for (auto& scope_kv : scopes) { + index->add_scope(scope_kv.first, scope_kv.second); } } @@ -5870,52 +5998,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { for (const IndexName& dependent_index_name : dependent_recipe.first) { // For each index the recipe generates - for (auto& scope : provided_scopes) { + for (auto& scope_kv : provided_scopes) { // Make sure that index is scoped with all scopes on the input. - plan.add_scope(dependent_index_name, scope); + plan.add_scope(dependent_index_name, scope_kv.first, scope_kv.second); } - - // Note that all outputs will get their scopes from the - // inputs added in the same order (the order that the - // inputs appear in the plan). - } - } - } - } - - // Assign each index a unique scoped suffix, so we can use ..gbz - // for the Giraffe GBZ and .gbz for the plain one. 
- unordered_set used_scoped_suffixes; - for (auto& step : plan.steps) { - for (const IndexName& index_name : step.first) { - auto index = get_index(index_name); - // If scopes were provided, this is an input, and just use those - vector scopes = index->get_scopes(); - if (scopes.empty()) { - // Otherwise see if we have planned scopes - scopes = plan.get_scopes(index_name); - } - - // Build the part that is any scopes (either provided or planned) - std::stringstream suffix_base; - for (auto& scope : scopes) { - suffix_base << scope << "."; - } - - for (auto& suffix : index->get_suffixes()) { - // Condider this suffix option on top of the scopes - string candidate_suffix = suffix_base.str() + suffix; - - auto found = used_scoped_suffixes.find(candidate_suffix); - if (found == used_scoped_suffixes.end()) { - // This is a fresh one. Use it. - used_scoped_suffixes.emplace_hint(found, candidate_suffix); - // We only remember the suffix itself; we reconstruct the - // actual name later, with chunk info inserted. - // TODO: Instead of building an essentially fake string key - // should we do something else? - plan.assigned_suffix[index_name] = suffix; - break; } } } @@ -6111,23 +6197,22 @@ bool IndexFile::was_provided_directly() const { return provided_directly; } -void IndexFile::add_scope(const string& scope) { +void IndexFile::add_scope(const string& key, const string& value) { if (!is_finished()) { // Scopes for nonexistent files belong to the plan, not us. - throw std::logic_error("Cannot assign " + scope + " scope to unfinished " + get_identifier() + " index"); + throw std::logic_error("Cannot assign " + key + " = " + value + " scope to unfinished " + get_identifier() + " index"); } - // TODO: Change the data structure here if we anticipate an appreciable number of scopes. 
- auto found = std::find(scopes.begin(), scopes.end(), scope); + auto found = scopes.find(key); if (found == scopes.end()) { - scopes.push_back(scope); + scopes.emplace_hint(found, key, value); #ifdef debug_index_registry - std::cerr << "Added scope " << scope << " to " << get_identifier() << std::endl; + std::cerr << "Added scope " << key << " = " + value + " to " << get_identifier() << std::endl; #endif } } -const vector& IndexFile::get_scopes() const { +const map& IndexFile::get_scopes() const { return scopes; } diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 2e20291423..87dbb3d222 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -180,6 +180,13 @@ class IndexingPlan { string output_filepath(const IndexName& identifier) const; /// Get the suffix with which to save the given index's files. + /// + /// The path will substitute scopes into any {wildcards} in the index's + /// suffix, and scopes not used in the suffix will appear in the + /// filename in key order. + /// + /// The suffix used will be the first one for which all wildcards can be + /// substituted. string output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const; /// Ge the steps of the plan @@ -189,10 +196,15 @@ class IndexingPlan { /// plan, and false if it is to be preserved. bool is_intermediate(const IndexName& identifier) const; - /// Get the deduplicated scopes, in addition order, that the given index - /// will be qualified with when generated. Only works for indexes that will - /// be generated by the plan, not for inputs. - const vector get_scopes(const IndexName& identifier) const; + /// Get the scopes that the given index will be qualified with when + /// generated. Only works for indexes that will be generated by the plan, + /// not for inputs. + const map get_scopes(const IndexName& identifier) const; + + /// Get the scopes that the given index will be qualified with when + /// generated. 
Only works for indexes that will be generated by the plan, + /// not for inputs. + const map& get_scopes(const IndexName& identifier); // TODO: is this where this function wants to live? /// The memory limit, with a little slosh for prediction inaccuracy @@ -208,7 +220,7 @@ class IndexingPlan { /// Add a scope to the scopes the given index will be qualified with when /// generated, if not present already. - void add_scope(const IndexName& identifier, const string& scope); + void add_scope(const IndexName& identifier, const string& key, const string& value); /// The steps to be invoked in the plan. May be empty before the plan is /// actually planned. @@ -217,11 +229,9 @@ class IndexingPlan { set targets; /// The scopes qualifying each index in the plan (for example, if an index /// applies only to a particular sample, it will be scoped to the sample - /// name). Stored deduplicated and in the order added. - map> scopes; - /// The unique (when combined with scopes) suffix for each index in the - /// plan. - map assigned_suffix; + /// name). Stored using an ordered map to ensure a consistent iteration + /// order. + map> scopes; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -275,9 +285,15 @@ class IndexRegistry { void register_index(const IndexName& identifier, const string& suffix); /// Register an index containing the given identifier, with multiple possible suffixes. - /// The first suffix not already occupied will be used (allowing scopes to - /// disambiguate between unscoped and scoped indexes with the same suffix). + /// The first suffix where all {wildcards} can be substituted with scopes will be used. void register_index(const IndexName& identifier, const vector& suffixes); + + /// Get the names of all brace-enclosed {wildcards} in the given pattern + static set get_wildcards(const string& pattern); + + /// Substitute wildcards into the given pattern. All wildcards must have + /// values assigned. 
Extra values not used are allowed. + static string substitute_wildcards(const string& pattern, const map values); /// Register a recipe to produce an index using other indexes /// or input files. Recipes registered earlier will have higher priority. @@ -288,26 +304,22 @@ class IndexRegistry { /// Indicate one recipe is a broadened version of another. The indexes consumed and produced /// by the generalization must be semantically identical to those of the generalizee void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee); - + /// Indicate a serialized file that contains some identified index, - /// optionally with a scope that propagates to descendant files. - /// - /// The empty string represents no scope. + /// optionally with scopes that propagate to descendant files. /// /// TODO: If scopes contain ".", we can run into problems with combinations /// of different scopes producing the same final string. Right now we only /// use one kind of scope, which avoids this. - void provide(const IndexName& identifier, const string& filename, const std::string& scope = ""); + void provide(const IndexName& identifier, const string& filename, const map& scopes = {}); /// Indicate a list of serialized files that contains some identified index, - /// optionally with a scope that propagates to descendant files. - /// - /// The empty string represents no scope. + /// optionally with scopes that propagate to descendant files. /// /// TODO: If scopes contain ".", we can run into problems with combinations /// of different scopes producing the same final string. Right now we only /// use one kind of scope, which avoids this. 
- void provide(const IndexName& identifier, const vector& filenames, const std::string& scope = ""); + void provide(const IndexName& identifier, const vector& filenames, const map& scopes = {}); /// Remove a provided index void reset(const IndexName& identifier); @@ -447,17 +459,16 @@ class IndexFile { /// Assign constructed filenames to this index void assign_constructed(const vector& filenames); - /// Append a new scope to qualify the files in the index, if the scope is + /// Add a new scope to qualify the files in the index, if the scope is /// not already on the index. /// /// Should only be used on files that are actually filled in already; /// scopes for files that don't exist yet are the responsibility of the /// IndexingPlan. - void add_scope(const string& scope); + void add_scope(const string& key, const string& value); /// Get all scopes qualifying the index (such as a sample name). - /// Scopes are deduplicated and produced in the order added. - const vector& get_scopes() const; + const map& get_scopes() const; /// Returns true if the index has already been built or provided bool is_finished() const; @@ -479,10 +490,8 @@ class IndexFile { // the filename(s) associated with the index vector filenames; - // The dedplicated ordered scopes qualifying the index. - // TODO: Use an ordered set for good algorithmics for deduplicating scopes. - // Right now we have exactly one kind of scope so this doesn't matter. - vector scopes; + // The scopes qualifying the index. 
+ map scopes; // keep track of whether the index was provided directly bool provided_directly = false; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index c336cee7f3..522b15f41d 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -1542,7 +1542,7 @@ int main_giraffe(int argc, char** argv) { } if (!kff_filename.empty()) { - registry.provide("KFF Kmer Counts", kff_filename, sample_scope); + registry.provide("KFF Kmer Counts", kff_filename, {{"sample", sample_scope}}); } if (!fastq_filename_1.empty()) { // If FASTQs are available, send those and we will do haplotype sampling. @@ -1552,7 +1552,7 @@ int main_giraffe(int argc, char** argv) { } // Provide the FASTQs to the index registry, but mark everything // derived from them as being scoped to the particular sample. - registry.provide("FASTQ", fastqs, sample_scope); + registry.provide("FASTQ", fastqs, {{"sample", sample_scope}}); } @@ -1640,7 +1640,11 @@ int main_giraffe(int argc, char** argv) { // Report it because this may not be desired behavior. logger.info() << "Guessing that " << inferred_filename << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename, haplotype_sampling ? 
sample_scope : ""); + std::map scopes; + if (haplotype_sampling) { + scopes.emplace("sample", sample_scope); + } + registry.provide(index_and_extensions->first, inferred_filename, scopes); found = true; // Skip other extension options for the index break; From 9272a72c26f354add2fe6e15e4a544663f822cf7 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 11 Mar 2026 11:49:15 -0400 Subject: [PATCH 18/22] Don't show wildcards in get_possible_filenames() --- src/index_registry.cpp | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index ce780fea5d..4ab5276b32 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -5096,9 +5096,25 @@ vector IndexRegistry::get_possible_filenames(const IndexName& identifier error(context) << "cannot require unregistered index: " << identifier << endl; } const IndexFile* index = get_index(identifier); + auto& index_scopes = index->get_scopes(); vector filenames; for (auto& suffix : index->get_suffixes()) { - // Try all the possible suffixes + // Find all suffixes where all wildcards are filled in by scopes. + // TODO: We might want to really match the wildcards against filenames, + // Snakemake-style. + auto needed_keys = IndexRegistry::get_wildcards(suffix); + bool missing_keys = false; + for (auto& key : needed_keys) { + if (!index_scopes.count(key)) { + missing_keys = true; + break; + } + } + if (missing_keys) { + continue; + } + + // Try all those suffixes filenames.push_back(get_prefix() + "." 
+ suffix); } return filenames; From 2171d4a30d6270a11223d28950e77bfa9e87e691 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Wed, 11 Mar 2026 12:18:05 -0400 Subject: [PATCH 19/22] Fix wildcard parsing --- src/index_registry.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 4ab5276b32..4368fd5e5e 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4654,6 +4654,9 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, for (auto& key : needed_keys) { if (!planned_scopes.count(key)) { missing_keys = true; +#ifdef debug_index_registry + std::cerr << "Pattern " << suffix << " can't be used because " << key << " scope is not available for " << identifier << std::endl; +#endif break; } } @@ -4676,6 +4679,10 @@ string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, unused_scopes.erase(used); } +#ifdef debug_index_registry + std::cerr << "Pattern " << suffix << " substituted as " << substituted_suffix << " for " << identifier << "; " << unused_scopes.size() << " extra scopes" << std::endl; +#endif + for (auto& key : unused_scopes) { // Put the extra scopes in the filename filepath << "." 
<< planned_scopes.at(key); @@ -4979,7 +4986,7 @@ set IndexRegistry::get_wildcards(const string& pattern) { // There's no name here throw std::runtime_error("Empty wildcard in " + pattern); } - string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 2); + string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 1); auto found = result.find(name); if (found != result.end()) { throw std::runtime_error("Duplicate wildcard in " + pattern); @@ -5023,7 +5030,7 @@ string IndexRegistry::substitute_wildcards(const string& pattern, const map Date: Wed, 11 Mar 2026 12:28:18 -0400 Subject: [PATCH 20/22] Add tests to keep wildcard system working --- src/index_registry.cpp | 4 ++ src/unittest/index_registry.cpp | 65 +++++++++++++++++++++++++++++++++ 2 files changed, 69 insertions(+) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 4368fd5e5e..b58748d5b2 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -4995,6 +4995,8 @@ set IndexRegistry::get_wildcards(const string& pattern) { result.emplace_hint(found, name); // Go back to not-in-a-wildcard state wildcard_start = std::numeric_limits::max(); + } else if (c == '{') { + throw std::runtime_error("Opening brace inside wildcard " + pattern); } } } @@ -5039,6 +5041,8 @@ string IndexRegistry::substitute_wildcards(const string& pattern, const mapsecond; // Go back to not-in-a-wildcard state wildcard_start = std::numeric_limits::max(); + } else if (c == '{') { + throw std::runtime_error("Opening brace inside wildcard " + pattern); } } } diff --git a/src/unittest/index_registry.cpp b/src/unittest/index_registry.cpp index 6c71fd8587..babbc377ac 100644 --- a/src/unittest/index_registry.cpp +++ b/src/unittest/index_registry.cpp @@ -279,5 +279,70 @@ TEST_CASE("IndexRegistry can make plans on a dummy recipe graph", "[indexregistr // } } +TEST_CASE("IndexRegistry wildcard parsing works", "[indexregistry]") { + + SECTION("empty string") { + auto wildcards = 
IndexRegistry::get_wildcards(""); + REQUIRE(wildcards.empty()); + } + + SECTION("string with no wildcards") { + auto wildcards = IndexRegistry::get_wildcards("no wildcards here"); + REQUIRE(wildcards.empty()); + } + + SECTION("string with one wildcard") { + auto wildcards = IndexRegistry::get_wildcards("some {wildcards} here"); + REQUIRE(wildcards.size() == 1); + REQUIRE(wildcards.count("wildcards")); + } + + SECTION("string with several wildcards") { + auto wildcards = IndexRegistry::get_wildcards("{some} {wildcards} {here}"); + REQUIRE(wildcards.size() == 3); + REQUIRE(wildcards.count("some")); + REQUIRE(wildcards.count("wildcards")); + REQUIRE(wildcards.count("here")); + + // Make sure they iterate in alphabetical order + std::vector found; + for (auto& w : wildcards) { + found.push_back(w); + } + REQUIRE(found.at(0) == "here"); + REQUIRE(found.at(1) == "some"); + REQUIRE(found.at(2) == "wildcards"); + } +} + +TEST_CASE("IndexRegistry wildcard substitution works", "[indexregistry]") { + + SECTION("empty string, empty valuation") { + auto result = IndexRegistry::substitute_wildcards("", {}); + REQUIRE(result == ""); + } + + SECTION("empty string, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == ""); + } + + SECTION("nonempty string, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("stuff", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stuff"); + } + + SECTION("nonempty string with some wildcards, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("st{a}ff", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stthingff"); + } + + SECTION("nonempty string with all wildcards, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("st{a}ff{b}", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stthingffanother thing"); + } + +} + } } From 
89b6bc6581f90c9af0e7b38f35dd23291cd24da3 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 16 Mar 2026 16:26:00 -0400 Subject: [PATCH 21/22] Add missing filesystem include --- src/index_registry.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/index_registry.cpp b/src/index_registry.cpp index 7bdeb97bbd..0eb26fedbb 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include From 62dfb3ccf2cc9dd1e2c5aa924da8e8c33b283306 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Tue, 17 Mar 2026 14:15:20 -0400 Subject: [PATCH 22/22] Save Giraffe logs and clean up any wayward old outputs --- test/t/54_vg_haplotypes.t | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index 500788b926..b94d4d12af 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -70,24 +70,27 @@ is $(vg gbwt -C -Z diploid3.gbz) 2 "2 contigs" is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" # Giraffe integration, guessed output name +rm -f full.HG003.* default.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ - -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > default.gam is $? 0 "Giraffe integration with a guessed output name" cmp diploid.gbz full.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name +rm -f sampled.003HG.* specified.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ --index-basename sampled -N 003HG \ - -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > specified.gam is $? 0 "Giraffe integration with a specified output name" cmp full.HG003.gbz sampled.003HG.gbz is $? 
0 "the sampled graphs are identical" # Giraffe integration, specified reference sample +rm -f GRCh38.HG003.* HG003_GRCh38.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ - -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam is $? 0 "Giraffe integration with a specified reference sample" cmp diploid3.gbz GRCh38.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" @@ -95,42 +98,47 @@ is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, haplotype index built automatically from the GBZ. # Providing --kff-name without --haplotype-name implies haplotype sampling; # the haplotype index (.hapl) is built by the IndexRegistry from the GBZ. +rm -f auto_hapl.HG003.* auto_hapl.gam vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ --index-basename auto_hapl -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam is $? 0 "Giraffe builds haplotype index automatically" -is $(vg gbwt -H -Z auto_hapl.HG003.gbz) 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_hapl.HG003.gbz)" 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" # Giraffe integration, kmer counting done automatically from reads. # Providing --haplotype-name without --kff-name implies haplotype sampling; # the kmer counts (.kff) are built by the IndexRegistry from the reads using kmc. +rm -f auto_kff.HG003.* auto_kff.gam vg giraffe -Z full.gbz --haplotype-name full.hapl \ --index-basename auto_kff -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_kff.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_kff.gam is $? 
0 "Giraffe counts kmers from reads automatically" -is $(vg gbwt -H -Z auto_kff.HG003.gbz) 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_kff.HG003.gbz)" 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes" # Giraffe integration, fully automatic: both hapl and kff are built by the # IndexRegistry. Triggered by --haplotype-sampling without either input file. +rm -f auto_all.HG003.* auto_all.gam vg giraffe -Z full.gbz --haplotype-sampling \ --index-basename auto_all -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_all.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_all.gam is $? 0 "Giraffe does fully automatic haplotype sampling" -is $(vg gbwt -H -Z auto_all.HG003.gbz) 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_all.HG003.gbz)" 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes" # Giraffe integration, non-diploid +rm -f auto_nondip.HG003.* auto_nondip.gam vg giraffe -Z full.gbz --haplotype-sampling --no-diploid-sampling --num-haplotypes 3 \ --index-basename auto_nondip -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam is $? 0 "Giraffe does non-diploid haplotype sampling" -is $(vg gbwt -H -Z auto_nondip.HG003.gbz) 5 "non-diploid haplotype sampling sampling produces 3 requested + 2 reference haplotypes" +is "$(vg gbwt -H -Z auto_nondip.HG003.gbz)" 5 "non-diploid haplotype sampling sampling produces 3 requested + 2 reference haplotypes" # Giraffe integration, single reference +rm -f auto_oneref.HG003.* auto_oneref.gam vg giraffe -Z full.gbz --haplotype-sampling --set-reference GRCh38 \ --index-basename auto_oneref -N HG003 \ - -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam is $? 
0 "Giraffe does haplotype sampling when setting reference" -is $(vg gbwt -H -Z auto_oneref.HG003.gbz) 3 "setting reference produces 2 diploid + 1 reference haplotypes" +is "$(vg gbwt -H -Z auto_oneref.HG003.gbz)" 3 "setting reference produces 2 diploid + 1 reference haplotypes" # Attempts to use mismatched files fail vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt