diff --git a/Brewfile b/Brewfile index 2869f99db40..9a7376ad589 100644 --- a/Brewfile +++ b/Brewfile @@ -22,3 +22,5 @@ brew "boost" brew "pybind11" brew "pandoc" brew "openssl" +tap "brewsci/bio" +brew "brewsci/bio/kmc" diff --git a/Dockerfile b/Dockerfile index 2a1514a81e5..1fbc47e12ed 100644 --- a/Dockerfile +++ b/Dockerfile @@ -44,7 +44,7 @@ RUN apt-get -qq -y update && apt-get -qq -y upgrade && apt-get -y install \ samtools curl unzip redland-utils librdf-dev cmake pkg-config wget gtk-doc-tools \ raptor2-utils rasqal-utils bison flex gawk libgoogle-perftools-dev liblz4-dev liblzma-dev \ libcairo2-dev libpixman-1-dev libffi-dev libcairo-dev libprotobuf-dev libboost-all-dev \ - tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev + tabix bcftools libzstd-dev pybind11-dev python3-pybind11 pandoc libssl-dev kmc ###DEPS_END### FROM packages AS build diff --git a/README.md b/README.md index 592270d4634..a3e1d5e4cd7 100644 --- a/README.md +++ b/README.md @@ -91,7 +91,7 @@ On other distros, or if you do not have root access, you will need to perform th automake gettext autopoint libtool jq bsdmainutils bc rs parallel \ npm curl unzip redland-utils librdf-dev bison flex gawk lzma-dev \ liblzma-dev liblz4-dev libffi-dev libcairo-dev libboost-all-dev \ - libzstd-dev pybind11-dev python3-pybind11 libssl-dev + libzstd-dev pybind11-dev python3-pybind11 libssl-dev kmc At present, you will need GCC version 9 or greater, with support for C++17, to compile vg. (Check your version with `gcc --version`.) GCC up to 11.4.0 is supported. 
diff --git a/doc/test-docs.sh b/doc/test-docs.sh index 246e7c07aa0..c9f0e6f9b05 100755 --- a/doc/test-docs.sh +++ b/doc/test-docs.sh @@ -16,6 +16,9 @@ cd "${HERE}/../test" echo txm --jobs 1 "${HERE}/../README.md" txm --jobs 1 "${HERE}/../README.md" +# Test extra test READMEs +(cd haplotype-sampling && txm --jobs 1 "README.md") + # Run all the wiki tests find "${HERE}/wiki" -name "*.md" | xargs -n 1 -t txm --jobs 1 diff --git a/src/index_registry.cpp b/src/index_registry.cpp index db3aafc8960..0eb26fedbbf 100644 --- a/src/index_registry.cpp +++ b/src/index_registry.cpp @@ -15,15 +15,20 @@ #include #include #include +#include #include #include +#include +#include #include #include +#include #include #include #include #include +#include #include #include #include @@ -54,6 +59,7 @@ #include "gfa.hpp" #include "job_schedule.hpp" #include "path.hpp" +#include "recombinator.hpp" #include "zip_code.hpp" #include "io/save_handle_graph.hpp" @@ -125,6 +131,11 @@ int IndexingParameters::downsample_context_length = gbwtgraph::PATH_COVER_DEFAUL double IndexingParameters::max_memory_proportion = 0.75; double IndexingParameters::thread_chunk_inflation_factor = 2.0; IndexingParameters::Verbosity IndexingParameters::verbosity = IndexingParameters::Basic; +gbwtgraph::sample_name_set IndexingParameters::haplotype_sampling_reference_samples = {}; +size_t IndexingParameters::haplotype_sampling_num_haplotypes = Recombinator::NUM_CANDIDATES; +bool IndexingParameters::haplotype_sampling_diploid = true; +int IndexingParameters::haplotype_sampling_minimizer_k = 29; +int IndexingParameters::haplotype_sampling_minimizer_w = 11; void copy_file(const string& from_fp, const string& to_fp) { require_exists(context, from_fp); @@ -618,7 +629,13 @@ IndexRegistry VGIndexes::get_vg_index_registry() { registry.register_index("GBWTGraph", "gg"); registry.register_index("GBZ", "gbz"); - registry.register_index("Giraffe GBZ", "giraffe.gbz"); + registry.register_index("Giraffe GBZ", 
std::vector{"{sample}.gbz", "giraffe.gbz"}); + registry.register_index("Haplotype-Sampled GBZ", "sampled.gbz"); + registry.register_index("Top Level Chain Distance Index", "tcdist"); + registry.register_index("r Index", "ri"); + registry.register_index("FASTQ", "fastq"); + registry.register_index("Haplotype Index", "hapl"); + registry.register_index("KFF Kmer Counts", "kff"); registry.register_index("Short Read Minimizers", "shortread.withzip.min"); registry.register_index("Short Read Zipcodes", "shortread.zipcodes"); @@ -3955,10 +3972,12 @@ IndexRegistry VGIndexes::get_vg_index_registry() { }); //////////////////////////////////// - // GBZ Recipes + // Giraffe GBZ Recipes //////////////////////////////////// - - registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, + + // Top priority: If we have the inputs to do haplotype sampling, use a + // haplotype sampled GBZ. + registry.register_recipe({"Giraffe GBZ"}, {"Haplotype-Sampled GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, @@ -3966,62 +3985,18 @@ IndexRegistry VGIndexes::get_vg_index_registry() { alias_graph.register_alias(*constructing.begin(), inputs[0]); return vector>(1, inputs.front()->get_filenames()); }); - - registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, + + // Next priority: if we were handed a GBZ, use it as the Giraffe GBZ + registry.register_recipe({"Giraffe GBZ"}, {"GBZ"}, [](const vector& inputs, const IndexingPlan* plan, AliasGraph& alias_graph, const IndexGroup& constructing) { - if (IndexingParameters::verbosity != IndexingParameters::None) { - info(context) << "Constructing a GBZ from GFA input." << endl; - } - - assert(inputs.size() == 1); - if (inputs[0]->get_filenames().size() != 1) { - error(context) << "Graph construction does not support multiple GFAs at this time." 
<< endl; - } - auto gfa_filename = inputs[0]->get_filenames().front(); - - assert(constructing.size() == 1); - vector> all_outputs(constructing.size()); - auto gbz_output = *constructing.begin(); - auto& output_names = all_outputs[0]; - - string output_name = plan->output_filepath(gbz_output); - - gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); - // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, - // but it would be kind of a pain to update it like we do the global param - params.batch_size = IndexingParameters::gbwt_insert_batch_size; - params.sample_interval = IndexingParameters::gbwt_sampling_interval; - params.max_node_length = IndexingParameters::max_node_size; - // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, - // we could safely launch more jobs. Using 600 as the divisor would be a better - // estimate of the number of segments, but we chop long segments to 32 bp nodes. 
- params.parallel_jobs = guess_parallel_gbwt_jobs( - std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), - 100, - plan->target_memory_usage(), - params.batch_size - ); - params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; - if (IndexingParameters::verbosity >= IndexingParameters::Debug) { - info(context) << "Running " << params.parallel_jobs << " jobs in parallel" << std::endl; - } - - // jointly generate the GBWT and record sequences - unique_ptr gbwt_index; - unique_ptr seq_source; - tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); - - // convert sequences into GBZ and save - gbwtgraph::GBZ gbz(gbwt_index, seq_source); - save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); - - output_names.push_back(output_name); - return all_outputs; + alias_graph.register_alias(*constructing.begin(), inputs[0]); + return vector>(1, inputs.front()->get_filenames()); }); - + + // After that, start trying the old GBWT-based workflows registry.register_recipe({"Giraffe GBZ"}, {"GBWTGraph", "Giraffe GBWT"}, [](const vector& inputs, const IndexingPlan* plan, @@ -4054,7 +4029,7 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); - // Thses used to be a GBWTGraph recipe, but we don't want to produce GBWTGraphs anymore. + // These used to be a GBWTGraph recipe, but we don't want to produce GBWTGraphs anymore. 
registry.register_recipe({"Giraffe GBZ"}, {"Giraffe GBWT", "NamedNodeBackTranslation", "XG"}, [](const vector& inputs, @@ -4149,6 +4124,374 @@ IndexRegistry VGIndexes::get_vg_index_registry() { return all_outputs; }); + //////////////////////////////////// + // General GBZ Recipes + //////////////////////////////////// + + registry.register_recipe({"GBZ"}, {"Reference GFA w/ Haplotypes"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Constructing a GBZ from GFA input." << endl; + } + + assert(inputs.size() == 1); + if (inputs[0]->get_filenames().size() != 1) { + error(context) << "Graph construction does not support multiple GFAs at this time." << endl; + } + auto gfa_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(gbz_output); + + gbwtgraph::GFAParsingParameters params = get_best_gbwtgraph_gfa_parsing_parameters(); + // TODO: there's supposedly a heuristic to set batch size that could perform better than this global param, + // but it would be kind of a pain to update it like we do the global param + params.batch_size = IndexingParameters::gbwt_insert_batch_size; + params.sample_interval = IndexingParameters::gbwt_sampling_interval; + params.max_node_length = IndexingParameters::max_node_size; + // TODO: Here we assume that the GFA file contains 100 haplotypes. If there are more, + // we could safely launch more jobs. Using 600 as the divisor would be a better + // estimate of the number of segments, but we chop long segments to 32 bp nodes. 
+ params.parallel_jobs = guess_parallel_gbwt_jobs( + std::max(get_file_size(gfa_filename) / 300, std::int64_t(1)), + 100, + plan->target_memory_usage(), + params.batch_size + ); + params.show_progress = IndexingParameters::verbosity == IndexingParameters::Debug; + if (IndexingParameters::verbosity >= IndexingParameters::Debug) { + info(context) << "Running " << params.parallel_jobs << " jobs in parallel" << std::endl; + } + + // jointly generate the GBWT and record sequences + unique_ptr gbwt_index; + unique_ptr seq_source; + tie(gbwt_index, seq_source) = gbwtgraph::gfa_to_gbwt(gfa_filename, params); + + // convert sequences into GBZ and save + gbwtgraph::GBZ gbz(gbwt_index, seq_source); + save_gbz(gbz, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// + // Haplotype Sampling Recipes + //////////////////////////////////// + + // Haplotype sampling: produce a personalized "Haplotype-Sampled GBZ" + registry.register_recipe({"Haplotype-Sampled GBZ"}, {"GBZ", "Haplotype Index", "KFF Kmer Counts"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Sampling haplotypes." << endl; + } + + assert(inputs.size() == 3); + auto gbz_filenames = inputs[0]->get_filenames(); + auto hapl_filenames = inputs[1]->get_filenames(); + auto kff_filenames = inputs[2]->get_filenames(); + assert(gbz_filenames.size() == 1); + assert(hapl_filenames.size() == 1); + assert(kff_filenames.size() == 1); + auto gbz_filename = gbz_filenames.front(); + auto hapl_filename = hapl_filenames.front(); + auto kff_filename = kff_filenames.front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto gbz_output = *constructing.begin(); + auto& output_names = all_outputs[0]; + + // Load GBZ. 
+ gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Override reference samples if requested. + if (!IndexingParameters::haplotype_sampling_reference_samples.empty()) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Updating reference samples." << endl; + } + size_t present = gbz.set_reference_samples(IndexingParameters::haplotype_sampling_reference_samples); + if (present != IndexingParameters::haplotype_sampling_reference_samples.size()) { + warn(context) << "Only " << present << " out of " + << IndexingParameters::haplotype_sampling_reference_samples.size() + << " reference samples are present." << endl; + } + } + + // Load haplotype information. + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Loading haplotype information from " << hapl_filename << "." << endl; + } + Haplotypes haplotypes; + haplotypes.load_from(hapl_filename); + + // Sample haplotypes. + Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None + ? Haplotypes::verbosity_basic + : Haplotypes::verbosity_silent); + Recombinator recombinator(gbz, haplotypes, hap_verbosity); + Recombinator::Parameters parameters(Recombinator::Parameters::preset_default); + parameters.num_haplotypes = IndexingParameters::haplotype_sampling_num_haplotypes; + parameters.diploid_sampling = IndexingParameters::haplotype_sampling_diploid; + parameters.include_reference = true; + gbwt::GBWT sampled_gbwt = recombinator.generate_haplotypes(kff_filename, parameters); + + // Build GBZ. + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building GBZ." 
<< endl; + } + gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); + string output_name = plan->output_filepath(gbz_output); + save_gbz(sampled_graph, output_name, IndexingParameters::verbosity == IndexingParameters::Debug); + + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Saved GBZ to " << output_name << endl; + } + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build the r-index for a GBZ from its GBWT. + registry.register_recipe({"r Index"}, {"GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building r-index for GBZ." << endl; + } + + assert(inputs.size() == 1); + auto gbz_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + gbwt::FastLocate r_index(gbz.index); + string output_name = plan->output_filepath(*constructing.begin()); + save_r_index(r_index, output_name, IndexingParameters::verbosity != IndexingParameters::None); + + output_names.push_back(output_name); + return all_outputs; + }); + + registry.register_recipe({"Top Level Chain Distance Index"}, + {"GBZ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Building top-level-chain distance index for GBZ." 
<< endl; + } + + assert(inputs.size() == 1); + auto gbz_filename = inputs[0]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + string output_name = plan->output_filepath(*constructing.begin()); + SnarlDistanceIndex distance_index; + IntegratedSnarlFinder snarl_finder(gbz.graph); + // Only do top level chains + fill_in_distance_index(&distance_index, &gbz.graph, &snarl_finder, + 50000, true); + distance_index.serialize(output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "Haplotype Index" (.hapl) from the GBZ, its top-level-chain + // distance index, and its r-index. + registry.register_recipe({"Haplotype Index"}, + {"GBZ", + "Top Level Chain Distance Index", + "r Index"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Generating haplotype information." << endl; + } + + assert(inputs.size() == 3); + auto gbz_filename = inputs[0]->get_filenames().front(); + auto dist_filename = inputs[1]->get_filenames().front(); + auto ri_filename = inputs[2]->get_filenames().front(); + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + // Load GBZ. + gbwtgraph::GBZ gbz; + load_gbz(gbz, gbz_filename, IndexingParameters::verbosity == IndexingParameters::Debug); + + // Load distance index. + SnarlDistanceIndex distance_index; + distance_index.deserialize(dist_filename); + + // Load r-index and restore the GBWT pointer. 
+ gbwt::FastLocate r_index; + load_r_index(r_index, ri_filename, IndexingParameters::verbosity != IndexingParameters::None); + r_index.setGBWT(gbz.index); + + // Build minimizer index without payload. + MinimizerIndexParameters minimizer_params; + minimizer_params.minimizers(IndexingParameters::haplotype_sampling_minimizer_k, + IndexingParameters::haplotype_sampling_minimizer_w) + .verbose(IndexingParameters::verbosity >= IndexingParameters::Debug); + HaplotypePartitioner::minimizer_index_type minimizer_index = + build_minimizer_index(gbz, nullptr, nullptr, minimizer_params); + + // Partition the haplotypes. + Haplotypes::Verbosity hap_verbosity = (IndexingParameters::verbosity != IndexingParameters::None + ? Haplotypes::verbosity_basic + : Haplotypes::verbosity_silent); + HaplotypePartitioner partitioner(gbz, r_index, distance_index, minimizer_index, hap_verbosity); + HaplotypePartitioner::Parameters partitioner_params; + Haplotypes haplotypes = partitioner.partition_haplotypes(partitioner_params); + + // Save the haplotype information. + string output_name = plan->output_filepath(*constructing.begin()); + haplotypes.serialize_to(output_name); + + output_names.push_back(output_name); + return all_outputs; + }); + + // Build "KFF Kmer Counts" (.kff) by counting k-mers from the provided reads + // using kmc. + // + // Having the FASTQ available to the indexing logic is the limiting reagent + // that should determine if we do haplotype sampling or not, when given a + // GBZ. + registry.register_recipe({"KFF Kmer Counts"}, {"FASTQ"}, + [](const vector& inputs, + const IndexingPlan* plan, + AliasGraph& alias_graph, + const IndexGroup& constructing) { + if (IndexingParameters::verbosity != IndexingParameters::None) { + info(context) << "Counting k-mers from reads for haplotype sampling." 
<< endl; + } + + assert(inputs.size() == 1); + auto fastq_filenames = inputs[0]->get_filenames(); + if (fastq_filenames.empty()) { + error(context) << "No reads provided for k-mer counting." << endl; + } + + assert(constructing.size() == 1); + vector> all_outputs(constructing.size()); + auto& output_names = all_outputs[0]; + + string output_name = plan->output_filepath(*constructing.begin()); + + // Create a temp directory for kmc intermediate files. + string tmp_dir = temp_file::create_directory(); + + // Write a file listing the input reads. + string list_file = tmp_dir + "/reads.txt"; + { + ofstream list_out(list_file); + for (const string& fq : fastq_filenames) { + list_out << fq << "\n"; + } + } + + // kmc produces output_prefix.kff; strip the ".kff" extension to get the prefix. + string kmc_prefix; + if (output_name.size() >= 4 && output_name.substr(output_name.size() - 4) == ".kff") { + kmc_prefix = output_name.substr(0, output_name.size() - 4); + } else { + error(context) << "KFF file has wrong extension: " << output_name << endl; + } + + string kmc_tmp_dir = tmp_dir + "/kmc_tmp"; + mkdir(kmc_tmp_dir.c_str(), 0700); + assert(std::filesystem::exists(kmc_tmp_dir)); + + // Run kmc: count k-mers in KFF format with canonical k-mers. + // Use fork()+execvp() instead of system() to avoid shell quoting issues. + string reads_arg = "@" + list_file; + string kmc_k_arg = "-k" + to_string(IndexingParameters::haplotype_sampling_minimizer_k); + const char* kmc_argv[] = {"kmc", kmc_k_arg.c_str(), "-m128", "-okff", "-hp", + reads_arg.c_str(), kmc_prefix.c_str(), kmc_tmp_dir.c_str(), nullptr}; + + // Flush shared output streams before forking + cout.flush(); + cerr.flush(); + fflush(nullptr); + + pid_t pid = fork(); + if (pid == -1) { + error(context) << "fork() failed for kmc: " << strerror(errno) << endl; + } else if (pid == 0) { + // Child: redirect stdout/stderr to /dev/null when not verbose. 
+ if (IndexingParameters::verbosity == IndexingParameters::None) { + int devnull = open("/dev/null", O_WRONLY); + if (devnull != -1) { + dup2(devnull, STDOUT_FILENO); + dup2(devnull, STDERR_FILENO); + close(devnull); + } + } + + // Allow enough open files for kmc to work; see + // + // and + // TODO: Probably this isn't allowed after fork but we do it anyway. + struct rlimit new_limit; + new_limit.rlim_cur = 2048; + new_limit.rlim_max = RLIM_INFINITY; + setrlimit(RLIMIT_NOFILE, &new_limit); + + // execvp promises never to modify its arguments but can't say + that in the C++ type system because it causes problems in the C + type system. See https://stackoverflow.com/a/19505361 + execvp("kmc", (char**)kmc_argv); + // execvp only returns on failure. + _exit(127); + } else { + int child_stat; + waitpid(pid, &child_stat, 0); + int ret = WIFEXITED(child_stat) ? WEXITSTATUS(child_stat) : -1; + if (ret == 127) { + error(context) << "kmc could not be executed. Make sure kmc is installed and in your PATH." << endl; + } else if (ret != 0) { + error(context) << "kmc failed with exit code " << ret << "." 
<< endl; + } + } + + output_names.push_back(output_name); + return all_outputs; + }); + + //////////////////////////////////// // Minimizers Recipes //////////////////////////////////// @@ -4263,6 +4606,20 @@ bool IndexingPlan::is_intermediate(const IndexName& identifier) const { return !targets.count(identifier); } +const map IndexingPlan::get_scopes(const IndexName& identifier) const { + auto found = scopes.find(identifier); + if (found == scopes.end()) { + return {}; + } + // We can't refer into a new slot so we need to copy + return found->second; +} + +const map& IndexingPlan::get_scopes(const IndexName& identifier) { + // We can create the entry if not present + return scopes[identifier]; +} + int64_t IndexingPlan::target_memory_usage() const { return IndexingParameters::max_memory_proportion * registry->get_target_memory_usage(); } @@ -4277,23 +4634,76 @@ string IndexingPlan::output_filepath(const IndexName& identifier) const { string IndexingPlan::output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const { - string filepath; + std::stringstream filepath; if (registry->keep_intermediates || (!is_intermediate(identifier) && !registry->get_index(identifier)->was_provided_directly())) { // we're saving this file, put it at the output prefix - filepath = registry->output_prefix; + filepath << registry->output_prefix; } else { // we're not saving this file, make it temporary - filepath = registry->get_work_dir() + "/" + sha1sum(identifier); + filepath << registry->get_work_dir() + "/" + sha1sum(identifier); + } + + // We need the planned scopes, not the scopes for a completed file. 
+ auto planned_scopes = get_scopes(identifier); + string substituted_suffix; + for (auto& suffix : registry->get_index(identifier)->get_suffixes()) { + // Find the first suffix where all its wildcards are filled in by scopes + auto needed_keys = IndexRegistry::get_wildcards(suffix); + bool missing_keys = false; + for (auto& key : needed_keys) { + if (!planned_scopes.count(key)) { + missing_keys = true; +#ifdef debug_index_registry + std::cerr << "Pattern " << suffix << " can't be used because " << key << " scope is not available for " << identifier << std::endl; +#endif + break; + } + } + if (missing_keys) { + continue; + } + // Now we know we want this suffix + + // Substitute the suffix + substituted_suffix = IndexRegistry::substitute_wildcards(suffix, planned_scopes); + + // Make the set of scopes not accounted for in the suffix. + // TODO: Merge with substitution? + // TODO: Just make all scope-able files say so? + set unused_scopes; + for (auto& kv : planned_scopes) { + unused_scopes.insert(kv.first); + } + for (auto& used : needed_keys) { + unused_scopes.erase(used); + } + +#ifdef debug_index_registry + std::cerr << "Pattern " << suffix << " substituted as " << substituted_suffix << " for " << identifier << "; " << unused_scopes.size() << " extra scopes" << std::endl; +#endif + + for (auto& key : unused_scopes) { + // Put the extra scopes in the filename + filepath << "." << planned_scopes.at(key); + } + + break; + } + if (substituted_suffix.empty()) { + // We really should find a suffix to use + throw std::runtime_error("No suffix for " + identifier + " can be used with available scopes."); } if (num_chunks > 1) { // we add digits to make the suffix unique for this chunk (the setup disallows suffixes - // that start with digits) - filepath += "." + to_string(chunk); + // that start with digits). + // TODO: What if a scope starts with digits? Put the numbers somewhere else? + filepath << "." << to_string(chunk); } - filepath += "." 
+ registry->get_index(identifier)->get_suffix(); - return filepath; + // Use the suffix we've substituted. + filepath << "." << substituted_suffix; + return filepath.str(); } const vector& IndexingPlan::get_steps() const { @@ -4308,6 +4718,10 @@ set IndexingPlan::dependents(const IndexName& identifier) const { IndexGroup successor_indexes{identifier}; for (const auto& step : steps) { + if (!registry->has_recipe(step)) { + // This step is just a provided input and not a real recipe step. + continue; + } // TODO: should this behavior change if some of the inputs were provided directly? @@ -4329,6 +4743,16 @@ set IndexingPlan::dependents(const IndexName& identifier) const { return dependent_steps; } +void IndexingPlan::add_scope(const IndexName& identifier, const string& key, const string& value) { + // Get a mutable reference to the scopes we need to add to. + auto& index_scopes = scopes[identifier]; + + auto found = index_scopes.find(key); + if (found == index_scopes.end()) { + index_scopes.emplace_hint(found, key, value); + } +} + IndexRegistry::~IndexRegistry() { if (!work_dir.empty()) { // Clean up our work directory with its temporary indexes. @@ -4408,6 +4832,10 @@ void IndexRegistry::make_indexes(const vector& identifiers) { if (!index->was_provided_directly()) { // and assign the new (or first) ones index->assign_constructed(results); + for (auto& scope_kv : plan.get_scopes(*it)) { + // and keep their scopes (in case we ever need them) + index->add_scope(scope_kv.first, scope_kv.second); + } } ++it; } @@ -4464,32 +4892,36 @@ void IndexRegistry::make_indexes(const vector& identifiers) { auto f = find(aliasors.begin(), aliasors.end(), aliasee); bool can_move = f == aliasors.end() && !get_index(aliasee)->was_provided_directly(); if (!can_move) { - // just remove the "alias" so we don't need to deal with it + // We need to copy. 
+ // Just remove the "alias" for the index itself so that from here + // on we only need to worry about things that *do* need to be + // created. std::swap(*f, aliasors.back()); aliasors.pop_back(); } const auto& aliasee_filenames = get_index(aliasee)->get_filenames(); - - // copy aliases for any that we need to (start past index 0 if we can move it) - for (size_t i = can_move; i < aliasors.size(); ++i) { - for (size_t j = 0; j < aliasee_filenames.size(); ++j) { - - auto copy_filename = plan.output_filepath(aliasors[i], j, aliasee_filenames.size()); - copy_file(aliasee_filenames[j], copy_filename); - } - } - // if we can move the aliasee (i.e. it is intermediate), then make - // one index by moving instead of copying - if (can_move) { + + for (auto it = aliasors.rbegin(); it != aliasors.rend(); ++it) { + // Fulfill the aliasor from the aliasee (in reverse order, so the + // 0th aliasor gets to have the moved files). + vector new_aliasor_files; for (size_t j = 0; j < aliasee_filenames.size(); ++j) { - auto move_filename = plan.output_filepath(aliasors[0], j, aliasee_filenames.size()); - int code = rename(aliasee_filenames[j].c_str(), move_filename.c_str()); - if (code) { - // moving failed (maybe because the files on separate drives?) 
fall back on copying - copy_file(aliasee_filenames[j], move_filename); + auto dest_filename = plan.output_filepath(*it, j, aliasee_filenames.size()); + bool moved = false; + if (can_move && (it + 1 == aliasors.rend())) { + // We can move and this is the last aliasor we will do, so try moving + moved = rename(aliasee_filenames[j].c_str(), dest_filename.c_str()) == 0; } + if (!moved) { + // moving failed or is not allowed, fall back on copying + copy_file(aliasee_filenames[j], dest_filename); + } + + new_aliasor_files.push_back(dest_filename); } + // Point the aliasor index at the new paths + get_index(*it)->assign_constructed(new_aliasor_files); } } @@ -4498,34 +4930,137 @@ void IndexRegistry::make_indexes(const vector& identifiers) { } void IndexRegistry::register_index(const IndexName& identifier, const string& suffix) { + register_index(identifier, vector{suffix}); +} + +void IndexRegistry::register_index(const IndexName& identifier, const vector& suffixes) { // Add this index to the registry if (identifier.empty()) { error(context) << "indexes must have a non-empty identifier" << endl; } - if (suffix.empty()) { - error(context) << "indexes must have a non-empty suffix" << endl; - } - if (isdigit(suffix.front())) { - // this ensures that we can add numbers to the suffix to create a unique suffix - // for chunked workflows - error(context) << "suffixes cannot start with a digit" << endl; - } if (index_registry.count(identifier)) { error(context) << "index registry contains a duplicated identifier: " << identifier << endl; } - if (registered_suffixes.count(suffix)) { - error(context) << "index registry contains a duplicated suffix: " << suffix << endl; + bool has_unique_suffix = false; + for (auto suffix : suffixes) { + if (suffix.empty()) { + error(context) << "indexes must have a non-empty suffix" << endl; + } + if (isdigit(suffix.front())) { + // this ensures that we can add numbers to the suffix to create a unique suffix + // for chunked workflows + 
error(context) << "suffixes cannot start with a digit" << endl; + } + if (!registered_suffixes.count(suffix)) { + has_unique_suffix = true; + } + } + if (!has_unique_suffix) { + // We need each index to have at least one new suffix not yet used or + // we might not be able to find a place to put it when making the + // indexes in the order registered. + error(context) << "index registry contains other indexes for all suffixes for " << identifier << endl; + } + index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffixes)); + for (auto suffix : suffixes) { + registered_suffixes.insert(suffix); } - index_registry[identifier] = unique_ptr(new IndexFile(identifier, suffix)); - registered_suffixes.insert(suffix); } +set IndexRegistry::get_wildcards(const string& pattern) { + set result; + size_t wildcard_start = std::numeric_limits::max(); + for (size_t i = 0; i < pattern.size(); i++) { + // Scan the string and find the wildcards + auto& c = pattern[i]; + if (wildcard_start == std::numeric_limits::max()) { + // Need an open brace + if (c == '{') { + wildcard_start = i; + } else if (c == '}') { + throw std::runtime_error("Unmatched closing brace in " + pattern); + } + } else { + // Need a close brace + if (c == '}') { + if (i - wildcard_start < 2) { + // There's no name here + throw std::runtime_error("Empty wildcard in " + pattern); + } + string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 1); + auto found = result.find(name); + if (found != result.end()) { + throw std::runtime_error("Duplicate wildcard in " + pattern); + } + // Save the new wildcard + result.emplace_hint(found, name); + // Go back to not-in-a-wildcard state + wildcard_start = std::numeric_limits::max(); + } else if (c == '{') { + throw std::runtime_error("Opening brace inside wildcard " + pattern); + } + } + } + if (wildcard_start != std::numeric_limits::max()) { + // We ended while reading a wildcard + throw std::runtime_error("Unmatched opening brace in " + pattern); + 
} + return result; +} + +string IndexRegistry::substitute_wildcards(const string& pattern, const map values) { + // TODO: Deduplicate code with get_wildcards somehow -void IndexRegistry::provide(const IndexName& identifier, const string& filename) { - provide(identifier, vector(1, filename)); + std::stringstream result; + + size_t wildcard_start = std::numeric_limits::max(); + for (size_t i = 0; i < pattern.size(); i++) { + // Scan the string and find the wildcards + auto& c = pattern[i]; + if (wildcard_start == std::numeric_limits::max()) { + // Need an open brace + if (c == '{') { + wildcard_start = i; + } else if (c == '}') { + throw std::runtime_error("Unmatched closing brace in " + pattern); + } else { + result << c; + } + } else { + // Need a close brace + if (c == '}') { + if (i - wildcard_start < 2) { + // There's no name here + throw std::runtime_error("Empty wildcard in " + pattern); + } + string name = pattern.substr(wildcard_start + 1, i - wildcard_start - 1); + auto found = values.find(name); + if (found == values.end()) { + throw std::runtime_error("Undefined wildcard " + name + " in " + pattern); + } + // Output the wildcard value + result << found->second; + // Go back to not-in-a-wildcard state + wildcard_start = std::numeric_limits::max(); + } else if (c == '{') { + throw std::runtime_error("Opening brace inside wildcard " + pattern); + } + } + } + if (wildcard_start != std::numeric_limits::max()) { + // We ended while reading a wildcard + throw std::runtime_error("Unmatched opening brace in " + pattern); + } + + return result.str(); +} + + +void IndexRegistry::provide(const IndexName& identifier, const string& filename, const map& scopes) { + provide(identifier, vector(1, filename), scopes); } -void IndexRegistry::provide(const IndexName& identifier, const vector& filenames) { +void IndexRegistry::provide(const IndexName& identifier, const vector& filenames, const map& scopes) { if (IndexingParameters::verbosity >= IndexingParameters::Debug) { 
info(context) << "Provided: " << identifier << endl; } @@ -4537,7 +5072,11 @@ void IndexRegistry::provide(const IndexName& identifier, const vector& f require_exists(context, filename); } } - get_index(identifier)->provide(filenames); + auto index = get_index(identifier); + index->provide(filenames); + for (auto& scope_kv : scopes) { + index->add_scope(scope_kv.first, scope_kv.second); + } } void IndexRegistry::reset(const IndexName& identifier) { @@ -4569,7 +5108,28 @@ vector IndexRegistry::get_possible_filenames(const IndexName& identifier error(context) << "cannot require unregistered index: " << identifier << endl; } const IndexFile* index = get_index(identifier); - return {get_prefix() + "." + index->get_suffix()}; + auto& index_scopes = index->get_scopes(); + vector filenames; + for (auto& suffix : index->get_suffixes()) { + // Find all suffixes where all wildcards are filled in by scopes. + // TODO: We might want to really match the wildcards against filenames, + // Snakemake-style. + auto needed_keys = IndexRegistry::get_wildcards(suffix); + bool missing_keys = false; + for (auto& key : needed_keys) { + if (!index_scopes.count(key)) { + missing_keys = true; + break; + } + } + if (missing_keys) { + continue; + } + + // Try all those suffixes + filenames.push_back(get_prefix() + "." + suffix); + } + return filenames; } vector IndexRegistry::require(const IndexName& identifier) const { @@ -5146,6 +5706,8 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { IndexGroup product_group{product}; // records of (identifier, requesters, ordinal index of recipe selected) + // A max-value sentinel index means no recipe is used and the index is + // provided. 
vector, size_t>> plan_path; // map dependency priority to requesters @@ -5286,6 +5848,7 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // get the latest file in the dependency order that we have left to build auto it = queue.begin(); + // Imagine starting with the first recipe for it plan_path.emplace_back(it->first, it->second, 0); #ifdef debug_index_registry @@ -5309,6 +5872,10 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "index has been provided as input" << endl; #endif + + // Don't point to a recipe for it that might exist; point to a + // nonexistent sentinel recipe. + get<2>(plan_path.back()) = std::numeric_limits::max(); continue; } else if (recipe_registry.count(index_group)) { @@ -5371,7 +5938,13 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "final plan path for index " << product << ":" << endl; for (auto path_elem : plan_path) { - cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]) << ", recipe " << get<2>(path_elem) << ", from:" << endl; + cerr << "\t" << to_string(identifier_order[get<0>(path_elem)]); + if (get<2>(path_elem) == std::numeric_limits::max()) { + cerr << ", provided"; + } else { + cerr << ", recipe " << get<2>(path_elem); + } + cerr << ", from:" << endl; for (auto d : get<1>(path_elem)) { cerr << "\t\t" << to_string(identifier_order[d]) << endl; } @@ -5386,6 +5959,14 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { // Now fill in the plan struct that the recipes need to know how to run. IndexingPlan plan; + + // The plan has methods that need to interact with the registry, including + // some we need to use here. + // + // Some of them modify the registry. We're not going to use any of those, + // but we have to hand off a non-const pointer to ourselves so the plan can + // modify us later. 
+ plan.registry = const_cast(this); // Copy over the end products std::copy(end_products.begin(), end_products.end(), std::inserter(plan.targets, plan.targets.begin())); @@ -5399,7 +5980,7 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "plan before applying generalizations:" << endl; for (auto plan_elem : plan.steps) { - cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + cerr << "\t" << to_string(plan_elem.first) << " " << (plan_elem.second == std::numeric_limits::max() ? "Provided" : std::to_string(plan_elem.second)) << endl; } #endif @@ -5412,23 +5993,67 @@ IndexingPlan IndexRegistry::make_plan(const IndexGroup& end_products) const { #ifdef debug_index_registry cerr << "full plan including provided files:" << endl; for (auto plan_elem : plan.steps) { - cerr << "\t" << to_string(plan_elem.first) << " " << plan_elem.second << endl; + cerr << "\t" << to_string(plan_elem.first) << " " << (plan_elem.second == std::numeric_limits::max() ? "Provided" : std::to_string(plan_elem.second)) << endl; } #endif + // Propagate scopes from input files + for (auto plan_it = plan.steps.begin(); plan_it != plan.steps.end(); ++plan_it) { + // For each step in the plan in order + + for (const IndexName& index_name : plan_it->first) { + // For each index that step would generate + const IndexFile* index = get_index(index_name); + if (!index->is_finished()) { + // Only finished indexes bring in new scopes +#ifdef debug_index_registry + std::cerr << "Index " << index_name << " isn't finished and can't bring in scopes" << std::endl; +#endif + continue; + } + + // Get the scopes that came in with this input + auto& provided_scopes = index->get_scopes(); + + // We need to attach these scopes to anything based on this index. + // TODO: Should we just propagate scopes step by step as we scan the plan instead?
+ auto dependents = plan.dependents(index_name); +#ifdef debug_index_registry + std::cerr << "Index " << index_name << " adds " << provided_scopes.size() << " scopes to " << dependents.size() << " dependents" << std::endl; +#endif + for (const RecipeName& dependent_recipe : dependents) { + // For each recipe transitively depending on this input + for (const IndexName& dependent_index_name : dependent_recipe.first) { + // For each index the recipe generates + + for (auto& scope_kv : provided_scopes) { + // Make sure that index is scoped with all scopes on the input. + plan.add_scope(dependent_index_name, scope_kv.first, scope_kv.second); + } + } + } + } + } + // Now remove the input data from the plan plan.steps.resize(remove_if(plan.steps.begin(), plan.steps.end(), [&](const RecipeName& recipe_choice) { return all_finished(recipe_choice.first); }) - plan.steps.begin()); - - // The plan has methods that can come back and modify the registry. - // We're not going to call any of them, but we have to hand off a non-const - // pointer to ourselves so the plan can modify us later. - plan.registry = const_cast(this); - + return plan; } +bool IndexRegistry::has_recipe(const RecipeName& recipe_name) const { + auto found = recipe_registry.find(recipe_name.first); + if (found == recipe_registry.end()) { + // No recipes at all for this + return false; + } + // Determine if this recipe index is in range. + // Sentinel max-value names are always out of range.
+ return recipe_name.second < found->second.size(); +} + const IndexRecipe& IndexRegistry::get_recipe(const RecipeName& recipe_name) const { const auto& recipes = recipe_registry.at(recipe_name.first); assert(recipe_name.second < recipes.size()); @@ -5562,20 +6187,20 @@ string IndexRegistry::to_dot(const vector& targets) const { return strm.str(); } -IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : identifier(identifier), suffix(suffix) { - // nothing more to do +IndexFile::IndexFile(const IndexName& identifier, const string& suffix) : IndexFile(identifier, vector{suffix}) { + // Nothing to do! } -bool IndexFile::is_finished() const { - return !filenames.empty(); +IndexFile::IndexFile(const IndexName& identifier, const vector& suffixes) : identifier(identifier), suffixes(suffixes) { + // nothing more to do } const IndexName& IndexFile::get_identifier() const { return identifier; } -const string& IndexFile::get_suffix() const { - return suffix; +const vector& IndexFile::get_suffixes() const { + return suffixes; } const vector& IndexFile::get_filenames() const { @@ -5600,9 +6225,33 @@ bool IndexFile::was_provided_directly() const { return provided_directly; } +void IndexFile::add_scope(const string& key, const string& value) { + if (!is_finished()) { + // Scopes for nonexistent files belong to the plan, not us. 
+ throw std::logic_error("Cannot assign " + key + " = " + value + " scope to unfinished " + get_identifier() + " index"); + } + + auto found = scopes.find(key); + if (found == scopes.end()) { + scopes.emplace_hint(found, key, value); +#ifdef debug_index_registry + std::cerr << "Added scope " << key << " = " + value + " to " << get_identifier() << std::endl; +#endif + } +} + +const map& IndexFile::get_scopes() const { + return scopes; +} + +bool IndexFile::is_finished() const { + return !filenames.empty(); +} + void IndexFile::reset() { filenames.clear(); provided_directly = false; + scopes.clear(); } IndexRecipe::IndexRecipe(const vector& inputs, diff --git a/src/index_registry.hpp b/src/index_registry.hpp index 1cb84a4d804..d6545bf9f46 100644 --- a/src/index_registry.hpp +++ b/src/index_registry.hpp @@ -12,6 +12,8 @@ #include #include +#include + namespace vg { using namespace std; @@ -36,6 +38,9 @@ using IndexGroup = set; /** * Names a recipe in the collection of registered recipes. + * + * A max-value number means the indexes are provided and not generated by any + * recipe. 
*/ using RecipeName = pair; @@ -128,6 +133,16 @@ struct IndexingParameters { static double thread_chunk_inflation_factor; // whether indexing algorithms will log progress (if available) [Basic] static Verbosity verbosity; + // Set of samples to make references and retain during haplotype sampling [{}] + static gbwtgraph::sample_name_set haplotype_sampling_reference_samples; + // Number of haplotypes to sample during haplotype sampling [Recombinator::NUM_CANDIDATES] + static size_t haplotype_sampling_num_haplotypes; + // Whether to do diploid sampling during haplotype sampling [true] + static bool haplotype_sampling_diploid; + // length of k-mer used in haplotype index and kmer counting [29] + static int haplotype_sampling_minimizer_k; + // length of window used in haplotype index [11] + static int haplotype_sampling_minimizer_w; }; /** @@ -167,6 +182,13 @@ class IndexingPlan { string output_filepath(const IndexName& identifier) const; /// Get the suffix with which to save the given index's files. + /// + /// The path will substitute scopes into any {wildcards} in the index's + /// suffix, and scopes not used in the suffix will appear in the + /// filename in key order. + /// + /// The suffix used will be the first one for which all wildcards can be + /// substituted. string output_filepath(const IndexName& identifier, size_t chunk, size_t num_chunks) const; /// Ge the steps of the plan @@ -175,24 +197,43 @@ class IndexingPlan { /// Returns true if the given index is to be intermediate under the given /// plan, and false if it is to be preserved. bool is_intermediate(const IndexName& identifier) const; + + /// Get the scopes that the given index will be qualified with when + /// generated. Only works for indexes that will be generated by the plan, + /// not for inputs. + const map get_scopes(const IndexName& identifier) const; + + /// Get the scopes that the given index will be qualified with when + /// generated. 
Only works for indexes that will be generated by the plan, + /// not for inputs. + const map& get_scopes(const IndexName& identifier); // TODO: is this where this function wants to live? /// The memory limit, with a little slosh for prediction inaccuracy int64_t target_memory_usage() const; - /// The mmeory limit with no slosh + /// The memory limit with no slosh int64_t literal_target_memory_usage() const; /// Returns the recipes in the plan that depend on this index, including the one in which - /// it was created (if any) + /// it was created (if any). set dependents(const IndexName& identifier) const; protected: + /// Add a scope to the scopes the given index will be qualified with when + /// generated, if not present already. + void add_scope(const IndexName& identifier, const string& key, const string& value); + /// The steps to be invoked in the plan. May be empty before the plan is /// actually planned. vector steps; /// The indexes to create as outputs. set targets; + /// The scopes qualifying each index in the plan (for example, if an index + /// applies only to a particular sample, it will be scoped to the sample + /// name). Stored using an ordered map to ensure a consistent iteration + /// order. + map> scopes; /// The registry that the plan is using. /// The registry must not move while the plan is in use. @@ -244,6 +285,17 @@ class IndexRegistry { /// Register an index containing the given identifier void register_index(const IndexName& identifier, const string& suffix); + + /// Register an index containing the given identifier, with multiple possible suffixes. + /// The first suffix where all {wildcards} can be substituted with scopes will be used. + void register_index(const IndexName& identifier, const vector& suffixes); + + /// Get the names of all brace-enclosed {wildcards} in the given pattern + static set get_wildcards(const string& pattern); + + /// Substitute wildcards into the given pattern. All wildcards must have + /// values assigned. 
Extra values not used are allowed. + static string substitute_wildcards(const string& pattern, const map values); /// Register a recipe to produce an index using other indexes /// or input files. Recipes registered earlier will have higher priority. @@ -254,12 +306,22 @@ class IndexRegistry { /// Indicate one recipe is a broadened version of another. The indexes consumed and produced /// by the generalization must be semantically identical to those of the generalizee void register_generalization(const RecipeName& generalizer, const RecipeName& generalizee); - - /// Indicate a serialized file that contains some identified index - void provide(const IndexName& identifier, const string& filename); - - /// Indicate a list of serialized files that contains some identified index - void provide(const IndexName& identifier, const vector& filenames); + + /// Indicate a serialized file that contains some identified index, + /// optionally with scopes that propagate to descendant files. + /// + /// TODO: If scopes contain ".", we can run into problems with combinations + /// of different scopes producing the same final string. Right now we only + /// use one kind of scope, which avoids this. + void provide(const IndexName& identifier, const string& filename, const map& scopes = {}); + + /// Indicate a list of serialized files that contains some identified index, + /// optionally with scopes that propagate to descendant files. + /// + /// TODO: If scopes contain ".", we can run into problems with combinations + /// of different scopes producing the same final string. Right now we only + /// use one kind of scope, which avoids this. + void provide(const IndexName& identifier, const vector& filenames, const map& scopes = {}); /// Remove a provided index void reset(const IndexName& identifier); @@ -269,6 +331,7 @@ class IndexRegistry { bool available(const IndexName& identifier) const; /// Get the possible filename(s) associated with the given index with the given prefix.
+ /// TODO: Get this to account for sample-scoped indexes. vector get_possible_filenames(const IndexName& identifier) const; /// Get the filename(s) associated with the given index. Aborts if the @@ -317,6 +380,14 @@ class IndexRegistry { /// generate a plan to create the indexes IndexingPlan make_plan(const IndexGroup& end_products) const; + /// Check if a recipe identifier corresponds to a recipe. + /// + /// Recipe identifiers not corresponding to actual recipes are used during + /// planning to represent provided inputs. + /// + /// TODO: Refactor that with some kind of tagged union or optional. + bool has_recipe(const RecipeName& recipe_name) const; + /// use a recipe identifier to get the recipe const IndexRecipe& get_recipe(const RecipeName& recipe_name) const; @@ -371,12 +442,15 @@ class IndexFile { /// Create a new IndexFile with a unique identifier IndexFile(const IndexName& identifier, const string& suffix); + + /// Create a new IndexFile with a unique identifier, which may use one of several suffixes. + IndexFile(const IndexName& identifier, const vector& suffixes); /// Get the globally unique identifier for this index const IndexName& get_identifier() const; - /// Returns the suffix to be used for this index - const string& get_suffix() const; + /// Returns the suffixes that can be used for this index + const vector& get_suffixes() const; /// Get the filename(s) that contain this index const vector& get_filenames() const; @@ -386,7 +460,18 @@ class IndexFile { /// Assign constructed filenames to this index void assign_constructed(const vector& filenames); - + + /// Add a new scope to qualify the files in the index, if the scope is + /// not already on the index. + /// + /// Should only be used on files that are actually filled in already; + /// scopes for files that don't exist yet are the responsibility of the + /// IndexingPlan.
+ void add_scope(const string& key, const string& value); + + /// Get all scopes qualifying the index (such as a sample name). + const map& get_scopes() const; + /// Returns true if the index has already been built or provided bool is_finished() const; @@ -401,11 +486,14 @@ class IndexFile { // the global identifier for the IndexName identifier; - // the suffix it adds to output files - const string suffix; + // the suffixes it can add to output files + const vector suffixes; // the filename(s) associated with the index vector filenames; + + // The scopes qualifying the index. + map scopes; // keep track of whether the index was provided directly bool provided_directly = false; diff --git a/src/subcommand/autoindex_main.cpp b/src/subcommand/autoindex_main.cpp index 31777f0d118..51794244071 100644 --- a/src/subcommand/autoindex_main.cpp +++ b/src/subcommand/autoindex_main.cpp @@ -390,7 +390,16 @@ int main_autoindex(int argc, char** argv) { if (!registry.available(target)) { vector inferred_file_names = registry.get_possible_filenames(target); for (const string& filename : inferred_file_names) { - if (ifstream(filename).is_open()) { + if (target == "Giraffe GBZ" && !ends_with(filename, ".giraffe.gbz")) { + // TODO: Giraffe GBZ indexes can be saved as ..gbz or .giraffe.gbz. + // But we can't pick up ..gbz automatically without possibly mistaking a plain GBZ for a Giraffe one. + // And we don't handle haplotype sampling in autoindex yet anyway. + // So only find Giraffe GBZs. + // TODO: Allow the suffixes to have Snakemake-style + // wildcards that populate scopes on the files.
+ continue; + } + if (file_exists(filename)) { logger.info() << "Guessing that " << filename << " is " << target << endl; registry.provide(target, filename); break; diff --git a/src/subcommand/giraffe_main.cpp b/src/subcommand/giraffe_main.cpp index 1a98efe112d..6ab7816b5eb 100644 --- a/src/subcommand/giraffe_main.cpp +++ b/src/subcommand/giraffe_main.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -26,12 +27,12 @@ #include "../hts_alignment_emitter.hpp" #include "../minimizer_mapper.hpp" #include "../index_registry.hpp" +#include "../utility.hpp" #include "../watchdog.hpp" #include "../crash.hpp" #include #include "../gbwtgraph_helper.hpp" -#include "../recombinator.hpp" #include #include @@ -59,6 +60,8 @@ static long perf_event_open(struct perf_event_attr* hw_event, pid_t pid, int cpu } #endif +//#define debug + using namespace std; using namespace vg; using namespace vg::subcommand; @@ -617,15 +620,6 @@ std::string strip_suffixes(std::string filename, const std::vector& return filename; } -// Returns the name of the sampled GBZ. -std::string sample_haplotypes( - const Logger& logger, - const std::vector>& indexes, - const gbwtgraph::sample_name_set& reference_samples, - const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, - bool progress -); - //---------------------------------------------------------------------------- void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map& presets, bool full_help) { @@ -665,15 +659,22 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, const std::map> provided_indexes; - string index_basename, index_basename_override; + // immediately, because we may want to do haplotype sampling. We need them + // indexed for lookup. We only support one file per index (FASTQs get + // handled separately). 
+ std::unordered_map provided_indexes; + string index_basename_guess, index_basename; // For haplotype sampling. - string haplotype_name, kff_name; + string kff_filename; + bool haplotype_sampling_flag = false; gbwtgraph::sample_name_set reference_samples; string output_basename; @@ -1071,10 +1078,13 @@ int main_giraffe(int argc, char** argv) { {"zipcode-name", required_argument, 0, 'z'}, {"dist-name", required_argument, 0, 'd'}, {"progress", no_argument, 0, 'p'}, + {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, + {"haplotype-sampling", no_argument, 0, OPT_HAPLOTYPE_SAMPLING}, {"haplotype-name", required_argument, 0, OPT_HAPLOTYPE_NAME}, {"kff-name", required_argument, 0, OPT_KFF_NAME}, - {"index-basename", required_argument, 0, OPT_INDEX_BASENAME}, {"set-reference", required_argument, 0, OPT_SET_REFERENCE}, + {"num-haplotypes", required_argument, 0, OPT_NUM_HAPLOTYPES}, + {"no-diploid-sampling", no_argument, 0, OPT_NO_DIPLOID_SAMPLING}, {"gam-in", required_argument, 0, 'G'}, {"fastq-in", required_argument, 0, 'f'}, {"interleaved", no_argument, 0, 'i'}, @@ -1137,63 +1147,83 @@ int main_giraffe(int argc, char** argv) { case 'Z': // All provided_indexes are later validated // with require_exists() - provided_indexes.emplace_back("Giraffe GBZ", optarg); + provided_indexes.emplace("GBZ", optarg); // If we have a GBZ we probably want to use its name as the base name. - // But see -g. - index_basename = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); + index_basename_guess = strip_suffixes(std::string(optarg), { ".gbz", ".giraffe" }); break; case 'x': - provided_indexes.emplace_back("XG", optarg); + provided_indexes.emplace("XG", optarg); // If we have an XG we probably want to use its name as the base name. - // But see -g. 
- index_basename = split_ext(optarg).first; + index_basename_guess = split_ext(optarg).first; break; case 'g': - provided_indexes.emplace_back("GBWTGraph", optarg); + provided_indexes.emplace("GBWTGraph", optarg); - // But if we have a GBWTGraph we probably want to use *its* name as the base name. - // Whichever is specified last will win, unless we also have a FASTA input name. - index_basename = split_ext(optarg).first; + // If we have a GBWTGraph we probably want to use its name as the base name. + index_basename_guess = split_ext(optarg).first; break; case 'H': - provided_indexes.emplace_back("Giraffe GBWT", optarg); + provided_indexes.emplace("Giraffe GBWT", optarg); break; case 'm': - provided_indexes.emplace_back("Long Read Minimizers", optarg); - provided_indexes.emplace_back("Short Read Minimizers", optarg); - provided_indexes.emplace_back("Long Read PathMinimizers", optarg); + provided_indexes.emplace("Long Read Minimizers", optarg); + provided_indexes.emplace("Short Read Minimizers", optarg); + provided_indexes.emplace("Long Read PathMinimizers", optarg); break; case 'z': - provided_indexes.emplace_back("Long Read Zipcodes", optarg); - provided_indexes.emplace_back("Short Read Zipcodes", optarg); - provided_indexes.emplace_back("Long Read PathZipcodes", optarg); + provided_indexes.emplace("Long Read Zipcodes", optarg); + provided_indexes.emplace("Short Read Zipcodes", optarg); + provided_indexes.emplace("Long Read PathZipcodes", optarg); break; case 'd': - provided_indexes.emplace_back("Giraffe Distance Index", optarg); + provided_indexes.emplace("Giraffe Distance Index", optarg); break; case 'p': show_progress = true; break; + case OPT_INDEX_BASENAME: + index_basename = optarg; + break; + + case OPT_HAPLOTYPE_SAMPLING: + haplotype_sampling_flag = true; + break; + case OPT_HAPLOTYPE_NAME: - haplotype_name = optarg; + provided_indexes.emplace("Haplotype Index", optarg); break; + case OPT_KFF_NAME: - kff_name = optarg; + // We need to provide the KFF with 
sample scope if we provide + it, but we're not sure of the sample name yet. + kff_filename = require_exists(logger, optarg); break; - case OPT_INDEX_BASENAME: - index_basename_override = optarg; + + case OPT_SET_REFERENCE: + reference_samples.insert(optarg); + break; + + case OPT_NUM_HAPLOTYPES: + IndexingParameters::haplotype_sampling_num_haplotypes = parse(optarg); + if (IndexingParameters::haplotype_sampling_num_haplotypes == 0) { + logger.error() << "number of haplotypes cannot be 0" << std::endl; + } + break; + + case OPT_NO_DIPLOID_SAMPLING: + IndexingParameters::haplotype_sampling_diploid = false; break; case 'G': @@ -1260,10 +1290,6 @@ int main_giraffe(int argc, char** argv) { output_basename = optarg; break; - case OPT_SET_REFERENCE: - reference_samples.insert(optarg); - break; - case OPT_REPORT_NAME: report_name = ensure_writable(logger, optarg); break; @@ -1366,9 +1392,10 @@ int main_giraffe(int argc, char** argv) { << " is not named like a FASTA" << std::endl; } - provided_indexes.emplace_back("Reference FASTA", fasta_filename); - // Everything else should be named like the FASTA by default - index_basename = fasta_parts.first; + provided_indexes.emplace("Reference FASTA", fasta_filename); + // Everything else should be named like the FASTA by default. This is a + // better guess than the other sources. + index_basename_guess = fasta_parts.first; if (have_input_file(optind, argc, argv)) { // Next one must be VCF, but check. @@ -1389,10 +1416,15 @@ int main_giraffe(int argc, char** argv) { string file_type = IndexRegistry::vcf_is_phased(vcf_filename) ? "VCF w/ Phasing" : "VCF"; // Feed it to the index registry to maybe use - provided_indexes.emplace_back(file_type, vcf_filename); + provided_indexes.emplace(file_type, vcf_filename); } } + if (index_basename.empty()) { + // Try to pick an index basename now + index_basename = index_basename_guess; + } + // If we don't want rescue, let the user see we don't try it.
if (parser->get_option_value("rescue-attempts") == 0 || rescue_algorithm == MinimizerMapper::rescue_none) { // Replace any parsed values @@ -1402,8 +1434,8 @@ int main_giraffe(int argc, char** argv) { // Now all the arguments are parsed, so see if they make sense - for (const auto& filename : provided_indexes) { - require_exists(logger, filename.second); + for (const auto& type_and_filename : provided_indexes) { + require_exists(logger, type_and_filename.second); } // Decide if we are outputting to an htslib format @@ -1459,24 +1491,108 @@ int main_giraffe(int argc, char** argv) { // We don't have file paths to load defined for recombination-aware short-read minimizers. logger.error() << "Path minimizers cannot be used with short reads." << endl; } - bool haplotype_sampling = !haplotype_name.empty() & !kff_name.empty(); - if (!index_basename_override.empty()) { - index_basename = index_basename_override; + + // Use all the provided indexes + for (auto& index : provided_indexes) { + registry.provide(index.first, index.second); } + + // Haplotype sampling is active when explicitly requested (--haplotype-sampling) + // or when either --haplotype-name or --kff-name is given. + // Otherwise we'd always sample when we had reads available. + bool haplotype_sampling = haplotype_sampling_flag || provided_indexes.count("Haplotype Index") || !kff_filename.empty(); + + string sample_scope; if (haplotype_sampling) { - // If we do haplotype sampling, we get a new GBZ and later build indexes for it. - string gbz_name = sample_haplotypes(logger, provided_indexes, reference_samples, index_basename, - sample_name, haplotype_name, kff_name, show_progress); - registry.provide("Giraffe GBZ", gbz_name); - index_basename = split_ext(gbz_name).first; - } else { - // Otherwise we use the provided indexes. 
- for (auto& index : provided_indexes) { - registry.provide(index.first, index.second); + // Determine sample scope so we can incorporate it into the + // index basename; downstream indexes (dist, min) will then be + // scoped to the sample. + sample_scope = sample_name; + if (sample_scope.empty()) { + if (!kff_filename.empty()) { + sample_scope = file_base_name(kff_filename); + if (show_progress) { + logger.info() << "Guessing from " << kff_filename + << " that sample name is " << sample_scope << std::endl; + } + } else if (!fastq_filename_1.empty()) { + // Strip .gz then use file_base_name to strip .fq/.fastq, since + // strip_suffixes stops at the first one that's not there. + sample_scope = file_base_name(strip_suffixes(fastq_filename_1, {".gz"})); + if (show_progress) { + logger.info() << "Guessing from " << fastq_filename_1 + << " that sample name is " << sample_scope << std::endl; + } + } else { + logger.error() << "Unable to determine a sample name for haplotype sampling; provide --sample" << endl; + } + } + if (sample_scope == "giraffe") { + logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; + } + + // Pass parameters not directly provided in option parsers to the + // recipes through IndexingParameters. + IndexingParameters::haplotype_sampling_reference_samples = reference_samples; + + if (kff_filename.empty() && fastq_filename_1.empty()) { + // TODO: Eventually learn to kmer count from GAM + logger.error() << "Cannot do haplotype sampling without kmer counts (--kff-name) or FASTQ input (--fastq-in/-f)" << std::endl; } + + if (!kff_filename.empty()) { + registry.provide("KFF Kmer Counts", kff_filename, {{"sample", sample_scope}}); + } + if (!fastq_filename_1.empty()) { + // If FASTQs are available, send those and we will do haplotype sampling. 
+ vector fastqs = {fastq_filename_1}; + if (!fastq_filename_2.empty()) { + fastqs.push_back(fastq_filename_2); + } + // Provide the FASTQs to the index registry, but mark everything + // derived from them as being scoped to the particular sample. + registry.provide("FASTQ", fastqs, {{"sample", sample_scope}}); + } + + + if (!index_basename.empty()) { + // We have some idea where indexes might be, so guess them. + if (!registry.available("Top Level Chain Distance Index")) { + // Look for base GBZ distance index + string tcdist_candidate = index_basename + ".tcdist"; + string dist_candidate = index_basename + ".dist"; + if (file_exists(tcdist_candidate)) { + registry.provide("Top Level Chain Distance Index", tcdist_candidate); + } else if (file_exists(dist_candidate)) { + // Provide it as a top level chain index even though it has more stuff + registry.provide("Top Level Chain Distance Index", dist_candidate); + } + } + + if (!registry.available("Haplotype Index")) { + // Look for hapl index + string hapl_candidate = index_basename + ".hapl"; + if (file_exists(hapl_candidate)) { + registry.provide("Haplotype Index", hapl_candidate); + } + } + + if (!registry.available("r Index")) { + // Look for r index + string r_candidate = index_basename + ".ri"; + if (file_exists(r_candidate)) { + registry.provide("r Index", r_candidate); + } + } + } + + } + + if (!index_basename.empty()) { + // Use the basename we've determined for generated indexes + registry.set_prefix(index_basename); } - registry.set_prefix(index_basename); // The IndexRegistry doesn't try to infer index files based on the // basename, so do that here. We can have multiple extension options that @@ -1511,19 +1627,25 @@ int main_giraffe(int argc, char** argv) { bool found = false; for (auto& extension : index_and_extensions->second) { // For each extension in priority order - string inferred_filename = registry.get_prefix() + "." 
+ extension; - if (ifstream(inferred_filename).is_open()) { + string inferred_filename = registry.get_prefix(); + if (haplotype_sampling) { + // All these are sample-scoped if we're haplotype sampling. + // TODO: Let index registry generate the filenames, which it knows how to do. + inferred_filename += "." + sample_scope; + } + + inferred_filename += "." + extension; + if (file_exists(inferred_filename)) { // A file with the appropriate name exists and we can read it. + // Report it because this may not be desired behavior. + logger.info() << "Guessing that " << inferred_filename + << " is " << index_and_extensions->first << endl; + std::map scopes; if (haplotype_sampling) { - // If we did haplotype sampling, we are going to overwrite existing indexes. - logger.warn() << inferred_filename << " exists and will be overwritten" << endl; - } else { - // Report it because this may not be desired behavior. - logger.info() << "Guessing that " << inferred_filename - << " is " << index_and_extensions->first << endl; - registry.provide(index_and_extensions->first, inferred_filename); - found = true; + scopes.emplace("sample", sample_scope); } + registry.provide(index_and_extensions->first, inferred_filename, scopes); + found = true; // Skip other extension options for the index break; } @@ -2296,91 +2418,5 @@ int main_giraffe(int argc, char** argv) { //---------------------------------------------------------------------------- -std::string sample_haplotypes( - const Logger& logger, - const std::vector>& indexes, - const gbwtgraph::sample_name_set& reference_samples, - const std::string& basename, const std::string& sample_name, const std::string& haplotype_file, const std::string& kff_file, - bool progress -) { - if (progress) { - logger.info() << "Sampling haplotypes" << std::endl; - } - - // Sanity checks. - if (haplotype_file.empty() || kff_file.empty()) { - logger.error() << "Haplotype sampling requires --haplotype-name and --kff-name." 
<< std::endl; - } - - // Determine output name. - std::string sample = sample_name; - if (sample.empty()) { - sample = file_base_name(kff_file); - if (progress) { - logger.info() << "Guessing from " << kff_file - << " that sample name is " << sample << std::endl; - } - } - if (sample == "giraffe") { - logger.warn() << "Using \"giraffe\" as a sample name may lead to filename collisions." << std::endl; - } - std::string output_name = basename + "." + sample + ".gbz"; - - // Load GBZ. - gbwtgraph::GBZ gbz; - if (indexes.size() == 1 && indexes[0].first == "Giraffe GBZ") { - load_gbz(gbz, indexes[0].second, progress); - } else if (indexes.size() == 2 && indexes[0].first == "Giraffe GBWT" && indexes[1].first == "GBWTGraph") { - load_gbz(gbz, indexes[0].second, indexes[1].second, progress); - } else if (indexes.size() == 2 && indexes[0].first == "GBWTGraph" && indexes[1].first == "Giraffe GBWT") { - load_gbz(gbz, indexes[1].second, indexes[0].second, progress); - } else { - logger.error() << "Haplotype sampling requires either -Z " - << "or (-g and -H) with no other indexes." << std::endl; - } - - // Override reference samples if necessary. - if (!reference_samples.empty()) { - if (progress) { - logger.info() << "Updating reference samples" << std::endl; - } - size_t present = gbz.set_reference_samples(reference_samples); - if (present != reference_samples.size()) { - logger.warn() << "Only " << present << " out of " << reference_samples.size() - << " reference samples are present" << std::endl; - } - } - - // Load haplotype information. - if (progress) { - logger.info() << "Loading haplotype information from " - << haplotype_file << std::endl; - } - Haplotypes haplotypes; - haplotypes.load_from(haplotype_file); - - // Sample haplotypes. - Haplotypes::Verbosity verbosity = (progress ? 
Haplotypes::verbosity_basic : Haplotypes::verbosity_silent); - Recombinator recombinator(gbz, haplotypes, verbosity); - Recombinator::Parameters parameters(Recombinator::Parameters::preset_diploid); - gbwt::GBWT sampled_gbwt; - try { - sampled_gbwt = recombinator.generate_haplotypes(kff_file, parameters); - } catch (const std::runtime_error& e) { - logger.error() << "Haplotype sampling failed: " << e.what() << std::endl; - } - - // Create GBWTGraph and save GBZ. - if (progress) { - logger.info() << "Building GBWTGraph" << std::endl; - } - gbwtgraph::GBZ sampled_graph(std::move(sampled_gbwt), gbz); - save_gbz(sampled_graph, output_name, progress); - - return output_name; -} - -//---------------------------------------------------------------------------- - // Register subcommand static Subcommand vg_giraffe("giraffe", "fast haplotype-aware read alignment", PIPELINE, 6, main_giraffe); diff --git a/src/unittest/index_registry.cpp b/src/unittest/index_registry.cpp index 6c71fd85876..babbc377ac6 100644 --- a/src/unittest/index_registry.cpp +++ b/src/unittest/index_registry.cpp @@ -279,5 +279,70 @@ TEST_CASE("IndexRegistry can make plans on a dummy recipe graph", "[indexregistr // } } +TEST_CASE("IndexRegistry wildcard parsing works", "[indexregistry]") { + + SECTION("empty string") { + auto wildcards = IndexRegistry::get_wildcards(""); + REQUIRE(wildcards.empty()); + } + + SECTION("string with no wildcards") { + auto wildcards = IndexRegistry::get_wildcards("no wildcards here"); + REQUIRE(wildcards.empty()); + } + + SECTION("string with one wildcard") { + auto wildcards = IndexRegistry::get_wildcards("some {wildcards} here"); + REQUIRE(wildcards.size() == 1); + REQUIRE(wildcards.count("wildcards")); + } + + SECTION("string with several wildcards") { + auto wildcards = IndexRegistry::get_wildcards("{some} {wildcards} {here}"); + REQUIRE(wildcards.size() == 3); + REQUIRE(wildcards.count("some")); + REQUIRE(wildcards.count("wildcards")); + REQUIRE(wildcards.count("here")); 
+ + // Make sure they iterate in alphabetical order + std::vector found; + for (auto& w : wildcards) { + found.push_back(w); + } + REQUIRE(found.at(0) == "here"); + REQUIRE(found.at(1) == "some"); + REQUIRE(found.at(2) == "wildcards"); + } +} + +TEST_CASE("IndexRegistry wildcard substitution works", "[indexregistry]") { + + SECTION("empty string, empty valuation") { + auto result = IndexRegistry::substitute_wildcards("", {}); + REQUIRE(result == ""); + } + + SECTION("empty string, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == ""); + } + + SECTION("nonempty string, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("stuff", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stuff"); + } + + SECTION("nonempty string with some wildcards, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("st{a}ff", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stthingff"); + } + + SECTION("nonempty string with all wildcards, nonempty valuation") { + auto result = IndexRegistry::substitute_wildcards("st{a}ff{b}", {{"a", "thing"}, {"b", "another thing"}}); + REQUIRE(result == "stthingffanother thing"); + } + +} + } } diff --git a/test/haplotype-sampling/README.md b/test/haplotype-sampling/README.md index 89e0d3e6b6f..3acbb64aca2 100644 --- a/test/haplotype-sampling/README.md +++ b/test/haplotype-sampling/README.md @@ -1,3 +1,4 @@ + # Haplotype sampling example This directory contains a small test case for haplotype sampling using the `vg haplotypes` subcommand. @@ -17,6 +18,7 @@ File `HG003.kff` contains 29-mer counts for the reads computed using KMC. 
First we need to build the indexes used by Giraffe: + ``` vg autoindex --workflow giraffe --prefix micb-kir3dl1 -g micb-kir3dl1.gfa ``` @@ -25,6 +27,7 @@ This converts the graph into GBZ format (`micb-kir3dl1.giraffe.gbz`) and builds Then we can map the reads, specifying the GBZ file, the fastq file, and the output file: + ``` vg giraffe -Z micb-kir3dl1.giraffe.gbz -f HG003.fq.gz -p > to-original.gam ``` @@ -35,6 +38,7 @@ However, because the graph contains two sets of reference paths (GRCh38 and CHM1 We can use the `vg stats` subcommand to compute statistics on the aligned reads: + ``` vg stats -a to-original.gam micb-kir3dl1.giraffe.gbz ``` @@ -46,6 +50,7 @@ vg stats -a to-original.gam micb-kir3dl1.giraffe.gbz In order to build the haplotype information used for haplotype sampling, we need a GBZ graph, a distance index, and an r-index. We build them using the following commands: + ``` vg gbwt -G micb-kir3dl1.gfa --max-node 32 --gbz-format -g micb-kir3dl1.gbz -r micb-kir3dl1.ri vg index -j micb-kir3dl1.dist micb-kir3dl1.gbz @@ -56,6 +61,7 @@ This makes the resulting GBZ graph (and hence the distance index) indentical to Now we can build the haplotype information file `micb-kir3dl1.hapl`: + ``` vg haplotypes -v 2 --subchain-length 300 -H micb-kir3dl1.hapl micb-kir3dl1.gbz ``` @@ -67,6 +73,7 @@ The latter makes the example more interesting than the default 10000 bp. We can now sample the haplotypes using the haplotype information, the k-mer counts, and the original graph: + ``` vg haplotypes -v 2 -i micb-kir3dl1.hapl -k HG003.kff \ --include-reference --diploid-sampling \ @@ -79,9 +86,10 @@ Now we have the sampled graph `HG003.gbz`. We could build the indexes for Giraffe manually. 
But because the graph is a temporary object intended only for mapping a specific set of reads, we can let Giraffe handle index construction: + ``` # Set the temporary directory as you wish -export TMPDIR=/scratch/tmp +# export TMPDIR=/scratch/tmp vg giraffe -Z HG003.gbz -f HG003.fq.gz -p --index-basename ${TMPDIR}/HG003 > to-sampled.gam ``` @@ -90,40 +98,44 @@ Here we specify that the indexes go to the temporary directory with base name `H We can again use `vg stats` to compute statistics: + ``` vg stats -a to-sampled.gam HG003.gbz ``` ### Giraffe integration -Instead of sampling the haplotypes manually, we can let Giraffe handle everything: +Instead of sampling the haplotypes manually, we can let Giraffe handle the sampling: + ``` # Set the temporary directory as you wish -export TMPDIR=/scratch/tmp +# export TMPDIR=/scratch/tmp vg giraffe -Z micb-kir3dl1.gbz --haplotype-name micb-kir3dl1.hapl --kff-name HG003.kff \ - -f HG003.fq.gz -p --index-basename ${TMPDIR}/integration > giraffe-integration.gam + -f HG003.fq.gz -p --index-basename integration > giraffe-integration.gam ``` This uses the best practices for haplotype sampling, which currently means including the reference paths and using diploid sampling. We put the sampled graph and the automatically built indexes into the temporary directory. -The naming scheme (e.g. `${TMPDIR}/integration.HG003.gbz`) consists of three parts: +The naming scheme (e.g. `integration.HG003.gbz`) consists of three parts: -* Base name `${TMPDIR}/integration` specified with option `--index-basename`. +* Base name `integration` specified with option `--index-basename`. * Sample name `HG003` guessed from the name of the KFF file or specified with option `--sample`. * Extension (e.g. `gbz`) that depends on the file. 
The statistics should be similar to those in the previous section: + ``` -vg stats -a giraffe-integration.gam ${TMPDIR}/integration.HG003.gbz +vg stats -a giraffe-integration.gam integration.HG003.gbz ``` ## Mapping reads to a linear reference Given a GBZ file, we can extract the paths corresponding to a given sample in fasta format: + ``` vg paths -x micb-kir3dl1.gbz --extract-fasta --sample GRCh38 > reference.fa ``` @@ -135,3 +147,9 @@ The resulting fasta file can be used as the reference with any linear aligner. Documentation for the `vg haplotypes` subcommand can be found in the [vg wiki](https://github.com/vgteam/vg/wiki/Haplotype-Sampling). You may also want to read about the [best practices](https://github.com/vgteam/vg/wiki/Giraffe-best-practices) for mapping reads with Giraffe. + +Remember to clean up after trying all the examples: + +``` +rm -f HG003.gbz giraffe-integration.gam micb-kir3dl1.dist micb-kir3dl1.gbz micb-kir3dl1.giraffe.gbz micb-kir3dl1.hapl micb-kir3dl1.ri micb-kir3dl1.shortread.withzip.min micb-kir3dl1.shortread.zipcodes reference.fa to-original.gam to-sampled.gam integration.* +``` diff --git a/test/t/54_vg_haplotypes.t b/test/t/54_vg_haplotypes.t index b9512838001..b94d4d12af5 100644 --- a/test/t/54_vg_haplotypes.t +++ b/test/t/54_vg_haplotypes.t @@ -5,7 +5,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 34 +plan tests 44 # The test graph consists of two subgraphs of the HPRC Minigraph-Cactus v1.1 graph: # - GRCh38#chr6:31498145-31511124 (micb) @@ -70,28 +70,76 @@ is $(vg gbwt -C -Z diploid3.gbz) 2 "2 contigs" is $(vg gbwt -H -Z diploid3.gbz) 3 "2 generated + 1 reference haplotypes" # Giraffe integration, guessed output name +rm -f full.HG003.* default.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ - -f haplotype-sampling/HG003.fq.gz > default.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > default.gam is $? 
0 "Giraffe integration with a guessed output name" cmp diploid.gbz full.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" # Giraffe integration, specified output name +rm -f sampled.003HG.* specified.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ --index-basename sampled -N 003HG \ - -f haplotype-sampling/HG003.fq.gz > specified.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > specified.gam is $? 0 "Giraffe integration with a specified output name" cmp full.HG003.gbz sampled.003HG.gbz is $? 0 "the sampled graphs are identical" # Giraffe integration, specified reference sample +rm -f GRCh38.HG003.* HG003_GRCh38.gam vg giraffe -Z full.gbz --haplotype-name full.hapl --kff-name haplotype-sampling/HG003.kff \ --index-basename GRCh38 -N HG003 --set-reference GRCh38 \ - -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam 2> /dev/null + -f haplotype-sampling/HG003.fq.gz > HG003_GRCh38.gam is $? 0 "Giraffe integration with a specified reference sample" cmp diploid3.gbz GRCh38.HG003.gbz is $? 0 "the sampled graph is identical to a manually sampled one" +# Giraffe integration, haplotype index built automatically from the GBZ. +# Providing --kff-name without --haplotype-name implies haplotype sampling; +# the haplotype index (.hapl) is built by the IndexRegistry from the GBZ. +rm -f auto_hapl.HG003.* auto_hapl.gam +vg giraffe -Z full.gbz --kff-name haplotype-sampling/HG003.kff \ + --index-basename auto_hapl -N HG003 \ + -f haplotype-sampling/HG003.fq.gz > auto_hapl.gam +is $? 0 "Giraffe builds haplotype index automatically" +is "$(vg gbwt -H -Z auto_hapl.HG003.gbz)" 4 "auto-built hapl produces 2 diploid + 2 reference haplotypes" + +# Giraffe integration, kmer counting done automatically from reads. +# Providing --haplotype-name without --kff-name implies haplotype sampling; +# the kmer counts (.kff) are built by the IndexRegistry from the reads using kmc. 
+rm -f auto_kff.HG003.* auto_kff.gam
+vg giraffe -Z full.gbz --haplotype-name full.hapl \
+    --index-basename auto_kff -N HG003 \
+    -f haplotype-sampling/HG003.fq.gz > auto_kff.gam
+is $? 0 "Giraffe counts kmers from reads automatically"
+is "$(vg gbwt -H -Z auto_kff.HG003.gbz)" 4 "auto kmer counting produces 2 diploid + 2 reference haplotypes"
+
+# Giraffe integration, fully automatic: both hapl and kff are built by the
+# IndexRegistry. Triggered by --haplotype-sampling without either input file.
+rm -f auto_all.HG003.* auto_all.gam
+vg giraffe -Z full.gbz --haplotype-sampling \
+    --index-basename auto_all -N HG003 \
+    -f haplotype-sampling/HG003.fq.gz > auto_all.gam
+is $? 0 "Giraffe does fully automatic haplotype sampling"
+is "$(vg gbwt -H -Z auto_all.HG003.gbz)" 4 "fully automatic haplotype sampling produces 2 diploid + 2 reference haplotypes"
+
+# Giraffe integration, non-diploid
+rm -f auto_nondip.HG003.* auto_nondip.gam
+vg giraffe -Z full.gbz --haplotype-sampling --no-diploid-sampling --num-haplotypes 3 \
+    --index-basename auto_nondip -N HG003 \
+    -f haplotype-sampling/HG003.fq.gz > auto_nondip.gam
+is $? 0 "Giraffe does non-diploid haplotype sampling"
+is "$(vg gbwt -H -Z auto_nondip.HG003.gbz)" 5 "non-diploid haplotype sampling produces 3 requested + 2 reference haplotypes"
+
+# Giraffe integration, single reference
+rm -f auto_oneref.HG003.* auto_oneref.gam
+vg giraffe -Z full.gbz --haplotype-sampling --set-reference GRCh38 \
+    --index-basename auto_oneref -N HG003 \
+    -f haplotype-sampling/HG003.fq.gz > auto_oneref.gam
+is $? 0 "Giraffe does haplotype sampling when setting reference"
+is "$(vg gbwt -H -Z auto_oneref.HG003.gbz)" 3 "setting reference produces 2 diploid + 1 reference haplotypes"
+
 # Attempts to use mismatched files fail
 vg haplotypes -i full.hapl -k haplotype-sampling/HG003.kff -g /dev/null diploid.gbz 2> log.txt
 is $?
1 "sampling with mismatched GBZ and haplotype information fails" @@ -104,4 +152,9 @@ rm -f diploid.gbz diploid2.gbz diploid3.gbz rm -f full.HG003.* default.gam rm -f sampled.003HG.* specified.gam rm -f GRCh38.HG003.* HG003_GRCh38.gam -rm -f log.txt \ No newline at end of file +rm -f auto_hapl.HG003.* auto_hapl.gam +rm -f auto_kff.HG003.* auto_kff.gam +rm -f auto_all.HG003.* auto_all.gam +rm -f auto_nondip.HG003.* auto_nondip.gam +rm -f auto_oneref.HG003.* auto_oneref.gam +rm -f log.txt