From 9efb2033d0e0239e9971605a8097ad98868bd3f6 Mon Sep 17 00:00:00 2001
From: Garret Rieger <grieger@google.com>
Date: Wed, 18 Feb 2026 23:01:16 +0000
Subject: [PATCH 1/3] Very basic implementation of a segmenter config
 generator.

This new util can analyze an input font and generate a segmenter config for it. Currently very early stages.
---
 README.md                                  |  10 +
 util/BUILD                                 |  58 +++
 util/auto_segmenter_config.cc              | 535 +++++++++++++++++++++
 util/auto_segmenter_config.h               |  37 ++
 util/auto_segmenter_config_test.cc         | 279 +++++++++++
 util/closure_glyph_keyed_segmenter_util.cc |  22 +-
 util/generate_segmenter_config.cc          |  55 +++
 util/load_codepoints_test.cc               |   5 +
 8 files changed, 998 insertions(+), 3 deletions(-)
 create mode 100644 util/auto_segmenter_config.cc
 create mode 100644 util/auto_segmenter_config.h
 create mode 100644 util/auto_segmenter_config_test.cc
 create mode 100644 util/generate_segmenter_config.cc

diff --git a/README.md b/README.md
index d861e63d..6b2549ff 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,16 @@ script:
 ./check-format.sh --fix
 ```
 
+## Documentation
+
+The documents under [docs/experimental](docs/experimental) provide some more detailed designs of various aspects of the IFT encoder. Of note:
+* [compiler.md](docs/experimental/compiler.md)
+* [closure_glyph_segmentation.md](docs/experimental/closure_glyph_segmentation.md)
+* [closure_glyph_segmentation_merging.md](docs/experimental/closure_glyph_segmentation_merging.md)
+* [closure_glyph_segmentation_complex_conditions.md](docs/experimental/closure_glyph_segmentation_complex_conditions.md)
+
+Together these provide a detailed design of how the two major pieces (segmentation and compilation) of IFT font encoding work.
+
 ## Generating compile_commands.json for IDE
 
 This repo is configured to use [hedron](https://github.com/hedronvision/bazel-compile-commands-extractor) to produce a
diff --git a/util/BUILD b/util/BUILD
index 6248f0c8..e46fec96 100644
--- a/util/BUILD
+++ b/util/BUILD
@@ -103,6 +103,7 @@ cc_binary(
         "@ift_encoder_data//:freq_data",
     ],
     deps = [
+        ":auto_segmenter_config",
         ":load_codepoints",
         ":segmentation_plan_cc_proto",
         ":segmenter_config_cc_proto",
@@ -154,6 +155,23 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "auto_segmenter_config",
+    srcs = [
+        "auto_segmenter_config.cc",
+    ],
+    hdrs = [
+        "auto_segmenter_config.h",
+    ],
+    deps = [
+        ":load_codepoints",
+        ":segmenter_config_cc_proto",
+        "//common",
+        "@abseil-cpp//absl/container:flat_hash_set",
+        "@harfbuzz",
+    ],
+)
+
 cc_library(
     name = "load_codepoints",
     srcs = [
@@ -192,6 +210,24 @@ cc_library(
     ],
 )
 
+cc_test(
+    name = "auto_segmenter_config_test",
+    size = "small",
+    srcs = [
+        "auto_segmenter_config_test.cc",
+    ],
+    data = [
+        "//common:testdata",
+        "@ift_encoder_data//:freq_data",
+    ],
+    deps = [
+        ":auto_segmenter_config",
+        "//common",
+        "@googletest//:gtest_main",
+        "@harfbuzz",
+    ],
+)
+
 cc_test(
     name = "convert_iftb_test",
     size = "small",
@@ -247,6 +283,28 @@ cc_test(
     ],
 )
 
+cc_binary(
+    name = "generate_segmenter_config",
+    srcs = [
+        "generate_segmenter_config.cc",
+    ],
+    data = [
+        "@ift_encoder_data//:freq_data",
+    ],
+    deps = [
+        ":auto_segmenter_config",
+        ":load_codepoints",
+        ":segmenter_config_cc_proto",
+        "//common",
+        "@abseil-cpp//absl/flags:flag",
+        "@abseil-cpp//absl/flags:parse",
+        "@abseil-cpp//absl/log:initialize",
+        "@abseil-cpp//absl/status",
+        "@harfbuzz",
+        "@protobuf",
+    ],
+)
+
 cc_binary(
     name = "iftb2config",
     srcs = [
diff --git a/util/auto_segmenter_config.cc b/util/auto_segmenter_config.cc
new file mode 100644
index 00000000..f3f99f26
--- /dev/null
+++ b/util/auto_segmenter_config.cc
@@ -0,0 +1,535 @@
+#include "util/auto_segmenter_config.h"
+
+#include <cctype>
+#include <string>
+#include <unordered_map>
+
+#include "absl/container/flat_hash_set.h"
+#include "absl/log/log.h"
+#include "absl/strings/match.h"
+#include "absl/strings/strip.h"
+#include "common/font_helper.h"
+#include "common/int_set.h"
+#include "common/try.h"
+#include "hb.h"
+#include "util/load_codepoints.h"
+#include "util/segmenter_config.pb.h"
+
+using absl::btree_set;
+using absl::flat_hash_map;
+using absl::flat_hash_set;
+using absl::Status;
+using absl::StatusOr;
+using common::CodepointSet;
+using common::FontHelper;
+
+namespace util {
+
+static constexpr uint32_t kMinimumGroupSize = 4;
+
+// TODO(garretrieger): define a very basic set of quality levels first (see next TODO),
+//   start with just a lowest and highest to set the upper and lower bounds for quality
+//   settings (maybe also a mid point). To begin use number of codepoints to select quality
+//   level. Do some testing on segmentation times at low and high to get a sense of
+//   how times are impacted.
+
+// TODO(garretrieger): do something analogous to brotli quality levels
+// where we define a series of levels which correspond to a set of
+// values for the quality/performance tradeoff settings (including setting the
+// brotli quality level). Then we need a heuristic to pick a quality level for a
+// font.
+//
+// If we have the ability to estimate the number of brotli ops resulting from
+// a specific quality level (including a multiplier for the particular brotli
+// quality) then we can select a quality level which keeps brotli ops and closure
+// ops within a specific range.
+//
+// Then can also have a flag/input to force a specific quality level.
+//
+// To start, the list of parameters we can use to make quality/performance
+// tradeoffs:
+//
+// - unmapped_glyph_handling (global)
+//     Lower quality is to not find conditions (so use patch or init font), high quality
+//     is to find conditions.
+//
+// - generate_feature_segments (global): high quality generate segment per feature, low quality put all optional features
+//     in one segment.
+//
+// - brotli_quality (global)
+//     Use estimated number of brotli ops (per merge group) to set this. Take
+//     into account the effects of preprocess merging prior to selecting this.
+//     To start use 0, 9 or 11 (avoid qualities less than 9 other than 0).
+//
+// - brotli_quality_for_initial_font_merging (global)
+//     Use estimated number of brotli ops for the init font processing to set
+//     this (by looking at what's potentially in scope).
+//
+// - preprocess_merging_group_size_for_ungrouped (global)
+//     Would be reasonable to always have this set to at least the minimum group
+//     size.
+//
+// - condition_analysis_mode: always use CLOSURE_AND_DEP_GRAPH.
+//
+// Merge group settings:
+//
+// - preprocess_merging_group_size (merge group)
+// - preprocess_merging_probability_threshold (merge group)
+//     Set these for merge groups with very large size, using probability
+//     threshold first, then group size to clamp ops to a reasonable value.
+//     group size always starts at the min group size.
+//
+// - use_bigrams (merge group, cost)
+//     Probably always want this on, use other settings instead to increase
+//     performance. On very lowest quality could be disabled
+//
+// - optimization_cutoff_fraction (merge group, cost)
+//     For now, probably ok with a global setting of somewhere around 1 to 2.5%
+//     (doesn't vary).
+//
+// - initial_font_merge_probability_threshold (merge group, cost)
+//     May be ok with a global setting, start with 50%
+//
+// - best_case_size_reduction_fraction (merge group, cost)
+//     Default is probably fine, but may be worth changing.
+//
+// - min/max patch_size (merge group, heuristic):
+//     Probably fixed value for all qualities, has minimal impact on performance.
+//
+// We may want a quality level per merge group, for the init font merge,
+// and global
+//
+// Utilizing quality levels:
+// - Have a configurable setting to the auto config call which specifies a rough encoding budget
+//   (ie. O(1 min), O(10 min), O(1 hour)). Then try to estimate the encoding time at each
+//   quality level and select the quality level which gets estimated time within the budget.
+// - Brotli and closure ops can both contribute significantly to overall segmenting times,
+//   so we will need to first estimate the typical brotli and closure operation time cost
+//   for the particular font (eg. run a few random closures and brotli compressions)
+// - Then estimate the number of ops that are needed. For closure take into account
+//   how much savings the dep graph can provide.
+// - Finally, overall time can be estimated as (number ops) * (op time) *
+//   (fixed scaling factor) for both brotli and closure. Total time is the sum.
+
+// TODO(garretrieger): to help speed up init font processing times when latin is primary script
+// consider adding the latin alphabet (upper and lower) directly to the init font. Similar things
+// could be done for other scripts if we can find data on what the "core" alphabet is.
+
+// TODO(garretrieger): collect data on brotli compression times as a function of
+// quality assuming group sizes of 4 using a CJK font
+
+// Classify a freq file name by its kind prefix ("Script_" or "Language_").
+static bool IsScript(absl::string_view file_name) {
+  return absl::ConsumePrefix(&file_name, "Script_");
+}
+static bool IsLanguage(absl::string_view file_name) {
+  return absl::ConsumePrefix(&file_name, "Language_");
+}
+
+// Turns a freq file name into a display name: strips a leading "Script_" and
+// everything from the first '.', then capitalizes the first letter. For
+// example "Script_foo.riegeli" becomes "Foo".
+static std::string ScriptName(absl::string_view script_name) {
+  absl::ConsumePrefix(&script_name, "Script_");
+  std::string name(script_name.substr(0, script_name.find('.')));
+  if (!name.empty()) {
+    // <cctype> functions are undefined for negative char values, so convert
+    // through unsigned char in case the name starts with a non-ASCII byte.
+    const auto first = static_cast<unsigned char>(name[0]);
+    if (std::islower(first)) {
+      name[0] = static_cast<char>(std::toupper(first));
+    }
+  }
+  return name;
+}
+
+// Returns the freq data file names of the CJK family: the unified CJK list
+// plus the language specific CJK lists.
+static flat_hash_set<std::string> CjkScripts() {
+  flat_hash_set<std::string> cjk_file_names{
+      "Script_CJK.riegeli@*", "Script_japanese.riegeli@*",
+      "Script_korean.riegeli@*", "Script_chinese-simplified.riegeli@*",
+      "Script_chinese-traditional.riegeli@*"};
+  return cjk_file_names;
+}
+
+// Returns the codepoints that occur in more than one script frequency list.
+//
+// Only "Script_" lists are counted. The unified CJK list is always left out
+// because it is a combination of the language specific CJK lists. With
+// cjk_only set, lists that are not in CjkScripts() are left out as well.
+static CodepointSet CommonCodepoints(
+    const flat_hash_map<std::string, CodepointSet>& freq_list, bool cjk_only) {
+  const flat_hash_set<std::string> cjk_file_names = CjkScripts();
+
+  // Number of counted lists that each codepoint occurs in.
+  flat_hash_map<hb_codepoint_t, uint32_t> occurrences;
+  for (const auto& [file_name, script_codepoints] : freq_list) {
+    const bool counted = IsScript(file_name) &&
+                         file_name != "Script_CJK.riegeli@*" &&
+                         (!cjk_only || cjk_file_names.contains(file_name));
+    if (!counted) {
+      continue;
+    }
+
+    for (hb_codepoint_t u : script_codepoints) {
+      ++occurrences[u];
+    }
+  }
+
+  // Keep the codepoints seen in at least two of the counted lists.
+  CodepointSet shared;
+  for (const auto& [u, count] : occurrences) {
+    if (count >= 2) {
+      shared.insert(u);
+    }
+  }
+
+  return shared;
+}
+
+// Returns the freq file names (scripts and "fallback.riegeli") that have more
+// than one distinguishing codepoint in `unicodes`. Language specific CJK
+// scripts may be collapsed into the unified CJK script, see below.
+static btree_set<std::string> DetectScripts(
+    const flat_hash_map<std::string, CodepointSet>& freq_list,
+    const CodepointSet& unicodes) {
+  btree_set<std::string> detected_scripts;
+  flat_hash_set<std::string> detected_cjk_scripts;
+  CodepointSet common = CommonCodepoints(freq_list, false);
+  auto cjk_scripts = CjkScripts();
+
+  for (const auto& [file_name, script_codepoints] : freq_list) {
+    if (!IsScript(file_name) && file_name != "fallback.riegeli") {
+      continue;
+    }
+    if (file_name == "Script_CJK.riegeli@*") {
+      // special cased later.
+      continue;
+    }
+
+    // To avoid false positives from codepoints that scripts share (eg. ASCII
+    // and punctuation), drop codepoints found in more than one script list.
+    CodepointSet unique_codepoints = script_codepoints;
+    unique_codepoints.subtract(common);
+    CodepointSet intersection = unique_codepoints;
+    intersection.intersect(unicodes);
+
+    // TODO(garretrieger): consider using a threshold on intersection size here.
+    if (intersection.size() > 1) {
+      LOG(INFO) << "Script " << file_name << " is present, "
+                << intersection.size() << " codepoints.";
+      detected_scripts.insert(file_name);
+      if (cjk_scripts.contains(file_name)) {
+        detected_cjk_scripts.insert(file_name);
+      }
+    }
+  }
+
+  // The language specific CJK scripts all overlap, so when more than one was
+  // detected, or only codepoints shared between CJK scripts are present, use
+  // the unified CJK script instead. NOTE(review): `common` already holds every
+  // codepoint shared by two CJK scripts, so the second case looks unreachable.
+  CodepointSet only_cjk_common = CommonCodepoints(freq_list, true);
+  only_cjk_common.subtract(common);
+  if (detected_cjk_scripts.size() > 1 ||
+      (detected_cjk_scripts.empty() && only_cjk_common.intersects(unicodes))) {
+    // upgrade from individual CJK scripts to the unified one.
+    for (const auto& script : detected_cjk_scripts) {
+      detected_scripts.erase(script);
+    }
+    LOG(INFO) << "Script_CJK.riegeli@* added to detected list.";
+    detected_scripts.insert("Script_CJK.riegeli@*");
+  }
+  return detected_scripts;
+}
+
+// Returns the name of the freq file that is either exactly base_name or
+// base_name followed by an extension ('.' and anything after it).
+static StatusOr<std::string> FindFileName(
+    absl::string_view base_name,
+    const flat_hash_map<std::string, CodepointSet>& built_in_freqs) {
+  for (const auto& entry : built_in_freqs) {
+    absl::string_view rest = entry.first;
+    const bool has_base = absl::ConsumePrefix(&rest, base_name);
+    if (has_base && (rest.empty() || rest.front() == '.')) return entry.first;
+  }
+  return absl::NotFoundError(
+      absl::StrCat("Freq file for ", base_name, " was not found."));
+}
+
+StatusOr<std::string> AutoSegmenterConfig::GetBaseScriptForLanguage(
+    absl::string_view language) {
+  // Callers may pass a freq file name rather than a bare language name, so
+  // drop either known extension first. StripSuffix() leaves the input as is
+  // when the suffix is not present.
+  for (absl::string_view extension : {".riegeli", ".riegeli@*"}) {
+    language = absl::StripSuffix(language, extension);
+  }
+
+  static const auto* lang_to_script =
+      new std::unordered_map<std::string, std::string>{
+          {"Language_af", "Script_latin"},
+          {"Language_ak", "Script_latin"},
+          {"Language_am", "Script_ethiopic"},
+          {"Language_ar", "Script_arabic"},
+          {"Language_ar-Latn", "Script_latin"},
+          {"Language_as", "Script_bengali"},
+          {"Language_ay", "Script_latin"},
+          {"Language_az", "Script_latin"},
+          {"Language_be", "Script_cyrillic"},
+          {"Language_bg", "Script_cyrillic"},
+          {"Language_bg-Latn", "Script_latin"},
+          {"Language_bho", "Script_devanagari"},
+          {"Language_bm", "Script_latin"},
+          {"Language_bn", "Script_bengali"},
+          {"Language_bn-Latn", "Script_latin"},
+          {"Language_bs", "Script_latin"},
+          {"Language_ca", "Script_latin"},
+          {"Language_ceb", "Script_latin"},
+          {"Language_ckb", "Script_arabic"},
+          {"Language_co", "Script_latin"},
+          {"Language_cs", "Script_latin"},
+          {"Language_cy", "Script_latin"},
+          {"Language_da", "Script_latin"},
+          {"Language_de", "Script_latin"},
+          {"Language_doi", "Script_devanagari"},
+          {"Language_dv", "Script_thaana"},
+          {"Language_ee", "Script_latin"},
+          {"Language_el", "Script_greek"},
+          {"Language_el-Latn", "Script_latin"},
+          {"Language_en", "Script_latin"},
+          {"Language_en-Cyrl", "Script_cyrillic"},
+          {"Language_eo", "Script_latin"},
+          {"Language_es", "Script_latin"},
+          {"Language_et", "Script_latin"},
+          {"Language_eu", "Script_latin"},
+          {"Language_fa", "Script_arabic"},
+          {"Language_ff", "Script_latin"},
+          {"Language_fi", "Script_latin"},
+          {"Language_fil", "Script_latin"},
+          {"Language_fr", "Script_latin"},
+          {"Language_fy", "Script_latin"},
+          {"Language_ga", "Script_latin"},
+          {"Language_gd", "Script_latin"},
+          {"Language_gl", "Script_latin"},
+          {"Language_gn", "Script_latin"},
+          {"Language_gu", "Script_gujarati"},
+          {"Language_gu-Latn", "Script_latin"},
+          {"Language_ha", "Script_latin"},
+          {"Language_haw", "Script_latin"},
+          {"Language_hi", "Script_devanagari"},
+          {"Language_hi-Latn", "Script_latin"},
+          {"Language_hmn", "Script_latin"},
+          {"Language_hr", "Script_latin"},
+          {"Language_ht", "Script_latin"},
+          {"Language_hu", "Script_latin"},
+          {"Language_hy", "Script_armenian"},
+          {"Language_id", "Script_latin"},
+          {"Language_ig", "Script_latin"},
+          {"Language_ilo", "Script_latin"},
+          {"Language_is", "Script_latin"},
+          {"Language_it", "Script_latin"},
+          {"Language_iw", "Script_hebrew"},
+          {"Language_ja", "Script_japanese"},
+          {"Language_ja-Latn", "Script_latin"},
+          {"Language_jv", "Script_latin"},
+          {"Language_ka", "Script_georgian"},
+          {"Language_kk", "Script_cyrillic"},
+          {"Language_kl", "Script_latin"},
+          {"Language_km", "Script_khmer"},
+          {"Language_kn", "Script_kannada"},
+          {"Language_kn-Latn", "Script_latin"},
+          {"Language_ko", "Script_korean"},
+          {"Language_kok", "Script_devanagari"},
+          {"Language_kri", "Script_latin"},
+          {"Language_ku", "Script_latin"},
+          {"Language_ky", "Script_cyrillic"},
+          {"Language_la", "Script_latin"},
+          {"Language_lb", "Script_latin"},
+          {"Language_lg", "Script_latin"},
+          {"Language_ln", "Script_latin"},
+          {"Language_lo", "Script_lao"},
+          {"Language_lt", "Script_latin"},
+          {"Language_lus", "Script_latin"},
+          {"Language_lv", "Script_latin"},
+          {"Language_mai", "Script_devanagari"},
+          {"Language_mg", "Script_latin"},
+          {"Language_mi", "Script_latin"},
+          {"Language_mk", "Script_cyrillic"},
+          {"Language_ml", "Script_malayalam"},
+          {"Language_ml-Latn", "Script_latin"},
+          {"Language_mn", "Script_cyrillic"},
+          {"Language_mni-Mtei", "Script_meetei-mayek"},
+          {"Language_mr", "Script_devanagari"},
+          {"Language_mr-Latn", "Script_latin"},
+          {"Language_ms", "Script_latin"},
+          {"Language_mt", "Script_latin"},
+          {"Language_my", "Script_myanmar"},
+          {"Language_ne", "Script_devanagari"},
+          {"Language_nl", "Script_latin"},
+          {"Language_no", "Script_latin"},
+          {"Language_nso", "Script_latin"},
+          {"Language_ny", "Script_latin"},
+          {"Language_om", "Script_latin"},
+          {"Language_or", "Script_oriya"},
+          {"Language_pa", "Script_gurmukhi"},
+          {"Language_pl", "Script_latin"},
+          {"Language_ps", "Script_arabic"},
+          {"Language_pt", "Script_latin"},
+          {"Language_qu", "Script_latin"},
+          {"Language_ro", "Script_latin"},
+          {"Language_ru", "Script_cyrillic"},
+          {"Language_ru-Latn", "Script_latin"},
+          {"Language_rw", "Script_latin"},
+          {"Language_sa", "Script_devanagari"},
+          {"Language_sd", "Script_arabic"},
+          {"Language_si", "Script_sinhala"},
+          {"Language_sk", "Script_latin"},
+          {"Language_sl", "Script_latin"},
+          {"Language_sm", "Script_latin"},
+          {"Language_sn", "Script_latin"},
+          {"Language_so", "Script_latin"},
+          {"Language_sq", "Script_latin"},
+          {"Language_sr", "Script_cyrillic"},
+          {"Language_st", "Script_latin"},
+          {"Language_su", "Script_latin"},
+          {"Language_sv", "Script_latin"},
+          {"Language_sw", "Script_latin"},
+          {"Language_ta", "Script_tamil"},
+          {"Language_ta-Latn", "Script_latin"},
+          {"Language_te", "Script_telugu"},
+          {"Language_te-Latn", "Script_latin"},
+          {"Language_tg", "Script_cyrillic"},
+          {"Language_th", "Script_thai"},
+          {"Language_ti", "Script_ethiopic"},
+          {"Language_tk", "Script_latin"},
+          {"Language_tr", "Script_latin"},
+          {"Language_ts", "Script_latin"},
+          {"Language_tt", "Script_cyrillic"},
+          {"Language_ug", "Script_arabic"},
+          {"Language_uk", "Script_cyrillic"},
+          {"Language_ur", "Script_arabic"},
+          {"Language_uz", "Script_latin"},
+          {"Language_vi", "Script_latin"},
+          {"Language_xh", "Script_latin"},
+          {"Language_yi", "Script_hebrew"},
+          {"Language_yo", "Script_latin"},
+          {"Language_zh-Hani", "Script_chinese-simplified"},
+          {"Language_zh-Hans", "Script_chinese-simplified"},
+          {"Language_zh-Hant", "Script_chinese-traditional"},
+          {"Language_zh-Latn", "Script_latin"},
+          {"Language_zu", "Script_latin"},
+      };
+  const auto match = lang_to_script->find(std::string(language));
+  if (match == lang_to_script->end()) {
+    return absl::NotFoundError(
+        absl::StrCat("Unable to find base script for ", language));
+  }
+  return match->second;
+}
+
+// Makes primary_script (a "Script_*" or "Language_*" freq data name) part of
+// detected_scripts, removing the detected script(s) which it replaces.
+static Status ApplyPrimaryScript(
+    const flat_hash_map<std::string, CodepointSet>& freq_list,
+    std::string primary_script, btree_set<std::string>& detected_scripts) {
+  // Resolve the script which the primary script/language is based on.
+  std::string base_name;
+  if (IsLanguage(primary_script)) {
+    base_name =
+        TRY(AutoSegmenterConfig::GetBaseScriptForLanguage(primary_script));
+  } else if (IsScript(primary_script)) {
+    base_name = primary_script;
+  } else {
+    return absl::InternalError(
+        absl::StrCat("Unknown freq file type: ", primary_script));
+  }
+  const std::string primary_base_script =
+      TRY(FindFileName(base_name, freq_list));
+  primary_script = TRY(FindFileName(primary_script, freq_list));
+  LOG(INFO) << "Primary script/language: " << primary_script;
+  LOG(INFO) << "Primary base script is " << primary_base_script;
+
+  // The primary script takes the place of its base script. When the base
+  // script is a CJK script it takes the place of all CJK scripts.
+  detected_scripts.erase(primary_base_script);
+  const auto cjk_scripts = CjkScripts();
+  if (cjk_scripts.contains(primary_base_script)) {
+    for (const auto& script : cjk_scripts) {
+      detected_scripts.erase(script);
+    }
+  }
+  detected_scripts.insert(primary_script);
+  return absl::OkStatus();
+}
+
+// Builds a segmenter config for `face`. Global settings are fixed apart from
+// the brotli quality, which depends on how many codepoints the font has. One
+// merge group is added per detected script, and only the group of the primary
+// script (default "Script_latin") gets initial font merge thresholds.
+absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
+    hb_face_t* face, std::optional<std::string> primary_script) {
+  const std::string primary = primary_script.value_or("Script_latin");
+
+  SegmenterConfig config;
+  config.set_generate_table_keyed_segments(true);
+  config.set_generate_feature_segments(true);
+  config.set_unmapped_glyph_handling(FIND_CONDITIONS);
+  config.set_condition_analysis_mode(CLOSURE_AND_DEP_GRAPH);
+  config.mutable_ungrouped_config()->set_min_patch_size(2500);
+  config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize);
+
+  auto* plan = config.mutable_base_segmentation_plan();
+  plan->set_jump_ahead(2);
+  plan->set_use_prefetch_lists(true);
+
+  // TODO(garretrieger): alternate approach - estimate the number of brotli ops
+  // (including accounting for pairs only within merge groups), and then select
+  // the cutoffs and premerging to keep the number of brotli ops within a
+  // specific range.
+  auto* base_cost = config.mutable_base_cost_config();
+  base_cost->set_use_bigrams(true);
+  // Minimum group size is as recommended by the spec.
+  base_cost->set_min_group_size(kMinimumGroupSize);
+  base_cost->set_optimization_cutoff_fraction(0.01);
+
+  // Detect scripts by intersecting the font's codepoints with the built in
+  // frequency data.
+  auto freq_list = TRY(BuiltInFrequenciesList());
+  const CodepointSet unicodes = FontHelper::ToCodepointsSet(face);
+  btree_set<std::string> detected_scripts = DetectScripts(freq_list, unicodes);
+
+  // Quality tradeoff based on codepoint count.
+  config.set_brotli_quality(unicodes.size() > 2000 ? 9 : 11);
+
+  TRYV(ApplyPrimaryScript(freq_list, primary, detected_scripts));
+  const std::string primary_script_file =
+      TRY(FindFileName(primary, freq_list));
+
+  // Every detected script gets a merge group, the primary script included.
+  for (const std::string& script : detected_scripts) {
+    auto* group = config.add_merge_groups();
+    group->set_name(ScriptName(script));
+
+    // TODO(garretrieger): use a heuristic to select probability threshold based
+    // on estimated number of brotli ops (assuming O(n^2) on codepoints in the
+    // group).
+    group->set_preprocess_merging_group_size(kMinimumGroupSize);
+    group->set_preprocess_merging_probability_threshold(0.001);
+
+    auto* cost = group->mutable_cost_config();
+    cost->set_built_in_freq_data_name(script);
+    if (script != primary_script_file) {
+      continue;
+    }
+
+    // TODO(garretrieger): customize these values based on the quality level
+    cost->set_initial_font_merge_threshold(-60);
+    cost->set_initial_font_merge_probability_threshold(0.40);
+  }
+
+  return config;
+}
+
+}  // namespace util
diff --git a/util/auto_segmenter_config.h b/util/auto_segmenter_config.h
new file mode 100644
index 00000000..2874fd87
--- /dev/null
+++ b/util/auto_segmenter_config.h
@@ -0,0 +1,37 @@
+#ifndef UTIL_AUTO_SEGMENTER_CONFIG_H_
+#define UTIL_AUTO_SEGMENTER_CONFIG_H_
+
+#include <optional>
+#include <string>
+
+#include "absl/status/statusor.h"
+#include "absl/strings/string_view.h"
+#include "hb.h"
+#include "util/segmenter_config.pb.h"
+
+namespace util {
+
+// Static-only helper that derives a SegmenterConfig from a font.
+class AutoSegmenterConfig {
+ public:
+  AutoSegmenterConfig() = delete;  // not instantiable, static methods only.
+
+  // Generates a segmenter configuration suited to the provided font face.
+  //
+  // primary_script: optional name of a script or language frequency data
+  //                 file (e.g., "Script_cyrillic", "Language_fr"); when
+  //                 absent "Script_latin" is used.
+  static absl::StatusOr<SegmenterConfig> GenerateConfig(
+      hb_face_t* face,
+      std::optional<std::string> primary_script = std::nullopt);
+
+  // Maps a language frequency data name to the name of the script it is
+  // based on, for example "Language_fr" -> "Script_latin". Returns a
+  // NotFound error for unrecognized languages.
+  static absl::StatusOr<std::string> GetBaseScriptForLanguage(
+      absl::string_view language);
+};
+
+}  // namespace util
+
+#endif  // UTIL_AUTO_SEGMENTER_CONFIG_H_
diff --git a/util/auto_segmenter_config_test.cc b/util/auto_segmenter_config_test.cc
new file mode 100644
index 00000000..c756a592
--- /dev/null
+++ b/util/auto_segmenter_config_test.cc
@@ -0,0 +1,279 @@
+#include "util/auto_segmenter_config.h"
+
+#include <google/protobuf/text_format.h>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "absl/strings/match.h"
+#include "common/font_data.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include "hb.h"
+#include "util/load_codepoints.h"
+
+namespace util {
+namespace {
+
+using ::common::hb_blob_unique_ptr;
+using ::common::hb_face_unique_ptr;
+using ::common::make_hb_blob;
+using ::common::make_hb_face;
+using google::protobuf::TextFormat;
+using ::testing::Eq;
+using ::testing::Pair;
+using ::testing::UnorderedElementsAre;
+
+class AutoSegmenterConfigTest : public ::testing::Test {
+ protected:
+  AutoSegmenterConfigTest()
+      : face_(make_hb_face(nullptr)), cjk_face_(make_hb_face(nullptr)) {}
+
+  void SetUp() override {
+    hb_blob_unique_ptr roboto = make_hb_blob(
+        hb_blob_create_from_file("common/testdata/Roboto-Regular.ttf"));
+    face_ = make_hb_face(hb_face_create(roboto.get(), 0));
+    hb_blob_unique_ptr noto_jp = make_hb_blob(
+        hb_blob_create_from_file("common/testdata/NotoSansJP-Regular.ttf"));
+    // cjk_face_ stays null (tests then skip) if the font blob is empty.
+    if (hb_blob_get_length(noto_jp.get()) != 0) {
+      cjk_face_ = make_hb_face(hb_face_create(noto_jp.get(), 0));
+    }
+  }
+
+  hb_face_unique_ptr face_;      // Roboto Regular
+  hb_face_unique_ptr cjk_face_;  // Noto Sans JP Regular, null if unavailable
+};
+
+using ScriptPair = std::pair<std::string, std::string>;
+
+// Lists each merge group as a (name, built in freq data name) pair.
+static std::vector<ScriptPair> GetScripts(const SegmenterConfig& config) {
+  std::vector<ScriptPair> pairs;
+  for (const auto& g : config.merge_groups())
+    pairs.emplace_back(g.name(), g.cost_config().built_in_freq_data_name());
+  return pairs;
+}
+
+// Names of the merge groups which have initial_font_merge_threshold set.
+static std::vector<std::string> GetScriptsWithInitialMergeThreshold(
+    const SegmenterConfig& config) {
+  std::vector<std::string> names;
+  for (const auto& group : config.merge_groups()) {
+    if (!group.cost_config().has_initial_font_merge_threshold()) continue;
+    names.push_back(group.name());
+  }
+  return names;
+}
+
+const ScriptPair kLatin = {"Latin", "Script_latin.riegeli"};
+const ScriptPair kCyrillic = {"Cyrillic", "Script_cyrillic.riegeli"};
+const ScriptPair kGreek = {"Greek", "Script_greek.riegeli"};
+const ScriptPair kSymbols = {"Symbols", "Script_symbols.riegeli"};
+const ScriptPair kEmoji = {"Emoji", "Script_emoji.riegeli"};
+const ScriptPair kCJK = {"CJK", "Script_CJK.riegeli@*"};
+const ScriptPair kFallback = {"Fallback", "fallback.riegeli"};
+
+// Without a primary script Latin is used; pins the full generated config.
+TEST_F(AutoSegmenterConfigTest, Roboto_UnspecifiedPrimary) {
+  auto config_or = AutoSegmenterConfig::GenerateConfig(face_.get());
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(
+      GetScripts(*config_or),
+      UnorderedElementsAre(kLatin, kCyrillic, kGreek, kSymbols, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Latin"));
+  std::string config_string;
+  ASSERT_TRUE(TextFormat::PrintToString(*config_or, &config_string));
+  ASSERT_EQ(config_string, R"(unmapped_glyph_handling: FIND_CONDITIONS
+generate_table_keyed_segments: true
+brotli_quality: 11
+base_cost_config {
+  use_bigrams: true
+  min_group_size: 4
+  optimization_cutoff_fraction: 0.01
+}
+ungrouped_config {
+  min_patch_size: 2500
+}
+preprocess_merging_group_size_for_ungrouped: 4
+merge_groups {
+  name: "Cyrillic"
+  preprocess_merging_group_size: 4
+  preprocess_merging_probability_threshold: 0.001
+  cost_config {
+    built_in_freq_data_name: "Script_cyrillic.riegeli"
+  }
+}
+merge_groups {
+  name: "Greek"
+  preprocess_merging_group_size: 4
+  preprocess_merging_probability_threshold: 0.001
+  cost_config {
+    built_in_freq_data_name: "Script_greek.riegeli"
+  }
+}
+merge_groups {
+  name: "Latin"
+  preprocess_merging_group_size: 4
+  preprocess_merging_probability_threshold: 0.001
+  cost_config {
+    built_in_freq_data_name: "Script_latin.riegeli"
+    initial_font_merge_threshold: -60
+    initial_font_merge_probability_threshold: 0.4
+  }
+}
+merge_groups {
+  name: "Symbols"
+  preprocess_merging_group_size: 4
+  preprocess_merging_probability_threshold: 0.001
+  cost_config {
+    built_in_freq_data_name: "Script_symbols.riegeli"
+  }
+}
+merge_groups {
+  name: "Fallback"
+  preprocess_merging_group_size: 4
+  preprocess_merging_probability_threshold: 0.001
+  cost_config {
+    built_in_freq_data_name: "fallback.riegeli"
+  }
+}
+base_segmentation_plan {
+  jump_ahead: 2
+  use_prefetch_lists: true
+}
+generate_feature_segments: true
+condition_analysis_mode: CLOSURE_AND_DEP_GRAPH
+)");
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_ScriptCyrillic) {
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(face_.get(), "Script_cyrillic");
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(
+      GetScripts(*config_or),
+      UnorderedElementsAre(kLatin, kCyrillic, kGreek, kSymbols, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Cyrillic"));
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_LanguageFr) {
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(face_.get(), "Language_fr");
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(GetScripts(*config_or),
+              UnorderedElementsAre(Pair("Language_fr", "Language_fr.riegeli"),
+                                   kCyrillic, kGreek, kSymbols, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Language_fr"));
+}
+
+TEST_F(AutoSegmenterConfigTest, NotoSansJP_UnspecifiedPrimary) {
+  if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found";
+  auto config_or = AutoSegmenterConfig::GenerateConfig(cjk_face_.get());
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(GetScripts(*config_or),
+              UnorderedElementsAre(kLatin, kGreek, kCyrillic, kCJK, kSymbols,
+                                   kEmoji, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Latin"));
+}
+
+TEST_F(AutoSegmenterConfigTest, NotoSansJP_ScriptCJK) {
+  if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found";
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(cjk_face_.get(), "Script_CJK");
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(GetScripts(*config_or),
+              UnorderedElementsAre(kLatin, kGreek, kCyrillic, kCJK, kSymbols,
+                                   kEmoji, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("CJK"));
+}
+
+TEST_F(AutoSegmenterConfigTest, NotoSansJP_ScriptJapanese) {
+  if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found";
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(cjk_face_.get(), "Script_japanese");
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(
+      GetScripts(*config_or),
+      UnorderedElementsAre(kLatin, kGreek, kCyrillic,
+                           Pair("Japanese", "Script_japanese.riegeli@*"),
+                           kSymbols, kEmoji, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Japanese"));
+}
+
+TEST_F(AutoSegmenterConfigTest, NotoSansJP_LanguageZhHans) {
+  if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found";
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(cjk_face_.get(), "Language_zh-Hans");
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(GetScripts(*config_or),
+              UnorderedElementsAre(
+                  kLatin, kGreek, kCyrillic,
+                  Pair("Language_zh-Hans", "Language_zh-Hans.riegeli@*"),
+                  kSymbols, kEmoji, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Language_zh-Hans"));
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_ScriptNotFound) {
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(face_.get(), "Script_foobar");
+  EXPECT_EQ(config_or.status().code(), absl::StatusCode::kNotFound);
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_LanguageNotFound) {
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(face_.get(), "Language_foobar");
+  EXPECT_EQ(config_or.status().code(), absl::StatusCode::kNotFound);
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_InvalidPrefix) {
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(face_.get(), "Foo_latin");
+  EXPECT_EQ(config_or.status().code(), absl::StatusCode::kInternal);
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_FullFileName_Script) {
+  auto config_or = AutoSegmenterConfig::GenerateConfig(
+      face_.get(), "Script_cyrillic.riegeli");
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_THAT(
+      GetScripts(*config_or),
+      UnorderedElementsAre(kLatin, kCyrillic, kGreek, kSymbols, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or),
+              UnorderedElementsAre("Cyrillic"));
+}
+
+TEST_F(AutoSegmenterConfigTest, Roboto_FullFileName_Language) {
+  auto config_or =
+      AutoSegmenterConfig::GenerateConfig(face_.get(), "Language_fr.riegeli");
+  EXPECT_THAT(GetScripts(config_or.value()),  // value() aborts/throws if !ok()
+              UnorderedElementsAre(Pair("Language_fr", "Language_fr.riegeli"),
+                                   kCyrillic, kGreek, kSymbols, kFallback));
+  EXPECT_THAT(GetScriptsWithInitialMergeThreshold(config_or.value()),
+              UnorderedElementsAre("Language_fr"));
+}
+
+TEST_F(AutoSegmenterConfigTest, LanguageMappingsExist) {
+  auto built_in_freqs_or = util::BuiltInFrequenciesList();
+  ASSERT_TRUE(built_in_freqs_or.ok());
+  for (const auto& [file_name, _] : *built_in_freqs_or) {
+    if (!absl::StartsWith(file_name, "Language_")) continue;
+    std::string language = file_name;
+    size_t dot_pos = language.find('.');
+    if (dot_pos != std::string::npos) language = language.substr(0, dot_pos);
+    auto base_script = AutoSegmenterConfig::GetBaseScriptForLanguage(language);
+    ASSERT_TRUE(base_script.ok())
+        << "No mapping for " << language << ": " << base_script.status();
+  }
+}
+
+}  // namespace
+}  // namespace util
diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc
index 925d4337..09c19cb3 100644
--- a/util/closure_glyph_keyed_segmenter_util.cc
+++ b/util/closure_glyph_keyed_segmenter_util.cc
@@ -23,6 +23,7 @@
 #include "ift/encoder/merge_strategy.h"
 #include "ift/encoder/subset_definition.h"
 #include "ift/freq/unicode_frequencies.h"
+#include "util/auto_segmenter_config.h"
 #include "util/load_codepoints.h"
 #include "util/segmentation_plan.pb.h"
 #include "util/segmenter_config.pb.h"
@@ -42,6 +43,14 @@ ABSL_FLAG(
     "Path to a text proto file containing the configuration for the segmenter. "
     "Should contain a single SegmenterConfig message.");
 
+ABSL_FLAG(bool, auto_config, false,
+          "If set the segmenter configuration will be automatically generated "
+          "based on the input font.");
+
+ABSL_FLAG(std::string, primary_script, "Script_latin",
+          "When auto_config is enabled this sets the primary script or "
+          "language frequency data file to use.");
+
 ABSL_FLAG(bool, output_segmentation_plan, false,
           "If set a segmentation plan representing the determined segmentation "
           "will be output to stdout.");
@@ -81,9 +90,15 @@ using ift::encoder::Segment;
 using ift::encoder::SegmentationCost;
 using ift::encoder::SubsetDefinition;
 using ift::freq::UnicodeFrequencies;
+using util::AutoSegmenterConfig;
 using util::SegmenterConfigUtil;
 
-static StatusOr<SegmenterConfig> LoadConfig() {
+static StatusOr<SegmenterConfig> LoadConfig(hb_face_t* font) {
+  if (absl::GetFlag(FLAGS_auto_config)) {
+    return AutoSegmenterConfig::GenerateConfig(
+        font, absl::GetFlag(FLAGS_primary_script));
+  }
+
   FontData config_text =
       TRY(util::LoadFile(absl::GetFlag(FLAGS_config).c_str()));
   SegmenterConfig config;
@@ -191,9 +206,10 @@ static Status OutputFallbackGlyphCount(hb_face_t* original_face,
 static Status Main(const std::vector<char*> args) {
   hb_face_unique_ptr font =
       TRY(LoadFont(absl::GetFlag(FLAGS_input_font).c_str()));
-  SegmenterConfig config = TRY(LoadConfig());
+  SegmenterConfig config = TRY(LoadConfig(font.get()));
 
-  SegmenterConfigUtil config_util(absl::GetFlag(FLAGS_config));
+  SegmenterConfigUtil config_util(
+      absl::GetFlag(FLAGS_auto_config) ? "" : absl::GetFlag(FLAGS_config));
 
   CodepointSet font_codepoints = FontHelper::ToCodepointsSet(font.get());
   btree_set<hb_tag_t> font_features = FontHelper::GetFeatureTags(font.get());
diff --git a/util/generate_segmenter_config.cc b/util/generate_segmenter_config.cc
new file mode 100644
index 00000000..d0fa8133
--- /dev/null
+++ b/util/generate_segmenter_config.cc
@@ -0,0 +1,55 @@
+#include <google/protobuf/text_format.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "absl/flags/flag.h"
+#include "absl/flags/parse.h"
+#include "absl/log/globals.h"
+#include "absl/log/initialize.h"
+#include "absl/status/status.h"
+#include "common/font_data.h"
+#include "common/try.h"
+#include "util/auto_segmenter_config.h"
+#include "util/load_codepoints.h"
+
+ABSL_FLAG(std::string, input_font, "in.ttf",
+          "Path to the font file to analyze.");
+
+ABSL_FLAG(std::string, primary_script, "Script_latin",
+          "The primary script or language frequency data file to use.");
+
+using absl::Status;
+using common::hb_face_unique_ptr;
+using util::AutoSegmenterConfig;
+
+static Status Main(const std::vector<char*> args) {
+  std::string input_font_path = absl::GetFlag(FLAGS_input_font);
+  auto font_data = TRY(util::LoadFile(input_font_path.c_str()));
+  hb_face_unique_ptr font = font_data.face();
+
+  auto config = TRY(AutoSegmenterConfig::GenerateConfig(
+      font.get(), absl::GetFlag(FLAGS_primary_script)));
+
+  std::string output;
+  if (!google::protobuf::TextFormat::PrintToString(config, &output)) {
+    return absl::InternalError("Failed to format SegmenterConfig as textproto.");
+  }
+
+  std::cout << output;
+  return absl::OkStatus();
+}
+
+int main(int argc, char** argv) {
+  absl::SetStderrThreshold(absl::LogSeverityAtLeast::kInfo);
+  auto args = absl::ParseCommandLine(argc, argv);
+  absl::InitializeLog();
+
+  Status sc = Main(args);
+  if (!sc.ok()) {
+    std::cerr << "Error: " << sc << std::endl;
+    return -1;
+  }
+  return 0;
+}
diff --git a/util/load_codepoints_test.cc b/util/load_codepoints_test.cc
index 60b00542..2f77ab7b 100644
--- a/util/load_codepoints_test.cc
+++ b/util/load_codepoints_test.cc
@@ -168,9 +168,14 @@ TEST_F(LoadCodepointsTest, BuiltInFrequenciesList) {
   auto result = util::BuiltInFrequenciesList();
   ASSERT_TRUE(result.ok()) << result.status();
   EXPECT_FALSE(result->empty());
+
   EXPECT_TRUE(result->contains("Script_latin.riegeli"));
   EXPECT_FALSE((*result)["Script_latin.riegeli"].empty());
   EXPECT_TRUE((*result)["Script_latin.riegeli"].contains('Q'));
+
+  EXPECT_TRUE(result->contains("Script_japanese.riegeli@*"));
+  EXPECT_FALSE((*result)["Script_japanese.riegeli@*"].empty());
+  EXPECT_TRUE((*result)["Script_japanese.riegeli@*"].contains(0x304C /* が */));
 }
 
 }  // namespace util

From 0e6391fd01a5b84ec565ee9b3bd65bfa8181570f Mon Sep 17 00:00:00 2001
From: Garret Rieger <grieger@google.com>
Date: Thu, 5 Mar 2026 00:27:18 +0000
Subject: [PATCH 2/3] For the auto config generator add a customizable quality
 level.

Numeric value which controls the performance vs quality tradeoff. Lower values maximize performance, higher values maximize segmentation quality (at the cost of longer analysis times).
---
 util/auto_segmenter_config.cc              | 168 ++++++++++++++++++---
 util/auto_segmenter_config.h               |   3 +-
 util/auto_segmenter_config_test.cc         |  46 ++++--
 util/closure_glyph_keyed_segmenter_util.cc |  18 ++-
 util/generate_segmenter_config.cc          |  10 +-
 5 files changed, 202 insertions(+), 43 deletions(-)

diff --git a/util/auto_segmenter_config.cc b/util/auto_segmenter_config.cc
index f3f99f26..843404f0 100644
--- a/util/auto_segmenter_config.cc
+++ b/util/auto_segmenter_config.cc
@@ -27,6 +27,29 @@ namespace util {
 
 static constexpr uint32_t kMinimumGroupSize = 4;
 
+// Quality Table:
+// Quality | bigrams | find conditions | init brotli | non init brotli | init font merge threshold | opt cut off | preprocess merging | preprocess threshold
+// 1       | No      | No              | 0           | 0               | 60%                       | 5%          | Yes                | 5%
+// 2       | Yes     | No              | 0           | 0               | 55%                       | 4%          | Yes                | 4%
+// 3       | Yes     | Yes             | 0           | 0               | 50%                       | 3%          | Yes                | 3%
+// 4       | Yes     | Yes             | 0           | 9               | 45%                       | 2%          | Yes                | 2%
+// 5       | Yes     | Yes             | 9           | 9               | 40%                       | 1%          | Yes                | 1%
+// 6       | Yes     | Yes             | 9           | 11              | 30%                       | 0.5%        | Yes                | 0.5%
+// 7       | Yes     | Yes             | 11          | 11              | 25%                       | 0.5%        | Yes                | 0.05%
+// 8       | Yes     | Yes             | 11          | 11              | 25%                       | 0.5%        | No                 | na
+enum Quality {
+  MIN = 1, // Alias for ONE
+  ONE = 1,
+  TWO = 2,
+  THREE = 3,
+  FOUR = 4,
+  FIVE = 5,
+  SIX = 6,
+  SEVEN = 7,
+  EIGHT = 8,
+  MAX = 8, // Alias for EIGHT
+};
+
 // TODO(garretrieger): define a very basic set of quality levels first (see next TODO),
 //   start with just a lowest and highest to set the upper and lower bounds for quality
 //   settings (maybe also a mid point). To begin use number of codepoints to select quality
@@ -464,24 +487,140 @@ static Status ApplyPrimaryScript(
   return absl::OkStatus();
 }
 
+static void ApplyQualityLevelTo(Quality quality, HeuristicConfiguration& config) {
+  config.set_min_patch_size(2500);
+}
+
+static void ApplyQualityLevelTo(Quality quality, CostConfiguration& config) {
+  config.set_min_group_size(kMinimumGroupSize);
+
+  if (quality == ONE) {
+    config.set_use_bigrams(false);
+  } else {
+    config.set_use_bigrams(true);
+  }
+
+  switch (quality) {
+    case ONE: config.set_optimization_cutoff_fraction(0.05); break;
+    case TWO: config.set_optimization_cutoff_fraction(0.04); break;
+    case THREE: config.set_optimization_cutoff_fraction(0.03); break;
+    case FOUR: config.set_optimization_cutoff_fraction(0.02); break;
+    case FIVE: config.set_optimization_cutoff_fraction(0.01); break;
+    case SIX:
+    case SEVEN:
+    case EIGHT:
+    default: config.set_optimization_cutoff_fraction(0.005); break;
+  }
+}
+
+static void ApplyQualityLevelTo(Quality quality, MergeGroup& merge_group) {
+  if (merge_group.has_cost_config()) {
+    if (quality >= ONE && quality <= SEVEN) {
+      merge_group.set_preprocess_merging_group_size(kMinimumGroupSize);
+    } else {
+      merge_group.set_preprocess_merging_group_size(1);
+    }
+
+    switch (quality) {
+      case ONE: merge_group.set_preprocess_merging_probability_threshold(0.05); break;
+      case TWO: merge_group.set_preprocess_merging_probability_threshold(0.04); break;
+      case THREE: merge_group.set_preprocess_merging_probability_threshold(0.03); break;
+      case FOUR: merge_group.set_preprocess_merging_probability_threshold(0.02); break;
+      case FIVE: merge_group.set_preprocess_merging_probability_threshold(0.01); break;
+      case SIX: merge_group.set_preprocess_merging_probability_threshold(0.005); break;
+      case SEVEN: merge_group.set_preprocess_merging_probability_threshold(0.0005); break;
+      case EIGHT:
+      default: merge_group.clear_preprocess_merging_probability_threshold(); break;
+    }
+
+    if (merge_group.mutable_cost_config()->has_initial_font_merge_threshold()) {
+      switch (quality) {
+        case ONE: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.60); break;
+        case TWO: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.55); break;
+        case THREE: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.50); break;
+        case FOUR: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.45); break;
+        case FIVE: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.40); break;
+        case SIX: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.30); break;
+        case SEVEN:
+        case EIGHT:
+        default: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.25); break;
+      }
+    }
+  }
+}
+
+static void ApplyQualityLevelTo(Quality quality, SegmenterConfig& config) {
+  config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize);
+
+  if (quality == ONE || quality == TWO) {
+    config.set_unmapped_glyph_handling(MOVE_TO_INIT_FONT);
+  } else {
+    config.set_unmapped_glyph_handling(FIND_CONDITIONS);
+  }
+
+  switch (quality) {
+    case ONE:
+    case TWO:
+    case THREE:
+      config.set_brotli_quality(0);
+      break;
+    case FOUR:
+    case FIVE:
+      config.set_brotli_quality(9);
+      break;
+    case SIX:
+    case SEVEN:
+    case EIGHT:
+    default:
+      config.set_brotli_quality(11);
+      break;
+  }
+
+  switch (quality) {
+    case ONE:
+    case TWO:
+    case THREE:
+    case FOUR:
+      config.set_brotli_quality_for_initial_font_merging(0);
+      break;
+    case FIVE:
+    case SIX:
+      config.set_brotli_quality_for_initial_font_merging(9);
+      break;
+    case SEVEN:
+    case EIGHT:
+    default:
+      config.set_brotli_quality_for_initial_font_merging(11);
+      break;
+  }
+
+  ApplyQualityLevelTo(quality, *config.mutable_base_heuristic_config());
+  ApplyQualityLevelTo(quality, *config.mutable_base_cost_config());
+
+  for (auto& merge_group : *config.mutable_merge_groups()) {
+    ApplyQualityLevelTo(quality, merge_group);
+  }
+}
+
 absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
-    hb_face_t* face, std::optional<std::string> primary_script) {
+    hb_face_t* face, std::optional<std::string> primary_script, std::optional<int> quality_level) {
   SegmenterConfig config;
   config.set_generate_table_keyed_segments(true);
   config.set_generate_feature_segments(true);
-  config.set_unmapped_glyph_handling(FIND_CONDITIONS);
   config.set_condition_analysis_mode(CLOSURE_AND_DEP_GRAPH);
 
   auto* base_plan = config.mutable_base_segmentation_plan();
   base_plan->set_jump_ahead(2);
   base_plan->set_use_prefetch_lists(true);
 
-  config.mutable_ungrouped_config()->set_min_patch_size(2500);
-
   // Collect codepoints
   auto freq_list = TRY(BuiltInFrequenciesList());
   CodepointSet unicodes = FontHelper::ToCodepointsSet(face);
   uint32_t cp_count = unicodes.size();
+  Quality quality = cp_count > 2000 ? MIN : MAX;
+  if (quality_level.has_value() && quality_level.value() >= ONE && quality_level.value() <= MAX) {
+    quality = static_cast<Quality>(quality_level.value());
+  }
 
   // Detect scripts by intersection with frequency data
   btree_set<std::string> detected_scripts = DetectScripts(freq_list, unicodes);
@@ -491,18 +630,6 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
   // (including accounting for pairs only within merge groups), and then select
   // the cutoffs and premerging to keep the number of brotli ops within a
   // specific range.
-  auto* base_cost = config.mutable_base_cost_config();
-  base_cost->set_use_bigrams(true);
-  base_cost->set_min_group_size(
-      kMinimumGroupSize);  // as recommended by the spec.
-  config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize);
-  base_cost->set_optimization_cutoff_fraction(0.01);
-
-  if (cp_count > 2000) {
-    config.set_brotli_quality(9);
-  } else {
-    config.set_brotli_quality(11);
-  }
 
   TRYV(ApplyPrimaryScript(freq_list, primary_script.value_or("Script_latin"),
                           detected_scripts));
@@ -515,20 +642,15 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
     mg->set_name(ScriptName(script));
     auto* cost = mg->mutable_cost_config();
 
-    // TODO(garretrieger): use a heuristic to select probability threshold based
-    // on estimated number of brotli ops (assuming O(n^2) on codepoints in the
-    // group).
-    mg->set_preprocess_merging_group_size(kMinimumGroupSize);
-    mg->set_preprocess_merging_probability_threshold(0.001);
-
     cost->set_built_in_freq_data_name(script);
     if (script == primary_script_file) {
       // TODO(garretrieger): customize these values based on the quality level
       cost->set_initial_font_merge_threshold(-60);
-      cost->set_initial_font_merge_probability_threshold(0.40);
     }
   }
 
+  ApplyQualityLevelTo(quality, config);
+
   return config;
 }
 
diff --git a/util/auto_segmenter_config.h b/util/auto_segmenter_config.h
index 2874fd87..9c974a7d 100644
--- a/util/auto_segmenter_config.h
+++ b/util/auto_segmenter_config.h
@@ -21,7 +21,8 @@ class AutoSegmenterConfig {
   //                 Defaults to "Script_latin" if not provided.
   static absl::StatusOr<SegmenterConfig> GenerateConfig(
       hb_face_t* face,
-      std::optional<std::string> primary_script = std::nullopt);
+      std::optional<std::string> primary_script = std::nullopt,
+      std::optional<int> quality_level = std::nullopt);
 
   // Returns the base script for a given language.
   // For example, "Language_fr" -> "Script_latin".
diff --git a/util/auto_segmenter_config_test.cc b/util/auto_segmenter_config_test.cc
index c756a592..6a11725e 100644
--- a/util/auto_segmenter_config_test.cc
+++ b/util/auto_segmenter_config_test.cc
@@ -89,53 +89,49 @@ TEST_F(AutoSegmenterConfigTest, Roboto_UnspecifiedPrimary) {
   ASSERT_EQ(config_string, R"(unmapped_glyph_handling: FIND_CONDITIONS
 generate_table_keyed_segments: true
 brotli_quality: 11
+brotli_quality_for_initial_font_merging: 11
+base_heuristic_config {
+  min_patch_size: 2500
+}
 base_cost_config {
   use_bigrams: true
   min_group_size: 4
-  optimization_cutoff_fraction: 0.01
-}
-ungrouped_config {
-  min_patch_size: 2500
+  optimization_cutoff_fraction: 0.005
 }
 preprocess_merging_group_size_for_ungrouped: 4
 merge_groups {
   name: "Cyrillic"
-  preprocess_merging_group_size: 4
-  preprocess_merging_probability_threshold: 0.001
+  preprocess_merging_group_size: 1
   cost_config {
     built_in_freq_data_name: "Script_cyrillic.riegeli"
   }
 }
 merge_groups {
   name: "Greek"
-  preprocess_merging_group_size: 4
-  preprocess_merging_probability_threshold: 0.001
+  preprocess_merging_group_size: 1
   cost_config {
     built_in_freq_data_name: "Script_greek.riegeli"
   }
 }
 merge_groups {
   name: "Latin"
-  preprocess_merging_group_size: 4
-  preprocess_merging_probability_threshold: 0.001
+  preprocess_merging_group_size: 1
   cost_config {
     built_in_freq_data_name: "Script_latin.riegeli"
     initial_font_merge_threshold: -60
-    initial_font_merge_probability_threshold: 0.4
+    initial_font_merge_probability_threshold: 0.25
   }
 }
 merge_groups {
   name: "Symbols"
-  preprocess_merging_group_size: 4
-  preprocess_merging_probability_threshold: 0.001
+  preprocess_merging_group_size: 1
   cost_config {
     built_in_freq_data_name: "Script_symbols.riegeli"
   }
 }
 merge_groups {
   name: "Fallback"
-  preprocess_merging_group_size: 4
-  preprocess_merging_probability_threshold: 0.001
+  preprocess_merging_group_size: 1
   cost_config {
     built_in_freq_data_name: "fallback.riegeli"
   }
@@ -275,5 +271,25 @@ TEST_F(AutoSegmenterConfigTest, LanguageMappingsExist) {
   }
 }
 
+TEST_F(AutoSegmenterConfigTest, QualityLevelForcing) {
+  auto config_or = AutoSegmenterConfig::GenerateConfig(
+      face_.get(), std::nullopt, 1);
+  ASSERT_TRUE(config_or.ok()) << config_or.status();
+  EXPECT_EQ(config_or->brotli_quality(), 0);
+  EXPECT_EQ(config_or->unmapped_glyph_handling(), MOVE_TO_INIT_FONT);
+  EXPECT_EQ(config_or->base_cost_config().use_bigrams(), false);
+  EXPECT_EQ(config_or->brotli_quality_for_initial_font_merging(), 0);
+  EXPECT_EQ(config_or->base_cost_config().optimization_cutoff_fraction(), 0.05);
+
+  auto config_or_8 = AutoSegmenterConfig::GenerateConfig(
+      face_.get(), std::nullopt, 8);
+  ASSERT_TRUE(config_or_8.ok()) << config_or_8.status();
+  EXPECT_EQ(config_or_8->brotli_quality(), 11);
+  EXPECT_EQ(config_or_8->unmapped_glyph_handling(), FIND_CONDITIONS);
+  EXPECT_EQ(config_or_8->base_cost_config().use_bigrams(), true);
+  EXPECT_EQ(config_or_8->brotli_quality_for_initial_font_merging(), 11);
+  EXPECT_EQ(config_or_8->base_cost_config().optimization_cutoff_fraction(), 0.005);
+}
+
 }  // namespace
 }  // namespace util
diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc
index 09c19cb3..fabe08af 100644
--- a/util/closure_glyph_keyed_segmenter_util.cc
+++ b/util/closure_glyph_keyed_segmenter_util.cc
@@ -1,9 +1,9 @@
 #include <google/protobuf/text_format.h>
 
 #include <cstdint>
-#include <cstdio>
 #include <iostream>
 #include <vector>
+#include <chrono>
 
 #include "absl/container/btree_map.h"
 #include "absl/container/flat_hash_map.h"
@@ -43,6 +43,9 @@ ABSL_FLAG(
     "Path to a text proto file containing the configuration for the segmenter. "
     "Should contain a single SegmenterConfig message.");
 
+ABSL_FLAG(int, auto_config_quality, 0,
+          "The quality level to use when auto_config is enabled. A value of 0 means auto pick. Valid values are 1-8.");
+
 ABSL_FLAG(bool, auto_config, false,
           "If set the segmenter configuration will be automatically generated "
           "based on the input font.");
@@ -95,8 +98,12 @@ using util::SegmenterConfigUtil;
 
 static StatusOr<SegmenterConfig> LoadConfig(hb_face_t* font) {
   if (absl::GetFlag(FLAGS_auto_config)) {
+    std::optional<int> quality_level = std::nullopt;
+    if (absl::GetFlag(FLAGS_auto_config_quality) > 0) {
+      quality_level = absl::GetFlag(FLAGS_auto_config_quality);
+    }
     return AutoSegmenterConfig::GenerateConfig(
-        font, absl::GetFlag(FLAGS_primary_script));
+        font, absl::GetFlag(FLAGS_primary_script), quality_level);
   }
 
   FontData config_text =
@@ -143,7 +150,7 @@ static Status Analysis(hb_face_t* font,
     group_index++;
   }
 
-  std::cerr << "total_cost_across_groups = " << overall_cost << std::endl;
+  std::cerr << "total_cost_across_groups = " << (uint64_t) overall_cost << std::endl;
 
   return absl::OkStatus();
 }
@@ -224,8 +231,13 @@ static Status Main(const std::vector<char*> args) {
   ClosureGlyphSegmenter segmenter(
       config.brotli_quality(), config.brotli_quality_for_initial_font_merging(),
       config.unmapped_glyph_handling(), config.condition_analysis_mode());
+
+  auto start_time = std::chrono::steady_clock::now();  // monotonic clock
   GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments(
       font.get(), init_segment, segments, merge_groups));
+  auto end_time = std::chrono::steady_clock::now();
+  std::chrono::duration<double> duration = end_time - start_time;
+  std::cerr << "CodepointToGlyphSegments took: " << duration.count() << " seconds" << std::endl;
 
   if (absl::GetFlag(FLAGS_output_segmentation_plan)) {
     SegmentationPlan plan = segmentation.ToSegmentationPlanProto();
diff --git a/util/generate_segmenter_config.cc b/util/generate_segmenter_config.cc
index d0fa8133..7af04759 100644
--- a/util/generate_segmenter_config.cc
+++ b/util/generate_segmenter_config.cc
@@ -20,6 +20,9 @@ ABSL_FLAG(std::string, input_font, "in.ttf",
 ABSL_FLAG(std::string, primary_script, "Script_latin",
           "The primary script or language frequency data file to use.");
 
+ABSL_FLAG(int, quality, 0,
+          "The quality level to use. A value of 0 means auto pick. Valid values are 1-8.");
+
 using absl::Status;
 using common::hb_face_unique_ptr;
 using util::AutoSegmenterConfig;
@@ -29,8 +32,13 @@ static Status Main(const std::vector<char*> args) {
   auto font_data = TRY(util::LoadFile(input_font_path.c_str()));
   hb_face_unique_ptr font = font_data.face();
 
+  std::optional<int> quality_level = std::nullopt;
+  if (absl::GetFlag(FLAGS_quality) > 0) {
+    quality_level = absl::GetFlag(FLAGS_quality);
+  }
+
   auto config = TRY(AutoSegmenterConfig::GenerateConfig(
-      font.get(), absl::GetFlag(FLAGS_primary_script)));
+      font.get(), absl::GetFlag(FLAGS_primary_script), quality_level));
 
   std::string output;
   if (!google::protobuf::TextFormat::PrintToString(config, &output)) {

From dc6c3a8d92a0fc930c2b9cdac2c01f82924095a5 Mon Sep 17 00:00:00 2001
From: Garret Rieger <grieger@google.com>
Date: Fri, 6 Mar 2026 21:50:59 +0000
Subject: [PATCH 3/3] Add the auto segmenter config to font2ift.

This allows font2ift to perform the full IFT encoding process:
1. Auto generate segmenter config.
2. Run segmenter.
3. Compile the font.

If a segmentation plan is not supplied to font2ift, it will use the segmenter auto config and the closure segmenter to generate one.
---
 README.md                                  | 112 ++++++++++++++++-----
 ift/encoder/closure_glyph_segmenter.cc     |  33 ++++++
 ift/encoder/closure_glyph_segmenter.h      |   8 ++
 util/BUILD                                 |  20 ++++
 util/auto_config_flags.cc                  |  15 +++
 util/auto_config_flags.h                   |  11 ++
 util/auto_segmenter_config.cc              |  31 +++---
 util/auto_segmenter_config.h               |   5 +
 util/closure_glyph_keyed_segmenter_util.cc |  92 ++++-------------
 util/font2ift.cc                           |  76 ++++++++++----
 util/segmenter_config_util.cc              |  40 ++++++++
 util/segmenter_config_util.h               |  12 +++
 12 files changed, 327 insertions(+), 128 deletions(-)
 create mode 100644 util/auto_config_flags.cc
 create mode 100644 util/auto_config_flags.h

diff --git a/README.md b/README.md
index 6b2549ff..bd000300 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ script:
 ## Documentation
 
 The documents under [docs/experimental](docs/experimental) provide some more detailed designs of various aspects of the IFT encoder. Of note:
-* [compiler.md](docs/experimental)
+* [compiler.md](docs/experimental/compiler.md)
 * [closure_glyph_segmentation.md](docs/experimental/closure_glyph_segmentation.md)
 * [closure_glyph_segmentation_merging.md](docs/experimental/closure_glyph_segmentation_merging.md)
 * [closure_glyph_segmentation_complex_conditions.md](docs/experimental/closure_glyph_segmentation_complex_conditions.md)
@@ -77,13 +77,81 @@ bazel run @hedron_compile_commands//:refresh_all
 
 Will generate a compile_commands.json file.
 
-## Producing IFT Encoded Fonts
+## Producing IFT Encoded Fonts (with Auto Config)
 
-IFT encoded fonts are produced in two steps:
-1. A segmentation plan is generated which specifies how the font file should be split up in the IFT encoding.
-2. The IFT encoded font and patches are compiled by the Compiler sub module using the segmentation plan.
+The simplest way to create IFT fonts is via the `font2ift` utility utilizing the auto configuration mode.
+This is done by running the utility and not providing a segmentation plan. Example invocation:
 
-### Generating Segmentation Plan
+```bash
+bazel run -c opt @ift_encoder//util:font2ift -- \
+  --input_font="$HOME/fonts/myfont/MyFont.ttf" \
+  --output_path=$HOME/fonts/myfont/ift/ \
+  --output_font="MyFont-IFT.woff2"
+```
+
+This will analyze the input font, decide how to segment it, and then produce the final IFT encoded font
+and patches.
+
+When utilizing auto config there are two optional flags which can be used to adjust the behaviour:
+* `--auto_config_primary_script`: this tells the config generator which language/script the font is intended
+  to be used with. It has two effects: first the codepoints of the primary script are eligible to be moved
+  into the initial font. Second for scripts with large overlaps, such as CJK, primary script selects which
+  of the overlapping scripts to use frequency data from. Values refer to frequency data files in
+  [ift-encoder-data](https://github.com/w3c/ift-encoder-data/tree/main/data). Example values: "Script_bengali",
+  "Language_fr"
+
+* `--auto_config_quality`: This is analogous to a quality level in a compression library. It controls how much
+  effort is spent to improve the efficiency of the final IFT font. Values range from 1 to 8, where higher
+  values increase encoding times but typically result in a more efficient end IFT font (i.e. fewer bytes
+  transferred by clients using it).
+
+Example command line with optional flags:
+
+```bash
+bazel run -c opt @ift_encoder//util:font2ift -- \
+  --input_font="$HOME/fonts/NotoSansJP-Regular.otf" \
+  --output_path=$HOME/fonts/ift/ \
+  --output_font="NotoSansJP-Regular-IFT.woff2" \
+  --auto_config_primary_script=Script_japanese \
+  --auto_config_quality=3
+```
+
+*Note: the auto configuration mode is still under development, in particular the auto selection of quality level
+is currently quite simplistic. It's expected to continue to evolve from its current state.*
+
+## Producing IFT Encoded Fonts (Advanced)
+
+Under the hood IFT font encoding happens in three stages:
+
+1. Generate or write a segmenter config for the font.
+2. Generate a segmentation plan, which describes how the font is split into patches. Takes the segmenter config as an input.
+3. Compile the final IFT encoded font following the segmentation plan.
+
+For more advanced use cases these steps can be performed individually. This allows the segmenter config
+and segmentation plans to be fine tuned beyond what auto configuration is capable of.
+
+### Step 1: Generating a Segmenter Config
+
+There are two main options for generating a segmenter config:
+
+1. Write the config by hand. The segmenter is configured via an input configuration file using the
+    [segmenter_config.proto](util/segmenter_config.proto) schema, see the comments there for more details.
+    This option is useful when maximum control over segmentation parameters is needed, or custom frequency
+    data is being supplied.
+
+2. Auto generate the segmenter config using `util:generate_segmenter_config`.
+
+   ```
+   CC=clang bazel run //util:generate_segmenter_config -- \
+     --quality=5 \
+     --input_font=$HOME/MyFont.ttf > config.txtpb
+   ```
+
+   This analyzes the input font and tries to pick appropriate config values automatically. As discussed in
+   the previous "Producing IFT Encoded Fonts" section there is a configurable quality level. If needed
+   the auto generated config can be hand tweaked after generation.
+
+### Step 2: Generating Segmentation Plan
 
 Segmentation plans are in a [textproto format](https://protobuf.dev/reference/protobuf/textformat-spec/) using the
 [segmentation_plan.proto](util/segmentation_plan.proto) schema. See the comments in the schema file for more information.
@@ -93,17 +161,9 @@ possible to write plans by hand, or develop new utilities to generate plans.
 
 In this repo 3 options are currently provided:
 
-1.  `util/generate_table_keyed_config`: this utility generates the table keyed (extension segments that augment non
-    glyph data in the font) portion of a plan. Example execution:
-
-    ```sh
-    bazel run -c opt util:generate_table_keyed_config -- \
-      --font=$(pwd)/myfont.ttf \
-      latin.txt cyrillic.txt greek.txt > table_keyed.txtpb
-    ```
-
-2.  `util/closure_glyph_keyed_segmenter_util`: this utility uses a subsetting closure based approach to generate a glyph
-    keyed segmentation plan (extension segments that augment glyph data). Example execution:
+1. [Recommended] `util/closure_glyph_keyed_segmenter_util`: this utility uses a subsetting closure based approach
+    to generate a glyph keyed segmentation plan (extension segments that augment glyph data). It can optionally
+    generate the table keyed portion of the config as well. Example execution:
 
     ```sh
     bazel run -c opt util:closure_glyph_keyed_segmenter_util  -- \
@@ -119,6 +179,15 @@ In this repo 3 options are currently provided:
     Note: this utility is under active development and still very experimental. See
     [the status section](docs/experimental/closure_glyph_segmentation.md#status) for more details.
 
+2.  `util/generate_table_keyed_config`: this utility generates the table keyed (extension segments that augment non
+    glyph data in the font) portion of a plan. Example execution:
+
+    ```sh
+    bazel run -c opt util:generate_table_keyed_config -- \
+      --font=$(pwd)/myfont.ttf \
+      latin.txt cyrillic.txt greek.txt > table_keyed.txtpb
+    ```
+
 3.  `util/iftb2config`: this utility converts a segmentation obtained from the
     [binned incremental font transfer prototype](https://github.com/adobe/binned-ift-reference)
     into and equivalent segmentation plan. Example execution:
@@ -128,23 +197,20 @@ In this repo 3 options are currently provided:
       bazel run util:iftb2config > segmentation_plan.txtpb
     ```
 
-If seperate glyph keyed and table keyed configs were generated using #1 and #2 they can then be combined into one
+If separate glyph keyed and table keyed configs were generated using #1 and #2 they can then be combined into one
 complete plan by concatenating them:
 
 ```sh
 cat glyph_keyed.txtpb table_keyed.txtpb > segmentation_plan.txtpb
 ```
 
-Additional tools for generating encoder configs are planned to be added in the future.
-
 For concrete examples of how to generate IFT fonts, see the [IFT Demo](https://github.com/garretrieger/ift-demo).
 In particular the [Makefile](https://github.com/garretrieger/ift-demo/blob/main/Makefile) and the
 [segmenter configs](https://github.com/garretrieger/ift-demo/tree/main/config) may be helpful.
 
-### Generating an IFT Encoding
+### Step 3: Generating an IFT Encoding
 
-Once an segmentation plan has been created it can be combined with the target font to produce and incremental font and collection
-of associated patches using the font2ift utility which is a wrapper around the compiler. Example execution:
+Once a segmentation plan has been created it can be combined with the target font to produce an incremental font and collection of associated patches using the font2ift utility which is a wrapper around the compiler. Example execution:
 
 ```sh
 bazel -c opt run util:font2ift  -- \
diff --git a/ift/encoder/closure_glyph_segmenter.cc b/ift/encoder/closure_glyph_segmenter.cc
index a83d5daf..72227678 100644
--- a/ift/encoder/closure_glyph_segmenter.cc
+++ b/ift/encoder/closure_glyph_segmenter.cc
@@ -734,4 +734,37 @@ Status ClosureGlyphSegmenter::FallbackCost(
   return absl::OkStatus();
 }
 
+void ClosureGlyphSegmenter::AddTableKeyedSegments(  // Appends one table keyed segment per merge group to plan.
+    SegmentationPlan& plan,  // modified in place: gains new segments and non_glyph_segments entries
+    const btree_map<SegmentSet, MergeStrategy>& merge_groups,  // only the keys (sets of segment indices) are used
+    const std::vector<SubsetDefinition>& segments,  // indexed by the values in each merge group key
+    const SubsetDefinition& init_segment) {  // subtracted from every generated segment
+  std::vector<SubsetDefinition> table_keyed_segments;
+  for (const auto& [segment_ids, _] : merge_groups) {
+    SubsetDefinition new_segment;  // union of all segments belonging to this merge group
+    for (uint32_t s : segment_ids) {
+      new_segment.Union(segments.at(s));
+    }
+    new_segment.Subtract(init_segment);
+    table_keyed_segments.push_back(new_segment);
+  }
+
+  uint32_t max_id = 0;  // largest segment id already present in the plan (0 if there are none)
+  for (const auto& [id, _] : plan.segments()) {
+    if (id > max_id) {
+      max_id = id;
+    }
+  }
+
+  uint32_t next_id = max_id + 1;  // new ids are allocated after the existing maximum
+  auto* plan_segments = plan.mutable_segments();
+  for (const SubsetDefinition& def : table_keyed_segments) {
+    GlyphSegmentation::SubsetDefinitionToSegment(def,
+                                                 (*plan_segments)[next_id]);  // operator[] creates the map entry
+    SegmentsProto* segment_ids = plan.add_non_glyph_segments();  // each new segment gets its own single-id entry
+    segment_ids->add_values(next_id);
+    next_id++;
+  }
+}
+
 }  // namespace ift::encoder
diff --git a/ift/encoder/closure_glyph_segmenter.h b/ift/encoder/closure_glyph_segmenter.h
index 09338559..d29e074c 100644
--- a/ift/encoder/closure_glyph_segmenter.h
+++ b/ift/encoder/closure_glyph_segmenter.h
@@ -4,6 +4,7 @@
 #include <optional>
 #include <vector>
 
+#include "absl/container/btree_map.h"
 #include "absl/status/statusor.h"
 #include "ift/encoder/glyph_segmentation.h"
 #include "ift/encoder/merge_strategy.h"
@@ -11,6 +12,7 @@
 #include "ift/encoder/subset_definition.h"
 #include "ift/freq/probability_calculator.h"
 #include "util/common.pb.h"
+#include "util/segmentation_plan.pb.h"
 #include "util/segmenter_config.pb.h"
 
 namespace ift::encoder {
@@ -89,6 +91,12 @@ class ClosureGlyphSegmenter {
                             uint32_t& fallback_glyphs_size,
                             uint32_t& all_glyphs_size) const;
 
+  static void AddTableKeyedSegments(
+      SegmentationPlan& plan,
+      const absl::btree_map<common::SegmentSet, MergeStrategy>& merge_groups,
+      const std::vector<SubsetDefinition>& segments,
+      const SubsetDefinition& init_segment);
+
  private:
   uint32_t brotli_quality_;
   uint32_t init_font_merging_brotli_quality_;
diff --git a/util/BUILD b/util/BUILD
index e46fec96..0dcb17d6 100644
--- a/util/BUILD
+++ b/util/BUILD
@@ -64,9 +64,15 @@ cc_binary(
     srcs = [
         "font2ift.cc",
     ],
+    data = [
+        "@ift_encoder_data//:freq_data",
+    ],
     deps = [
+        ":auto_config_flags",
+        ":auto_segmenter_config",
         ":load_codepoints",
         ":segmentation_plan_cc_proto",
+        ":segmenter_config_util",
         "//common",
         "//ift",
         "//ift/encoder",
@@ -76,6 +82,7 @@ cc_binary(
         "@abseil-cpp//absl/status:statusor",
         "@abseil-cpp//absl/strings",
         "@harfbuzz",
+        "//util:segmenter_config_cc_proto",
     ],
 )
 
@@ -103,6 +110,7 @@ cc_binary(
         "@ift_encoder_data//:freq_data",
     ],
     deps = [
+        ":auto_config_flags",
         ":auto_segmenter_config",
         ":load_codepoints",
         ":segmentation_plan_cc_proto",
@@ -138,6 +146,16 @@ cc_binary(
     ],
 )
 
+cc_library(
+    name = "auto_config_flags",
+    srcs = ["auto_config_flags.cc"],
+    hdrs = ["auto_config_flags.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        "@abseil-cpp//absl/flags:flag",
+    ],
+)
+
 cc_library(
     name = "convert_iftb",
     srcs = [
@@ -203,10 +221,12 @@ cc_library(
     ],
     deps = [
         ":load_codepoints",
+        ":segmentation_plan_cc_proto",
         ":segmenter_config_cc_proto",
         "//common",
         "//ift/encoder",
         "@abseil-cpp//absl/status:statusor",
+        "@harfbuzz",
     ],
 )
 
diff --git a/util/auto_config_flags.cc b/util/auto_config_flags.cc
new file mode 100644
index 00000000..ab35f9f5
--- /dev/null
+++ b/util/auto_config_flags.cc
@@ -0,0 +1,15 @@
+#include "util/auto_config_flags.h"
+
+#include <string>
+
+#include "absl/flags/flag.h"
+
+ABSL_FLAG(int, auto_config_quality, 0,
+          "The quality level to use when generating a segmenter config. A value of 0 "
+          "means auto pick. Valid values are 1-8.");
+
+ABSL_FLAG(std::string, auto_config_primary_script, "Script_latin",  // consumed by the auto config code paths
+          "When auto generating a segmenter config this sets the primary script or "
+          "language frequency data file to use. "
+          "The primary script is eligible to have codepoints moved to the init font. "
+          "For CJK primary script can be used to specialize against a specific language/script.");
diff --git a/util/auto_config_flags.h b/util/auto_config_flags.h
new file mode 100644
index 00000000..4f158361
--- /dev/null
+++ b/util/auto_config_flags.h
@@ -0,0 +1,11 @@
+#ifndef UTIL_AUTO_CONFIG_FLAGS_H_
+#define UTIL_AUTO_CONFIG_FLAGS_H_
+
+#include <string>
+
+#include "absl/flags/declare.h"
+
+ABSL_DECLARE_FLAG(int, auto_config_quality);
+ABSL_DECLARE_FLAG(std::string, auto_config_primary_script);
+
+#endif  // UTIL_AUTO_CONFIG_FLAGS_H_
diff --git a/util/auto_segmenter_config.cc b/util/auto_segmenter_config.cc
index 843404f0..b17b465a 100644
--- a/util/auto_segmenter_config.cc
+++ b/util/auto_segmenter_config.cc
@@ -2,7 +2,6 @@
 
 #include <cctype>
 #include <string>
-#include <unordered_map>
 
 #include "absl/container/flat_hash_set.h"
 #include "absl/log/log.h"
@@ -50,12 +49,6 @@ enum Quality {
   MAX = 8, // Alias for EIGHT
 };
 
-// TODO(garretrieger): define a very basic set of quality levels first (see next TODO),
-//   start with just a lowest and highest to set the upper and lower bounds for quality
-//   settings (maybe also a mid point). To begin use number of codepoints to select quality
-//   level. Do some testing on segmentation times at low and high to get a sense of
-//   how times are impacted.
-
 // TODO(garretrieger): do something analagous to brotli quality levels
 // where we define a series of levels which correspond to a set of
 // values for the quality/performance tradeoff settings (including setting the
@@ -291,7 +284,7 @@ StatusOr<std::string> AutoSegmenterConfig::GetBaseScriptForLanguage(
   }
 
   static const auto* lang_to_script =
-      new std::unordered_map<std::string, std::string>{
+      new flat_hash_map<std::string, std::string> {
           {"Language_af", "Script_latin"},
           {"Language_ak", "Script_latin"},
           {"Language_am", "Script_ethiopic"},
@@ -602,7 +595,7 @@ static void ApplyQualityLevelTo(Quality quality, SegmenterConfig& config) {
   }
 }
 
-absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
+StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
     hb_face_t* face, std::optional<std::string> primary_script, std::optional<int> quality_level) {
   SegmenterConfig config;
   config.set_generate_table_keyed_segments(true);
@@ -617,9 +610,22 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
   auto freq_list = TRY(BuiltInFrequenciesList());
   CodepointSet unicodes = FontHelper::ToCodepointsSet(face);
   uint32_t cp_count = unicodes.size();
-  Quality quality = cp_count > 2000 ? MIN : MAX;
-  if (quality_level.has_value() && quality_level.value() >= ONE && quality_level.value() <= MAX) {
+
+  // TODO(garretrieger): more sophisticated scheme for auto picking quality level.
+  // roughly we want to estimate the expected cost of each quality level and pick
+  // based on that.
+  Quality quality = THREE;  // auto pick for fonts with more than 3000 codepoints
+  if (cp_count <= 1000) {
+    quality = MAX;
+  } else if (cp_count <= 3000) {
+    quality = SIX;  // set the auto pick only; quality_level must keep holding the caller's explicit request
+  }
+
+  if (quality_level.has_value() && quality_level.value() >= MIN && quality_level.value() <= MAX) {
     quality = static_cast<Quality>(quality_level.value());
+    VLOG(0) << "Using specified quality level for segmenting: " << quality;
+  } else {
+    VLOG(0) << "Quality level unspecified, auto picked: " << quality;
   }
 
   // Detect scripts by intersection with frequency data
@@ -644,7 +650,6 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
 
     cost->set_built_in_freq_data_name(script);
     if (script == primary_script_file) {
-      // TODO(garretrieger): customize these values based on the quality level
       cost->set_initial_font_merge_threshold(-60);
     }
   }
@@ -654,4 +659,4 @@ absl::StatusOr<SegmenterConfig> AutoSegmenterConfig::GenerateConfig(
   return config;
 }
 
-}  // namespace util
+}  // namespace util
\ No newline at end of file
diff --git a/util/auto_segmenter_config.h b/util/auto_segmenter_config.h
index 9c974a7d..ba00a9a5 100644
--- a/util/auto_segmenter_config.h
+++ b/util/auto_segmenter_config.h
@@ -19,6 +19,11 @@ class AutoSegmenterConfig {
   // primary_script: an optional name of a script or language frequency data
   //                 file (e.g., "Script_cyrillic", "Language_fr").
   //                 Defaults to "Script_latin" if not provided.
+  //
+  // quality_level: ranges from 1-8, sets the tradeoff between segmenting time
+  //                and segmentation quality. Lower values have shorter
+  //                segmenting times; higher values take longer but typically
+  //                result in better segmentation quality.
   static absl::StatusOr<SegmenterConfig> GenerateConfig(
       hb_face_t* face,
       std::optional<std::string> primary_script = std::nullopt,
diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc
index fabe08af..3a26982e 100644
--- a/util/closure_glyph_keyed_segmenter_util.cc
+++ b/util/closure_glyph_keyed_segmenter_util.cc
@@ -23,6 +23,7 @@
 #include "ift/encoder/merge_strategy.h"
 #include "ift/encoder/subset_definition.h"
 #include "ift/freq/unicode_frequencies.h"
+#include "util/auto_config_flags.h"
 #include "util/auto_segmenter_config.h"
 #include "util/load_codepoints.h"
 #include "util/segmentation_plan.pb.h"
@@ -39,20 +40,11 @@ ABSL_FLAG(std::string, input_font, "in.ttf",
           "Name of the font to convert to IFT.");
 
 ABSL_FLAG(
-    std::string, config, "config.textpb",
+    std::string, config, "auto",
     "Path to a text proto file containing the configuration for the segmenter. "
-    "Should contain a single SegmenterConfig message.");
-
-ABSL_FLAG(int, auto_config_quality, 0,
-          "The quality level to use when auto_config is enabled. A value of 0 means auto pick. Valid values are 1-8.");
-
-ABSL_FLAG(bool, auto_config, false,
-          "If set the segmenter configuration will be automatically generated "
-          "based on the input font.");
-
-ABSL_FLAG(std::string, primary_script, "Script_latin",
-          "When auto_config is enabled this sets the primary script or "
-          "language frequency data file to use.");
+    "Should contain a single SegmenterConfig message. If set to \"auto\", then "
+    "segmenter configuration will be automatically generated "
+    "based on the input font.");
 
 ABSL_FLAG(bool, output_segmentation_plan, false,
           "If set a segmentation plan representing the determined segmentation "
@@ -97,13 +89,13 @@ using util::AutoSegmenterConfig;
 using util::SegmenterConfigUtil;
 
 static StatusOr<SegmenterConfig> LoadConfig(hb_face_t* font) {
-  if (absl::GetFlag(FLAGS_auto_config)) {
+  if (absl::GetFlag(FLAGS_config) == "auto") {
     std::optional<int> quality_level = std::nullopt;
     if (absl::GetFlag(FLAGS_auto_config_quality) > 0) {
       quality_level = absl::GetFlag(FLAGS_auto_config_quality);
     }
     return AutoSegmenterConfig::GenerateConfig(
-        font, absl::GetFlag(FLAGS_primary_script), quality_level);
+        font, absl::GetFlag(FLAGS_auto_config_primary_script), quality_level);
   }
 
   FontData config_text =
@@ -155,39 +147,6 @@ static Status Analysis(hb_face_t* font,
   return absl::OkStatus();
 }
 
-static void AddTableKeyedSegments(
-    SegmentationPlan& plan,
-    const btree_map<SegmentSet, MergeStrategy>& merge_groups,
-    const std::vector<SubsetDefinition>& segments,
-    const SubsetDefinition& init_segment) {
-  std::vector<SubsetDefinition> table_keyed_segments;
-  for (const auto& [segment_ids, _] : merge_groups) {
-    SubsetDefinition new_segment;
-    for (uint32_t s : segment_ids) {
-      new_segment.Union(segments.at(s));
-    }
-    new_segment.Subtract(init_segment);
-    table_keyed_segments.push_back(new_segment);
-  }
-
-  uint32_t max_id = 0;
-  for (const auto& [id, _] : plan.segments()) {
-    if (id > max_id) {
-      max_id = id;
-    }
-  }
-
-  uint32_t next_id = max_id + 1;
-  auto* plan_segments = plan.mutable_segments();
-  for (const SubsetDefinition& def : table_keyed_segments) {
-    GlyphSegmentation::SubsetDefinitionToSegment(def,
-                                                 (*plan_segments)[next_id]);
-    SegmentsProto* segment_ids = plan.add_non_glyph_segments();
-    segment_ids->add_values(next_id);
-    next_id++;
-  }
-}
-
 static Status OutputFallbackGlyphCount(hb_face_t* original_face,
                                        const ClosureGlyphSegmenter& segmenter,
                                        const GlyphSegmentation& segmentation) {
@@ -216,49 +175,29 @@ static Status Main(const std::vector<char*> args) {
   SegmenterConfig config = TRY(LoadConfig(font.get()));
 
   SegmenterConfigUtil config_util(
-      absl::GetFlag(FLAGS_auto_config) ? "" : absl::GetFlag(FLAGS_config));
-
-  CodepointSet font_codepoints = FontHelper::ToCodepointsSet(font.get());
-  btree_set<hb_tag_t> font_features = FontHelper::GetFeatureTags(font.get());
-  SubsetDefinition init_segment =
-      config_util.SegmentProtoToSubsetDefinition(config.initial_segment());
-
-  std::vector<SubsetDefinition> segments;
-  btree_map<SegmentSet, MergeStrategy> merge_groups =
-      TRY(config_util.ConfigToMergeGroups(config, font_codepoints,
-                                          font_features, segments));
-
-  ClosureGlyphSegmenter segmenter(
-      config.brotli_quality(), config.brotli_quality_for_initial_font_merging(),
-      config.unmapped_glyph_handling(), config.condition_analysis_mode());
+      (absl::GetFlag(FLAGS_config) == "auto") ? "" : absl::GetFlag(FLAGS_config));
 
   auto start_time = std::chrono::high_resolution_clock::now();
-  GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments(
-      font.get(), init_segment, segments, merge_groups));
+  auto result = TRY(config_util.RunSegmenter(font.get(), config));
   auto end_time = std::chrono::high_resolution_clock::now();
   std::chrono::duration<double> duration = end_time - start_time;
   std::cerr << "CodepointToGlyphSegments took: " << duration.count() << " seconds" << std::endl;
 
+  GlyphSegmentation segmentation = std::move(result.segmentation);
+  SegmentationPlan plan = std::move(result.plan);
+
   if (absl::GetFlag(FLAGS_output_segmentation_plan)) {
-    SegmentationPlan plan = segmentation.ToSegmentationPlanProto();
     if (!absl::GetFlag(FLAGS_include_initial_codepoints_in_config)) {
       // Requested to not include init codepoints in the generated config.
       plan.clear_initial_codepoints();
     }
 
-    if (config.generate_table_keyed_segments()) {
-      AddTableKeyedSegments(plan, merge_groups, segments, init_segment);
-    }
-
-    SegmentationPlan combined = config.base_segmentation_plan();
-    combined.MergeFrom(plan);
-
     // TODO(garretrieger): assign a basic (single segment) table keyed config.
     // Later on the input to this util should include information on how the
     // segments should be grouped together for the table keyed portion of the
     // font.
     std::string config_string;
-    TextFormat::PrintToString(combined, &config_string);
+    TextFormat::PrintToString(plan, &config_string);
     std::cout << config_string;
   } else {
     // No config requested, just output a simplified plain text representation
@@ -267,6 +206,9 @@ static Status Main(const std::vector<char*> args) {
   }
 
   if (absl::GetFlag(FLAGS_output_fallback_glyph_count)) {
+    ClosureGlyphSegmenter segmenter(
+        config.brotli_quality(), config.brotli_quality_for_initial_font_merging(),
+        config.unmapped_glyph_handling(), config.condition_analysis_mode());
     TRYV(OutputFallbackGlyphCount(font.get(), segmenter, segmentation));
   }
 
@@ -275,7 +217,7 @@ static Status Main(const std::vector<char*> args) {
   }
 
   std::cerr << ">> Analysis" << std::endl;
-  return Analysis(font.get(), merge_groups, segmentation);
+  return Analysis(font.get(), result.merge_groups, segmentation);
 }
 
 int main(int argc, char** argv) {
diff --git a/util/font2ift.cc b/util/font2ift.cc
index eb2564cf..6c543cb9 100644
--- a/util/font2ift.cc
+++ b/util/font2ift.cc
@@ -9,6 +9,8 @@
 #include "absl/container/flat_hash_map.h"
 #include "absl/flags/flag.h"
 #include "absl/flags/parse.h"
+#include "absl/log/globals.h"
+#include "absl/log/initialize.h"
 #include "absl/status/statusor.h"
 #include "absl/strings/str_cat.h"
 #include "common/axis_range.h"
@@ -21,23 +23,30 @@
 #include "ift/encoder/compiler.h"
 #include "ift/encoder/glyph_segmentation.h"
 #include "ift/encoder/subset_definition.h"
+#include "util/auto_config_flags.h"
+#include "util/auto_segmenter_config.h"
 #include "util/load_codepoints.h"
 #include "util/segmentation_plan.pb.h"
+#include "util/segmenter_config.pb.h"
+#include "util/segmenter_config_util.h"
 
 /*
- * Utility that converts a standard font file into an IFT font file following a
+ * Utility that converts a standard font file into an IFT font file optionally following a
  * supplied segmentation plan.
  *
  * Configuration is provided as a textproto file following the
  * segmentation_plan.proto schema.
+ *
+ * If no configuration is supplied it will be auto generated.
  */
 
 ABSL_FLAG(std::string, input_font, "in.ttf",
           "Name of the font to convert to IFT.");
 
-ABSL_FLAG(std::string, plan, "",
+ABSL_FLAG(std::string, plan, "auto",
           "Path to a plan file which is a textproto following the "
-          "segmentation_plan.proto schema.");
+          "segmentation_plan.proto schema. If set to \"auto\", then "
+          "segmentation plan will be automatically generated.");
 
 ABSL_FLAG(std::string, output_path, "./",
           "Path to write output files under (base font and patches).");
@@ -50,6 +59,10 @@ ABSL_FLAG(bool, woff2_encode, true,
           "in woff2 will be disabled when necessary to keep the woff2 encoding "
           "compatible with IFT.");
 
+ABSL_FLAG(
+    int, verbosity, 0,  // forwarded to absl::SetGlobalVLogLevel() in main()
+    "Log verbosity level. 0 is least verbose, higher values log more.");
+
 using absl::btree_set;
 using absl::flat_hash_map;
 using absl::Status;
@@ -68,6 +81,7 @@ using ift::encoder::Compiler;
 using ift::encoder::design_space_t;
 using ift::encoder::GlyphSegmentation;
 using ift::encoder::SubsetDefinition;
+using util::AutoSegmenterConfig;
 
 // TODO(garretrieger): add check that all glyph patches have at least one
 // activation condition.
@@ -258,22 +272,44 @@ Status ConfigureCompiler(SegmentationPlan plan, Compiler& compiler) {
   return absl::OkStatus();
 }
 
-int main(int argc, char** argv) {
-  auto args = absl::ParseCommandLine(argc, argv);
+StatusOr<SegmentationPlan> CreateSegmentationPlan(hb_face_t* font) {  // Auto generates the plan or parses it from the --plan file.
+  SegmentationPlan plan;
+  if (absl::GetFlag(FLAGS_plan).empty() || absl::GetFlag(FLAGS_plan) == "auto") {  // empty and "auto" both select auto generation
+    std::cerr << ">> auto generating segmentation plan:" << std::endl;
+    std::optional<int> quality_level = std::nullopt;  // stays unset unless the flag is > 0
+    if (absl::GetFlag(FLAGS_auto_config_quality) > 0) {
+      quality_level = absl::GetFlag(FLAGS_auto_config_quality);
+    }
+    auto config = AutoSegmenterConfig::GenerateConfig(
+        font, absl::GetFlag(FLAGS_auto_config_primary_script), quality_level);
+    if (!config.ok()) {
+      return absl::InternalError(StrCat("Failed to generate config: ", config.status().message()));
+    }
+    util::SegmenterConfigUtil config_util("");  // "" is the config file path; the config was generated, not loaded
+    auto result = config_util.RunSegmenter(font, *config);
+    if (!result.ok()) {
+      return absl::InternalError(StrCat("Failed to run segmenter: ", result.status().message()));
+    }
+    plan = std::move(result->plan);  // only the plan is kept from the segmenter result
+  } else {
+    auto config_text = util::LoadFile(absl::GetFlag(FLAGS_plan).c_str());  // --plan is treated as a file path here
+    if (!config_text.ok()) {
+      return absl::InternalError(StrCat("Failed to load config file: ", config_text.status().message()));
+    }
 
-  auto config_text = util::LoadFile(absl::GetFlag(FLAGS_plan).c_str());
-  if (!config_text.ok()) {
-    std::cerr << "Failed to load config file: " << config_text.status()
-              << std::endl;
-    return -1;
+    if (!google::protobuf::TextFormat::ParseFromString(config_text->str(),
+                                                       &plan)) {
+      return absl::InternalError("Failed to parse input config.");
+    }
   }
+  return plan;
+}
 
-  SegmentationPlan plan;
-  if (!google::protobuf::TextFormat::ParseFromString(config_text->str(),
-                                                     &plan)) {
-    std::cerr << "Failed to parse input config." << std::endl;
-    return -1;
-  }
+int main(int argc, char** argv) {
+  absl::SetStderrThreshold(absl::LogSeverityAtLeast::kInfo);
+  auto args = absl::ParseCommandLine(argc, argv);  // must run before GetFlag(), otherwise --verbosity is always its default
+  absl::SetGlobalVLogLevel(absl::GetFlag(FLAGS_verbosity));
+  absl::InitializeLog();
 
   auto font = load_font(absl::GetFlag(FLAGS_input_font).c_str());
   if (!font.ok()) {
@@ -281,10 +317,16 @@ int main(int argc, char** argv) {
     return -1;
   }
 
+  auto plan = CreateSegmentationPlan(font->get());
+  if (!plan.ok()) {
+    std::cerr << plan.status().message() << std::endl;
+    return -1;
+  }
+
   Compiler compiler;
   compiler.SetFace(font->get());
 
-  auto sc = ConfigureCompiler(plan, compiler);
+  auto sc = ConfigureCompiler(*plan, compiler);
   if (!sc.ok()) {
     std::cerr << "Failed to apply configuration to the encoder: " << sc
               << std::endl;
diff --git a/util/segmenter_config_util.cc b/util/segmenter_config_util.cc
index 0da4fe6d..b3320f21 100644
--- a/util/segmenter_config_util.cc
+++ b/util/segmenter_config_util.cc
@@ -2,8 +2,11 @@
 
 #include <cstdint>
 
+#include "common/font_helper.h"
 #include "common/int_set.h"
 #include "common/try.h"
+#include "ift/encoder/closure_glyph_segmenter.h"
+#include "ift/encoder/glyph_segmentation.h"
 #include "ift/encoder/merge_strategy.h"
 #include "ift/encoder/subset_definition.h"
 #include "ift/feature_registry/feature_registry.h"
@@ -20,6 +23,8 @@ using ift::encoder::MergeStrategy;
 using ift::encoder::SubsetDefinition;
 using ift::feature_registry::DefaultFeatureTags;
 using ift::freq::UnicodeFrequencies;
+using ift::encoder::ClosureGlyphSegmenter;
+using ift::encoder::GlyphSegmentation;
 
 namespace util {
 
@@ -277,4 +282,39 @@ SegmenterConfigUtil::ConfigToMergeGroups(
   return merge_groups;
 }
 
+StatusOr<SegmentationResult> SegmenterConfigUtil::RunSegmenter(  // Runs the closure glyph segmenter on face as set up by config.
+    hb_face_t* face, const SegmenterConfig& config) {
+  CodepointSet font_codepoints = common::FontHelper::ToCodepointsSet(face);
+  btree_set<hb_tag_t> font_features = common::FontHelper::GetFeatureTags(face);
+  SubsetDefinition init_segment =
+      SegmentProtoToSubsetDefinition(config.initial_segment());
+
+  std::vector<SubsetDefinition> segments;  // passed to ConfigToMergeGroups(); presumably populated there - confirm
+  btree_map<SegmentSet, MergeStrategy> merge_groups =
+      TRY(ConfigToMergeGroups(config, font_codepoints, font_features, segments));
+
+  ClosureGlyphSegmenter segmenter(  // all four constructor arguments come straight from the config
+      config.brotli_quality(), config.brotli_quality_for_initial_font_merging(),
+      config.unmapped_glyph_handling(), config.condition_analysis_mode());
+
+  GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments(
+      face, init_segment, segments, merge_groups));
+
+  SegmentationPlan plan = segmentation.ToSegmentationPlanProto();
+
+  if (config.generate_table_keyed_segments()) {  // table keyed segments are only added when the config asks for them
+    ClosureGlyphSegmenter::AddTableKeyedSegments(
+        plan, merge_groups, segments, init_segment);
+  }
+
+  SegmentationPlan combined = config.base_segmentation_plan();  // start from the config's base plan...
+  combined.MergeFrom(plan);  // ...then merge the generated plan into it
+
+  return SegmentationResult{
+      std::move(segmentation),
+      std::move(combined),  // becomes SegmentationResult::plan
+      std::move(merge_groups),
+  };
+}
+
 }  // namespace util
\ No newline at end of file
diff --git a/util/segmenter_config_util.h b/util/segmenter_config_util.h
index 6eb6d332..6c9ce50d 100644
--- a/util/segmenter_config_util.h
+++ b/util/segmenter_config_util.h
@@ -4,17 +4,29 @@
 #include "absl/container/btree_map.h"
 #include "absl/status/statusor.h"
 #include "common/int_set.h"
+#include "hb.h"
+#include "ift/encoder/glyph_segmentation.h"
 #include "ift/encoder/merge_strategy.h"
 #include "ift/encoder/subset_definition.h"
+#include "util/segmentation_plan.pb.h"
 #include "util/segmenter_config.pb.h"
 
 namespace util {
 
+struct SegmentationResult {  // Outputs of SegmenterConfigUtil::RunSegmenter().
+  ift::encoder::GlyphSegmentation segmentation;  // value returned by CodepointToGlyphSegments()
+  SegmentationPlan plan;  // the config's base_segmentation_plan() with the generated plan merged in
+  absl::btree_map<common::SegmentSet, ift::encoder::MergeStrategy> merge_groups;  // value returned by ConfigToMergeGroups()
+};
+
 class SegmenterConfigUtil {
  public:
   SegmenterConfigUtil(std::string config_file_path)
       : config_file_path_(config_file_path) {}
 
+  absl::StatusOr<SegmentationResult> RunSegmenter(
+      hb_face_t* face, const SegmenterConfig& config);
+
   ift::encoder::SubsetDefinition SegmentProtoToSubsetDefinition(
       const SegmentProto& segment);