From 9efb2033d0e0239e9971605a8097ad98868bd3f6 Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Wed, 18 Feb 2026 23:01:16 +0000 Subject: [PATCH 1/3] Very basic implementation of a segmenter config generator. This new util can analyze an input font and generate a segmenter config for it. Currently very early stages. --- README.md | 10 + util/BUILD | 58 +++ util/auto_segmenter_config.cc | 535 +++++++++++++++++++++ util/auto_segmenter_config.h | 37 ++ util/auto_segmenter_config_test.cc | 279 +++++++++++ util/closure_glyph_keyed_segmenter_util.cc | 22 +- util/generate_segmenter_config.cc | 55 +++ util/load_codepoints_test.cc | 5 + 8 files changed, 998 insertions(+), 3 deletions(-) create mode 100644 util/auto_segmenter_config.cc create mode 100644 util/auto_segmenter_config.h create mode 100644 util/auto_segmenter_config_test.cc create mode 100644 util/generate_segmenter_config.cc diff --git a/README.md b/README.md index d861e63d..6b2549ff 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,16 @@ script: ./check-format.sh --fix ``` +## Documentation + +The documents under [docs/experimental](docs/experimental) provide some more detailed designs of various aspects of the IFT encoder. Of note: +* [compiler.md](docs/experimental/compiler.md) +* [closure_glyph_segmentation.md](docs/experimental/closure_glyph_segmentation.md) +* [closure_glyph_segmentation_merging.md](docs/experimental/closure_glyph_segmentation_merging.md) +* [closure_glyph_segmentation_complex_conditions.md](docs/experimental/closure_glyph_segmentation_complex_conditions.md) + +These provide a detailed design of how the two major pieces (segmentation and compilation) of IFT font encoding work. 
+ ## Generating compile_commands.json for IDE This repo is configured to use [hedron](https://github.com/hedronvision/bazel-compile-commands-extractor) to produce a diff --git a/util/BUILD b/util/BUILD index 6248f0c8..e46fec96 100644 --- a/util/BUILD +++ b/util/BUILD @@ -103,6 +103,7 @@ cc_binary( "@ift_encoder_data//:freq_data", ], deps = [ + ":auto_segmenter_config", ":load_codepoints", ":segmentation_plan_cc_proto", ":segmenter_config_cc_proto", @@ -154,6 +155,23 @@ cc_library( ], ) +cc_library( + name = "auto_segmenter_config", + srcs = [ + "auto_segmenter_config.cc", + ], + hdrs = [ + "auto_segmenter_config.h", + ], + deps = [ + ":load_codepoints", + ":segmenter_config_cc_proto", + "//common", + "@abseil-cpp//absl/container:flat_hash_set", + "@harfbuzz", + ], +) + cc_library( name = "load_codepoints", srcs = [ @@ -192,6 +210,24 @@ cc_library( ], ) +cc_test( + name = "auto_segmenter_config_test", + size = "small", + srcs = [ + "auto_segmenter_config_test.cc", + ], + data = [ + "//common:testdata", + "@ift_encoder_data//:freq_data", + ], + deps = [ + ":auto_segmenter_config", + "//common", + "@googletest//:gtest_main", + "@harfbuzz", + ], +) + cc_test( name = "convert_iftb_test", size = "small", @@ -247,6 +283,28 @@ cc_test( ], ) +cc_binary( + name = "generate_segmenter_config", + srcs = [ + "generate_segmenter_config.cc", + ], + data = [ + "@ift_encoder_data//:freq_data", + ], + deps = [ + ":auto_segmenter_config", + ":load_codepoints", + ":segmenter_config_cc_proto", + "//common", + "@abseil-cpp//absl/flags:flag", + "@abseil-cpp//absl/flags:parse", + "@abseil-cpp//absl/log:initialize", + "@abseil-cpp//absl/status", + "@harfbuzz", + "@protobuf", + ], +) + cc_binary( name = "iftb2config", srcs = [ diff --git a/util/auto_segmenter_config.cc b/util/auto_segmenter_config.cc new file mode 100644 index 00000000..f3f99f26 --- /dev/null +++ b/util/auto_segmenter_config.cc @@ -0,0 +1,535 @@ +#include "util/auto_segmenter_config.h" + +#include +#include +#include + 
+#include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" +#include "absl/strings/match.h" +#include "absl/strings/strip.h" +#include "common/font_helper.h" +#include "common/int_set.h" +#include "common/try.h" +#include "hb.h" +#include "util/load_codepoints.h" +#include "util/segmenter_config.pb.h" + +using absl::btree_set; +using absl::flat_hash_map; +using absl::flat_hash_set; +using absl::Status; +using absl::StatusOr; +using common::CodepointSet; +using common::FontHelper; + +namespace util { + +static constexpr uint32_t kMinimumGroupSize = 4; + +// TODO(garretrieger): define a very basic set of quality levels first (see next TODO), +// start with just a lowest and highest to set the upper and lower bounds for quality +// settings (maybe also a mid point). To begin use number of codepoints to select quality +// level. Do some testing on segmentation times at low and high to get a sense of +// how times are impacted. + +// TODO(garretrieger): do something analogous to brotli quality levels +// where we define a series of levels which correspond to a set of +// values for the quality/performance tradeoff settings (including setting the +// brotli quality level). Then we need a heuristic to pick a quality level for a +// font. +// +// If we have the ability to estimate the number of brotli ops resulting from +// a specific quality level (including a multiplier for the particular brotli +// quality) then we can select a quality level which keeps brotli ops and closure +// ops within a specific range. +// +// Then can also have a flag/input to force a specific quality level. +// +// To start, the list of parameters we can use to make quality/performance +// tradeoffs: +// +// - unmapped_glyph_handling (global) +// Lower quality is to not find conditions (so use patch or init font), high quality +// is to find conditions. 
+// +// - generate_feature_segments (global): high quality generate segment per feature, low quality put all optional features +// in one segment. +// +// - brotli_quality (global) +// Use estimated number of brotli ops (per merge group) to set this. Take +// into account the effects of preprocess merging prior to selecting this. +// to start use 0, 9 or 11 (avoid qualities less than 9 other than 0) +// +// - brotli_quality_for_initial_font_merging (global) +// Use estimated number of brotli ops for the init font processing to set +// this (by looking at what's potentially in scope) +// +// - preprocess_merging_group_size_for_ungrouped (global) +// Would be reasonable to always have this set to at least the minimum group +// size. +// +// - condition_analysis_mode: always use CLOSURE_AND_DEP_GRAPH. +// +// Merge group settings: +// +// - preprocess_merging_group_size (merge group) +// - preprocess_merging_probability_threshold (merge group) +// Set these for merge groups with very large size, using probability +// threshold first, then group size to clamp ops to a reasonable value. +// group size always starts at the min group size. +// +// - use_bigrams (merge group, cost) +// Probably always want this on, use other settings instead to increase +// performance. On very lowest quality could be disabled +// +// - optimization_cutoff_fraction (merge group, cost) +// For now, probably ok with a global setting of somewhere around 1 to 2.5% +// (doesn't vary). +// +// - initial_font_merge_probability_threshold (merge group, cost) +// May be ok with a global setting, start with 50% +// +// - best_case_size_reduction_fraction (merge group, cost) +// Default is probably fine, but may be worth changing. +// +// - min/max patch_size (merge group, heuristic): +// Probably fixed value for all qualities, has minimal impact on performance. 
+// +// We may want a quality level per merge group, for the init font merge, +// and global +// +// Utilizing quality levels: +// - Have a configurable setting to the auto config call which specifies a rough encoding budget +// (ie. O(1 min), O(10 min), O(1 hour)). Then try to estimate the encoding time at each +// quality level and select the quality level which gets estimated time within the budget. +// - Brotli and closure ops can both contribute significantly to overall segmenting times, +// so we will need to first estimate the typical brotli and closure operation time cost +// for the particular font (eg. run a few random closures and brotli compressions) +// - Then estimate the number of ops that are needed. For closure take into account +// how much savings the dep graph can provide. +// - Finally, overall time can be estimated as (number ops) * (op time) * (fixed scaling factor) +// for both brotli and closure. Total time is the sum. + +// TODO(garretrieger): to help speed up init font processing times when latin is primary script +// consider adding the latin alphabet (upper and lower) directly to the init font. Similar things +// could be done for other scripts if we can find data on what the "core" alphabet is. + +// TODO(garretrieger): collect data on brotli compression times as a function of +// quality assuming group sizes of 4 using a CJK font + +static bool IsScript(absl::string_view file_name) { + return absl::StartsWith(file_name, "Script_"); +} + +static bool IsLanguage(absl::string_view file_name) { + return absl::StartsWith(file_name, "Language_"); +} + +// Changes from "Script_foo.riegeli" to "Foo". 
+static std::string ScriptName(absl::string_view script_name) { + if (IsScript(script_name)) { + script_name.remove_prefix(7); + } + std::string name(script_name); + size_t dot_pos = name.find('.'); + if (dot_pos != std::string::npos) { + name = name.substr(0, dot_pos); + } + + if (!name.empty() && std::islower(name[0])) { + name[0] = std::toupper(name[0]); + } + return name; +} + +static flat_hash_set CjkScripts() { + return { + "Script_CJK.riegeli@*", + "Script_japanese.riegeli@*", + "Script_korean.riegeli@*", + "Script_chinese-simplified.riegeli@*", + "Script_chinese-traditional.riegeli@*", + }; +} + +static CodepointSet CommonCodepoints( + const flat_hash_map& freq_list, bool cjk_only) { + auto cjk_scripts = CjkScripts(); + flat_hash_map unicode_counts; + for (const auto& [file_name, script_codepoints] : freq_list) { + if (!IsScript(file_name)) { + continue; + } + + if (file_name == "Script_CJK.riegeli@*") { + // this is a combination of CJK so ignore for the purposes of common + // codepoints. + continue; + } + + bool is_cjk = cjk_scripts.contains(file_name); + if (cjk_only && !is_cjk) { + continue; + } + + for (hb_codepoint_t u : script_codepoints) { + unicode_counts[u]++; + } + } + + CodepointSet common_codepoints; + for (const auto& [u, count] : unicode_counts) { + if (count > 1) { + common_codepoints.insert(u); + } + } + + return common_codepoints; +} + +static btree_set DetectScripts( + const flat_hash_map& freq_list, + const CodepointSet& unicodes) { + btree_set detected_scripts; + flat_hash_set detected_cjk_scripts; + + CodepointSet common = CommonCodepoints(freq_list, false); + auto cjk_scripts = CjkScripts(); + + for (const auto& [file_name, script_codepoints] : freq_list) { + if (!IsScript(file_name) && file_name != "fallback.riegeli") { + continue; + } + if (file_name == "Script_CJK.riegeli@*") { + // special cased later. 
+ continue; + } + + // To avoid false positives on fonts with common ASCII/punctuation, + // only consider codepoints that are not shared by more than one script for detection. + CodepointSet unique_codepoints = script_codepoints; + unique_codepoints.subtract(common); + + CodepointSet intersection = unique_codepoints; + intersection.intersect(unicodes); + + // TODO(garretrieger): consider using a threshold on intersection size here. + if (intersection.size() > 1) { + LOG(INFO) << "Script " << file_name << " is present, " + << intersection.size() << " codepoints."; + detected_scripts.insert(file_name); + if (cjk_scripts.contains(file_name)) { + detected_cjk_scripts.insert(file_name); + } + } + } + + // The language specific CJK scripts all overlap, so if more than one was detected, or none was detected but codepoints shared between CJK scripts are present, + // then replace the language specific scripts with the unified CJK script. NOTE(review): `common` is counted over a superset of the CJK script files, so it + // appears to already contain every codepoint in only_cjk_common; the subtract() below would then always leave it empty and the second condition never fires - confirm intent. + CodepointSet only_cjk_common = CommonCodepoints(freq_list, true); + only_cjk_common.subtract(common); + if (detected_cjk_scripts.size() > 1 || + (detected_cjk_scripts.empty() && only_cjk_common.intersects(unicodes))) { + // upgrade from individual CJK scripts to the unified one. 
+ for (const auto& script : detected_cjk_scripts) { + detected_scripts.erase(script); + } + + LOG(INFO) << "Script_CJK.riegeli@* added to detected list."; + detected_scripts.insert("Script_CJK.riegeli@*"); + } + + return detected_scripts; +} + +static StatusOr FindFileName( + absl::string_view base_name, + const flat_hash_map& built_in_freqs) { + for (const auto& [file_name, _] : built_in_freqs) { + if (absl::StartsWith(file_name, base_name) && + (file_name.size() == base_name.size() || + file_name[base_name.size()] == '.')) { + return file_name; + } + } + return absl::NotFoundError( + absl::StrCat("Freq file for ", base_name, " was not found.")); +} + +StatusOr AutoSegmenterConfig::GetBaseScriptForLanguage( + absl::string_view language) { + if (absl::EndsWith(language, ".riegeli")) { + language = absl::StripSuffix(language, ".riegeli"); + } + if (absl::EndsWith(language, ".riegeli@*")) { + language = absl::StripSuffix(language, ".riegeli@*"); + } + + static const auto* lang_to_script = + new std::unordered_map{ + {"Language_af", "Script_latin"}, + {"Language_ak", "Script_latin"}, + {"Language_am", "Script_ethiopic"}, + {"Language_ar", "Script_arabic"}, + {"Language_ar-Latn", "Script_latin"}, + {"Language_as", "Script_bengali"}, + {"Language_ay", "Script_latin"}, + {"Language_az", "Script_latin"}, + {"Language_be", "Script_cyrillic"}, + {"Language_bg", "Script_cyrillic"}, + {"Language_bg-Latn", "Script_latin"}, + {"Language_bho", "Script_devanagari"}, + {"Language_bm", "Script_latin"}, + {"Language_bn", "Script_bengali"}, + {"Language_bn-Latn", "Script_latin"}, + {"Language_bs", "Script_latin"}, + {"Language_ca", "Script_latin"}, + {"Language_ceb", "Script_latin"}, + {"Language_ckb", "Script_arabic"}, + {"Language_co", "Script_latin"}, + {"Language_cs", "Script_latin"}, + {"Language_cy", "Script_latin"}, + {"Language_da", "Script_latin"}, + {"Language_de", "Script_latin"}, + {"Language_doi", "Script_devanagari"}, + {"Language_dv", "Script_thaana"}, + 
{"Language_ee", "Script_latin"}, + {"Language_el", "Script_greek"}, + {"Language_el-Latn", "Script_latin"}, + {"Language_en", "Script_latin"}, + {"Language_en-Cyrl", "Script_cyrillic"}, + {"Language_eo", "Script_latin"}, + {"Language_es", "Script_latin"}, + {"Language_et", "Script_latin"}, + {"Language_eu", "Script_latin"}, + {"Language_fa", "Script_arabic"}, + {"Language_ff", "Script_latin"}, + {"Language_fi", "Script_latin"}, + {"Language_fil", "Script_latin"}, + {"Language_fr", "Script_latin"}, + {"Language_fy", "Script_latin"}, + {"Language_ga", "Script_latin"}, + {"Language_gd", "Script_latin"}, + {"Language_gl", "Script_latin"}, + {"Language_gn", "Script_latin"}, + {"Language_gu", "Script_gujarati"}, + {"Language_gu-Latn", "Script_latin"}, + {"Language_ha", "Script_latin"}, + {"Language_haw", "Script_latin"}, + {"Language_hi", "Script_devanagari"}, + {"Language_hi-Latn", "Script_latin"}, + {"Language_hmn", "Script_latin"}, + {"Language_hr", "Script_latin"}, + {"Language_ht", "Script_latin"}, + {"Language_hu", "Script_latin"}, + {"Language_hy", "Script_armenian"}, + {"Language_id", "Script_latin"}, + {"Language_ig", "Script_latin"}, + {"Language_ilo", "Script_latin"}, + {"Language_is", "Script_latin"}, + {"Language_it", "Script_latin"}, + {"Language_iw", "Script_hebrew"}, + {"Language_ja", "Script_japanese"}, + {"Language_ja-Latn", "Script_latin"}, + {"Language_jv", "Script_latin"}, + {"Language_ka", "Script_georgian"}, + {"Language_kk", "Script_cyrillic"}, + {"Language_kl", "Script_latin"}, + {"Language_km", "Script_khmer"}, + {"Language_kn", "Script_kannada"}, + {"Language_kn-Latn", "Script_latin"}, + {"Language_ko", "Script_korean"}, + {"Language_kok", "Script_devanagari"}, + {"Language_kri", "Script_latin"}, + {"Language_ku", "Script_latin"}, + {"Language_ky", "Script_cyrillic"}, + {"Language_la", "Script_latin"}, + {"Language_lb", "Script_latin"}, + {"Language_lg", "Script_latin"}, + {"Language_ln", "Script_latin"}, + {"Language_lo", "Script_lao"}, + 
{"Language_lt", "Script_latin"}, + {"Language_lus", "Script_latin"}, + {"Language_lv", "Script_latin"}, + {"Language_mai", "Script_devanagari"}, + {"Language_mg", "Script_latin"}, + {"Language_mi", "Script_latin"}, + {"Language_mk", "Script_cyrillic"}, + {"Language_ml", "Script_malayalam"}, + {"Language_ml-Latn", "Script_latin"}, + {"Language_mn", "Script_cyrillic"}, + {"Language_mni-Mtei", "Script_meetei-mayek"}, + {"Language_mr", "Script_devanagari"}, + {"Language_mr-Latn", "Script_latin"}, + {"Language_ms", "Script_latin"}, + {"Language_mt", "Script_latin"}, + {"Language_my", "Script_myanmar"}, + {"Language_ne", "Script_devanagari"}, + {"Language_nl", "Script_latin"}, + {"Language_no", "Script_latin"}, + {"Language_nso", "Script_latin"}, + {"Language_ny", "Script_latin"}, + {"Language_om", "Script_latin"}, + {"Language_or", "Script_oriya"}, + {"Language_pa", "Script_gurmukhi"}, + {"Language_pl", "Script_latin"}, + {"Language_ps", "Script_arabic"}, + {"Language_pt", "Script_latin"}, + {"Language_qu", "Script_latin"}, + {"Language_ro", "Script_latin"}, + {"Language_ru", "Script_cyrillic"}, + {"Language_ru-Latn", "Script_latin"}, + {"Language_rw", "Script_latin"}, + {"Language_sa", "Script_devanagari"}, + {"Language_sd", "Script_arabic"}, + {"Language_si", "Script_sinhala"}, + {"Language_sk", "Script_latin"}, + {"Language_sl", "Script_latin"}, + {"Language_sm", "Script_latin"}, + {"Language_sn", "Script_latin"}, + {"Language_so", "Script_latin"}, + {"Language_sq", "Script_latin"}, + {"Language_sr", "Script_cyrillic"}, + {"Language_st", "Script_latin"}, + {"Language_su", "Script_latin"}, + {"Language_sv", "Script_latin"}, + {"Language_sw", "Script_latin"}, + {"Language_ta", "Script_tamil"}, + {"Language_ta-Latn", "Script_latin"}, + {"Language_te", "Script_telugu"}, + {"Language_te-Latn", "Script_latin"}, + {"Language_tg", "Script_cyrillic"}, + {"Language_th", "Script_thai"}, + {"Language_ti", "Script_ethiopic"}, + {"Language_tk", "Script_latin"}, + {"Language_tr", 
"Script_latin"}, + {"Language_ts", "Script_latin"}, + {"Language_tt", "Script_cyrillic"}, + {"Language_ug", "Script_arabic"}, + {"Language_uk", "Script_cyrillic"}, + {"Language_ur", "Script_arabic"}, + {"Language_uz", "Script_latin"}, + {"Language_vi", "Script_latin"}, + {"Language_xh", "Script_latin"}, + {"Language_yi", "Script_hebrew"}, + {"Language_yo", "Script_latin"}, + {"Language_zh-Hani", "Script_chinese-simplified"}, + {"Language_zh-Hans", "Script_chinese-simplified"}, + {"Language_zh-Hant", "Script_chinese-traditional"}, + {"Language_zh-Latn", "Script_latin"}, + {"Language_zu", "Script_latin"}, + }; + auto it = lang_to_script->find(std::string(language)); + if (it != lang_to_script->end()) { + return it->second; + } + return absl::NotFoundError( + absl::StrCat("Unable to find base script for ", language)); +} + +static Status ApplyPrimaryScript( + const flat_hash_map& freq_list, + std::string primary_script, btree_set& detected_scripts) { + std::string primary_base_script = ""; + if (IsLanguage(primary_script)) { + primary_base_script = TRY(FindFileName( + TRY(AutoSegmenterConfig::GetBaseScriptForLanguage(primary_script)), + freq_list)); + } else if (IsScript(primary_script)) { + primary_base_script = TRY(FindFileName(primary_script, freq_list)); + } else { + return absl::InternalError( + absl::StrCat("Unknown freq file type: ", primary_script)); + } + + primary_script = TRY(FindFileName(primary_script, freq_list)); + LOG(INFO) << "Primary script/language: " << primary_script; + LOG(INFO) << "Primary base script is " << primary_base_script; + + // Primary script behaviour: + // - base script if present is replaced by primary script. 
+ // - if base script is CJK, then all CJK's are replaced by primary script + detected_scripts.erase(primary_base_script); + auto cjk_scripts = CjkScripts(); + if (cjk_scripts.contains(primary_base_script)) { + for (const auto& script : cjk_scripts) { + detected_scripts.erase(script); + } + } + + detected_scripts.insert(primary_script); + + return absl::OkStatus(); +} + +absl::StatusOr AutoSegmenterConfig::GenerateConfig( + hb_face_t* face, std::optional primary_script) { + SegmenterConfig config; + config.set_generate_table_keyed_segments(true); + config.set_generate_feature_segments(true); + config.set_unmapped_glyph_handling(FIND_CONDITIONS); + config.set_condition_analysis_mode(CLOSURE_AND_DEP_GRAPH); + + auto* base_plan = config.mutable_base_segmentation_plan(); + base_plan->set_jump_ahead(2); + base_plan->set_use_prefetch_lists(true); + + config.mutable_ungrouped_config()->set_min_patch_size(2500); + + // Collect codepoints + auto freq_list = TRY(BuiltInFrequenciesList()); + CodepointSet unicodes = FontHelper::ToCodepointsSet(face); + uint32_t cp_count = unicodes.size(); + + // Detect scripts by intersection with frequency data + btree_set detected_scripts = DetectScripts(freq_list, unicodes); + + // Quality tradeoffs based on codepoint count + // TODO(garretrieger): alternate approach - estimate the number of brotli ops + // (including accounting for pairs only within merge groups), and then select + // the cutoffs and premerging to keep the number of brotli ops within a + // specific range. + auto* base_cost = config.mutable_base_cost_config(); + base_cost->set_use_bigrams(true); + base_cost->set_min_group_size( + kMinimumGroupSize); // as recommended by the spec. 
+ config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize); + base_cost->set_optimization_cutoff_fraction(0.01); + + if (cp_count > 2000) { + config.set_brotli_quality(9); + } else { + config.set_brotli_quality(11); + } + + TRYV(ApplyPrimaryScript(freq_list, primary_script.value_or("Script_latin"), + detected_scripts)); + std::string primary_script_file = + TRY(FindFileName(primary_script.value_or("Script_latin"), freq_list)); + + // Add merge groups for other detected scripts + for (const std::string& script : detected_scripts) { + auto* mg = config.add_merge_groups(); + mg->set_name(ScriptName(script)); + auto* cost = mg->mutable_cost_config(); + + // TODO(garretrieger): use a heuristic to select probability threshold based + // on estimated number of brotli ops (assuming O(n^2) on codepoints in the + // group). + mg->set_preprocess_merging_group_size(kMinimumGroupSize); + mg->set_preprocess_merging_probability_threshold(0.001); + + cost->set_built_in_freq_data_name(script); + if (script == primary_script_file) { + // TODO(garretrieger): customize these values based on the quality level + cost->set_initial_font_merge_threshold(-60); + cost->set_initial_font_merge_probability_threshold(0.40); + } + } + + return config; +} + +} // namespace util diff --git a/util/auto_segmenter_config.h b/util/auto_segmenter_config.h new file mode 100644 index 00000000..2874fd87 --- /dev/null +++ b/util/auto_segmenter_config.h @@ -0,0 +1,37 @@ +#ifndef UTIL_AUTO_SEGMENTER_CONFIG_H_ +#define UTIL_AUTO_SEGMENTER_CONFIG_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "hb.h" +#include "util/segmenter_config.pb.h" + +namespace util { + +class AutoSegmenterConfig { + public: + // Analyzes the provided font face and generates an appropriate segmenter + // configuration. + // + // primary_script: an optional name of a script or language frequency data + // file (e.g., "Script_cyrillic", "Language_fr"). 
+ // Defaults to "Script_latin" if not provided. + static absl::StatusOr GenerateConfig( + hb_face_t* face, + std::optional primary_script = std::nullopt); + + // Returns the base script for a given language. + // For example, "Language_fr" -> "Script_latin". + static absl::StatusOr GetBaseScriptForLanguage( + absl::string_view language); + + private: + AutoSegmenterConfig() = delete; +}; + +} // namespace util + +#endif // UTIL_AUTO_SEGMENTER_CONFIG_H_ diff --git a/util/auto_segmenter_config_test.cc b/util/auto_segmenter_config_test.cc new file mode 100644 index 00000000..c756a592 --- /dev/null +++ b/util/auto_segmenter_config_test.cc @@ -0,0 +1,279 @@ +#include "util/auto_segmenter_config.h" + +#include + +#include +#include +#include + +#include "absl/strings/match.h" +#include "common/font_data.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "hb.h" +#include "util/load_codepoints.h" + +namespace util { +namespace { + +using ::common::hb_blob_unique_ptr; +using ::common::hb_face_unique_ptr; +using ::common::make_hb_blob; +using ::common::make_hb_face; +using google::protobuf::TextFormat; +using ::testing::Eq; +using ::testing::Pair; +using ::testing::UnorderedElementsAre; + +class AutoSegmenterConfigTest : public ::testing::Test { + protected: + AutoSegmenterConfigTest() + : face_(make_hb_face(nullptr)), cjk_face_(make_hb_face(nullptr)) {} + + void SetUp() override { + hb_blob_unique_ptr roboto_blob = make_hb_blob( + hb_blob_create_from_file("common/testdata/Roboto-Regular.ttf")); + face_ = make_hb_face(hb_face_create(roboto_blob.get(), 0)); + + hb_blob_unique_ptr noto_blob = make_hb_blob( + hb_blob_create_from_file("common/testdata/NotoSansJP-Regular.ttf")); + if (hb_blob_get_length(noto_blob.get()) > 0) { + cjk_face_ = make_hb_face(hb_face_create(noto_blob.get(), 0)); + } + } + + hb_face_unique_ptr face_; + hb_face_unique_ptr cjk_face_; +}; + +using ScriptPair = std::pair; + +static std::vector GetScripts(const SegmenterConfig& config) { + 
std::vector result; + for (const auto& mg : config.merge_groups()) { + result.push_back({mg.name(), mg.cost_config().built_in_freq_data_name()}); + } + return result; +} + +static std::vector GetScriptsWithInitialMergeThreshold( + const SegmenterConfig& config) { + std::vector result; + for (const auto& mg : config.merge_groups()) { + if (mg.cost_config().has_initial_font_merge_threshold()) { + result.push_back(mg.name()); + } + } + return result; +} + +const ScriptPair kLatin = {"Latin", "Script_latin.riegeli"}; +const ScriptPair kCyrillic = {"Cyrillic", "Script_cyrillic.riegeli"}; +const ScriptPair kGreek = {"Greek", "Script_greek.riegeli"}; +const ScriptPair kSymbols = {"Symbols", "Script_symbols.riegeli"}; +const ScriptPair kEmoji = {"Emoji", "Script_emoji.riegeli"}; +const ScriptPair kCJK = {"CJK", "Script_CJK.riegeli@*"}; +const ScriptPair kFallback = {"Fallback", "fallback.riegeli"}; + +TEST_F(AutoSegmenterConfigTest, Roboto_UnspecifiedPrimary) { + auto config_or = AutoSegmenterConfig::GenerateConfig(face_.get()); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT( + GetScripts(*config_or), + UnorderedElementsAre(kLatin, kCyrillic, kGreek, kSymbols, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Latin")); + + std::string config_string; + TextFormat::PrintToString(*config_or, &config_string); + ASSERT_EQ(config_string, R"(unmapped_glyph_handling: FIND_CONDITIONS +generate_table_keyed_segments: true +brotli_quality: 11 +base_cost_config { + use_bigrams: true + min_group_size: 4 + optimization_cutoff_fraction: 0.01 +} +ungrouped_config { + min_patch_size: 2500 +} +preprocess_merging_group_size_for_ungrouped: 4 +merge_groups { + name: "Cyrillic" + preprocess_merging_group_size: 4 + preprocess_merging_probability_threshold: 0.001 + cost_config { + built_in_freq_data_name: "Script_cyrillic.riegeli" + } +} +merge_groups { + name: "Greek" + preprocess_merging_group_size: 4 + 
preprocess_merging_probability_threshold: 0.001 + cost_config { + built_in_freq_data_name: "Script_greek.riegeli" + } +} +merge_groups { + name: "Latin" + preprocess_merging_group_size: 4 + preprocess_merging_probability_threshold: 0.001 + cost_config { + built_in_freq_data_name: "Script_latin.riegeli" + initial_font_merge_threshold: -60 + initial_font_merge_probability_threshold: 0.4 + } +} +merge_groups { + name: "Symbols" + preprocess_merging_group_size: 4 + preprocess_merging_probability_threshold: 0.001 + cost_config { + built_in_freq_data_name: "Script_symbols.riegeli" + } +} +merge_groups { + name: "Fallback" + preprocess_merging_group_size: 4 + preprocess_merging_probability_threshold: 0.001 + cost_config { + built_in_freq_data_name: "fallback.riegeli" + } +} +base_segmentation_plan { + jump_ahead: 2 + use_prefetch_lists: true +} +generate_feature_segments: true +condition_analysis_mode: CLOSURE_AND_DEP_GRAPH +)"); +} + +TEST_F(AutoSegmenterConfigTest, Roboto_ScriptCyrillic) { + auto config_or = + AutoSegmenterConfig::GenerateConfig(face_.get(), "Script_cyrillic"); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT( + GetScripts(*config_or), + UnorderedElementsAre(kLatin, kCyrillic, kGreek, kSymbols, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Cyrillic")); +} + +TEST_F(AutoSegmenterConfigTest, Roboto_LanguageFr) { + auto config_or = + AutoSegmenterConfig::GenerateConfig(face_.get(), "Language_fr"); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT(GetScripts(*config_or), + UnorderedElementsAre(Pair("Language_fr", "Language_fr.riegeli"), + kCyrillic, kGreek, kSymbols, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Language_fr")); +} + +TEST_F(AutoSegmenterConfigTest, NotoSansJP_UnspecifiedPrimary) { + if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found"; + auto config_or = 
AutoSegmenterConfig::GenerateConfig(cjk_face_.get()); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT(GetScripts(*config_or), + UnorderedElementsAre(kLatin, kGreek, kCyrillic, kCJK, kSymbols, + kEmoji, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Latin")); +} + +TEST_F(AutoSegmenterConfigTest, NotoSansJP_ScriptCJK) { + if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found"; + auto config_or = + AutoSegmenterConfig::GenerateConfig(cjk_face_.get(), "Script_CJK"); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT(GetScripts(*config_or), + UnorderedElementsAre(kLatin, kGreek, kCyrillic, kCJK, kSymbols, + kEmoji, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("CJK")); +} + +TEST_F(AutoSegmenterConfigTest, NotoSansJP_ScriptJapanese) { + if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found"; + auto config_or = + AutoSegmenterConfig::GenerateConfig(cjk_face_.get(), "Script_japanese"); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT( + GetScripts(*config_or), + UnorderedElementsAre(kLatin, kGreek, kCyrillic, + Pair("Japanese", "Script_japanese.riegeli@*"), + kSymbols, kEmoji, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Japanese")); +} + +TEST_F(AutoSegmenterConfigTest, NotoSansJP_LanguageZhHans) { + if (!cjk_face_) GTEST_SKIP() << "NotoSansJP-Regular.ttf not found"; + auto config_or = + AutoSegmenterConfig::GenerateConfig(cjk_face_.get(), "Language_zh-Hans"); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT(GetScripts(*config_or), + UnorderedElementsAre( + kLatin, kGreek, kCyrillic, + Pair("Language_zh-Hans", "Language_zh-Hans.riegeli@*"), + kSymbols, kEmoji, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Language_zh-Hans")); +} + +TEST_F(AutoSegmenterConfigTest, 
Roboto_ScriptNotFound) { + auto config_or = + AutoSegmenterConfig::GenerateConfig(face_.get(), "Script_foobar"); + EXPECT_EQ(config_or.status().code(), absl::StatusCode::kNotFound); +} + +TEST_F(AutoSegmenterConfigTest, Roboto_LanguageNotFound) { + auto config_or = + AutoSegmenterConfig::GenerateConfig(face_.get(), "Language_foobar"); + EXPECT_EQ(config_or.status().code(), absl::StatusCode::kNotFound); +} + +TEST_F(AutoSegmenterConfigTest, Roboto_InvalidPrefix) { + auto config_or = + AutoSegmenterConfig::GenerateConfig(face_.get(), "Foo_latin"); + EXPECT_EQ(config_or.status().code(), absl::StatusCode::kInternal); +} + +TEST_F(AutoSegmenterConfigTest, Roboto_FullFileName_Script) { + auto config_or = AutoSegmenterConfig::GenerateConfig( + face_.get(), "Script_cyrillic.riegeli"); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_THAT( + GetScripts(*config_or), + UnorderedElementsAre(kLatin, kCyrillic, kGreek, kSymbols, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Cyrillic")); +} + +TEST_F(AutoSegmenterConfigTest, Roboto_FullFileName_Language) { + auto config_or = + AutoSegmenterConfig::GenerateConfig(face_.get(), "Language_fr.riegeli"); + EXPECT_THAT(GetScripts(*config_or), + UnorderedElementsAre(Pair("Language_fr", "Language_fr.riegeli"), + kCyrillic, kGreek, kSymbols, kFallback)); + EXPECT_THAT(GetScriptsWithInitialMergeThreshold(*config_or), + UnorderedElementsAre("Language_fr")); +} + +TEST_F(AutoSegmenterConfigTest, LanguageMappingsExist) { + auto built_in_freqs_or = util::BuiltInFrequenciesList(); + ASSERT_TRUE(built_in_freqs_or.ok()); + for (const auto& [file_name, _] : *built_in_freqs_or) { + if (!absl::StartsWith(file_name, "Language_")) continue; + std::string language = file_name; + size_t dot_pos = language.find('.'); + if (dot_pos != std::string::npos) language = language.substr(0, dot_pos); + auto base_script = AutoSegmenterConfig::GetBaseScriptForLanguage(language); + 
ASSERT_TRUE(base_script.ok()) + << "No mapping for " << language << ": " << base_script.status(); + } +} + +} // namespace +} // namespace util diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc index 925d4337..09c19cb3 100644 --- a/util/closure_glyph_keyed_segmenter_util.cc +++ b/util/closure_glyph_keyed_segmenter_util.cc @@ -23,6 +23,7 @@ #include "ift/encoder/merge_strategy.h" #include "ift/encoder/subset_definition.h" #include "ift/freq/unicode_frequencies.h" +#include "util/auto_segmenter_config.h" #include "util/load_codepoints.h" #include "util/segmentation_plan.pb.h" #include "util/segmenter_config.pb.h" @@ -42,6 +43,14 @@ ABSL_FLAG( "Path to a text proto file containing the configuration for the segmenter. " "Should contain a single SegmenterConfig message."); +ABSL_FLAG(bool, auto_config, false, + "If set the segmenter configuration will be automatically generated " + "based on the input font."); + +ABSL_FLAG(std::string, primary_script, "Script_latin", + "When auto_config is enabled this sets the primary script or " + "language frequency data file to use."); + ABSL_FLAG(bool, output_segmentation_plan, false, "If set a segmentation plan representing the determined segmentation " "will be output to stdout."); @@ -81,9 +90,15 @@ using ift::encoder::Segment; using ift::encoder::SegmentationCost; using ift::encoder::SubsetDefinition; using ift::freq::UnicodeFrequencies; +using util::AutoSegmenterConfig; using util::SegmenterConfigUtil; -static StatusOr LoadConfig() { +static StatusOr LoadConfig(hb_face_t* font) { + if (absl::GetFlag(FLAGS_auto_config)) { + return AutoSegmenterConfig::GenerateConfig( + font, absl::GetFlag(FLAGS_primary_script)); + } + FontData config_text = TRY(util::LoadFile(absl::GetFlag(FLAGS_config).c_str())); SegmenterConfig config; @@ -191,9 +206,10 @@ static Status OutputFallbackGlyphCount(hb_face_t* original_face, static Status Main(const std::vector args) { hb_face_unique_ptr font = 
TRY(LoadFont(absl::GetFlag(FLAGS_input_font).c_str())); - SegmenterConfig config = TRY(LoadConfig()); + SegmenterConfig config = TRY(LoadConfig(font.get())); - SegmenterConfigUtil config_util(absl::GetFlag(FLAGS_config)); + SegmenterConfigUtil config_util( + absl::GetFlag(FLAGS_auto_config) ? "" : absl::GetFlag(FLAGS_config)); CodepointSet font_codepoints = FontHelper::ToCodepointsSet(font.get()); btree_set font_features = FontHelper::GetFeatureTags(font.get()); diff --git a/util/generate_segmenter_config.cc b/util/generate_segmenter_config.cc new file mode 100644 index 00000000..d0fa8133 --- /dev/null +++ b/util/generate_segmenter_config.cc @@ -0,0 +1,55 @@ +#include + +#include +#include +#include + +#include "absl/flags/flag.h" +#include "absl/flags/parse.h" +#include "absl/log/globals.h" +#include "absl/log/initialize.h" +#include "absl/status/status.h" +#include "common/font_data.h" +#include "common/try.h" +#include "util/auto_segmenter_config.h" +#include "util/load_codepoints.h" + +ABSL_FLAG(std::string, input_font, "in.ttf", + "Path to the font file to analyze."); + +ABSL_FLAG(std::string, primary_script, "Script_latin", + "The primary script or language frequency data file to use."); + +using absl::Status; +using common::hb_face_unique_ptr; +using util::AutoSegmenterConfig; + +static Status Main(const std::vector args) { + std::string input_font_path = absl::GetFlag(FLAGS_input_font); + auto font_data = TRY(util::LoadFile(input_font_path.c_str())); + hb_face_unique_ptr font = font_data.face(); + + auto config = TRY(AutoSegmenterConfig::GenerateConfig( + font.get(), absl::GetFlag(FLAGS_primary_script))); + + std::string output; + if (!google::protobuf::TextFormat::PrintToString(config, &output)) { + return absl::InternalError("Failed to format SegmenterConfig as textproto."); + } + + std::cout << output; + return absl::OkStatus(); +} + +int main(int argc, char** argv) { + absl::SetStderrThreshold(absl::LogSeverityAtLeast::kInfo); + auto args = 
absl::ParseCommandLine(argc, argv); + absl::InitializeLog(); + + Status sc = Main(args); + if (!sc.ok()) { + std::cerr << "Error: " << sc << std::endl; + return -1; + } + return 0; +} diff --git a/util/load_codepoints_test.cc b/util/load_codepoints_test.cc index 60b00542..2f77ab7b 100644 --- a/util/load_codepoints_test.cc +++ b/util/load_codepoints_test.cc @@ -168,9 +168,14 @@ TEST_F(LoadCodepointsTest, BuiltInFrequenciesList) { auto result = util::BuiltInFrequenciesList(); ASSERT_TRUE(result.ok()) << result.status(); EXPECT_FALSE(result->empty()); + EXPECT_TRUE(result->contains("Script_latin.riegeli")); EXPECT_FALSE((*result)["Script_latin.riegeli"].empty()); EXPECT_TRUE((*result)["Script_latin.riegeli"].contains('Q')); + + EXPECT_TRUE(result->contains("Script_japanese.riegeli@*")); + EXPECT_FALSE((*result)["Script_japanese.riegeli@*"].empty()); + EXPECT_TRUE((*result)["Script_japanese.riegeli@*"].contains(0x304C /* が */)); } } // namespace util From 0e6391fd01a5b84ec565ee9b3bd65bfa8181570f Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Thu, 5 Mar 2026 00:27:18 +0000 Subject: [PATCH 2/3] For the auto config generator add a customizable quality level. Numeric value which controls the performance vs quality tradeoff. Lower values maximize performance, higher values maximize segmentation quality (at the cost of longer analysis times). 
--- util/auto_segmenter_config.cc | 168 ++++++++++++++++++--- util/auto_segmenter_config.h | 3 +- util/auto_segmenter_config_test.cc | 46 ++++-- util/closure_glyph_keyed_segmenter_util.cc | 18 ++- util/generate_segmenter_config.cc | 10 +- 5 files changed, 202 insertions(+), 43 deletions(-) diff --git a/util/auto_segmenter_config.cc b/util/auto_segmenter_config.cc index f3f99f26..843404f0 100644 --- a/util/auto_segmenter_config.cc +++ b/util/auto_segmenter_config.cc @@ -27,6 +27,29 @@ namespace util { static constexpr uint32_t kMinimumGroupSize = 4; +// Quality Table: +// Quality | bigrams | find conditions | init brotli | non init brotli | init font merge threshold | opt cut off | preprocess merging | preprocess threshold +// 1 | No | No | 0 | 0 | 60% | 5% | Yes | 5% +// 2 | Yes | No | 0 | 0 | 55% | 4% | Yes | 4% +// 3 | Yes | Yes | 0 | 0 | 50% | 3% | Yes | 3% +// 4 | Yes | Yes | 0 | 9 | 45% | 2% | Yes | 2% +// 5 | Yes | Yes | 9 | 9 | 40% | 1% | Yes | 1% +// 6 | Yes | Yes | 9 | 11 | 30% | 0.5% | Yes | 0.5% +// 7 | Yes | Yes | 11 | 11 | 25% | 0.5% | Yes | 0.05% +// 8 | Yes | Yes | 11 | 11 | 25% | 0.5% | No | na +enum Quality { + MIN = 1, // Alias for ONE + ONE = 1, + TWO = 2, + THREE = 3, + FOUR = 4, + FIVE = 5, + SIX = 6, + SEVEN = 7, + EIGHT = 8, + MAX = 8, // Alias for EIGHT +}; + // TODO(garretrieger): define a very basic set of quality levels first (see next TODO), // start with just a lowest and highest to set the upper and lower bounds for quality // settings (maybe also a mid point). 
To begin use number of codepoints to select quality @@ -464,24 +487,140 @@ static Status ApplyPrimaryScript( return absl::OkStatus(); } +static void ApplyQualityLevelTo(Quality quality, HeuristicConfiguration& config) { + config.set_min_patch_size(2500); +} + +static void ApplyQualityLevelTo(Quality quality, CostConfiguration& config) { + config.set_min_group_size(kMinimumGroupSize); + + if (quality == ONE) { + config.set_use_bigrams(false); + } else { + config.set_use_bigrams(true); + } + + switch (quality) { + case ONE: config.set_optimization_cutoff_fraction(0.05); break; + case TWO: config.set_optimization_cutoff_fraction(0.04); break; + case THREE: config.set_optimization_cutoff_fraction(0.03); break; + case FOUR: config.set_optimization_cutoff_fraction(0.02); break; + case FIVE: config.set_optimization_cutoff_fraction(0.01); break; + case SIX: + case SEVEN: + case EIGHT: + default: config.set_optimization_cutoff_fraction(0.005); break; + } +} + +static void ApplyQualityLevelTo(Quality quality, MergeGroup& merge_group) { + if (merge_group.has_cost_config()) { + if (quality >= ONE && quality <= SEVEN) { + merge_group.set_preprocess_merging_group_size(kMinimumGroupSize); + } else { + merge_group.set_preprocess_merging_group_size(1); + } + + switch (quality) { + case ONE: merge_group.set_preprocess_merging_probability_threshold(0.05); break; + case TWO: merge_group.set_preprocess_merging_probability_threshold(0.04); break; + case THREE: merge_group.set_preprocess_merging_probability_threshold(0.03); break; + case FOUR: merge_group.set_preprocess_merging_probability_threshold(0.02); break; + case FIVE: merge_group.set_preprocess_merging_probability_threshold(0.01); break; + case SIX: merge_group.set_preprocess_merging_probability_threshold(0.005); break; + case SEVEN: merge_group.set_preprocess_merging_probability_threshold(0.0005); break; + case EIGHT: + default: merge_group.clear_preprocess_merging_probability_threshold(); break; + } + + if 
(merge_group.mutable_cost_config()->has_initial_font_merge_threshold()) { + switch (quality) { + case ONE: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.60); break; + case TWO: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.55); break; + case THREE: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.50); break; + case FOUR: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.45); break; + case FIVE: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.40); break; + case SIX: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.30); break; + case SEVEN: + case EIGHT: + default: merge_group.mutable_cost_config()->set_initial_font_merge_probability_threshold(0.25); break; + } + } + } +} + +static void ApplyQualityLevelTo(Quality quality, SegmenterConfig& config) { + config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize); + + if (quality == ONE || quality == TWO) { + config.set_unmapped_glyph_handling(MOVE_TO_INIT_FONT); + } else { + config.set_unmapped_glyph_handling(FIND_CONDITIONS); + } + + switch (quality) { + case ONE: + case TWO: + case THREE: + config.set_brotli_quality(0); + break; + case FOUR: + case FIVE: + config.set_brotli_quality(9); + break; + case SIX: + case SEVEN: + case EIGHT: + default: + config.set_brotli_quality(11); + break; + } + + switch (quality) { + case ONE: + case TWO: + case THREE: + case FOUR: + config.set_brotli_quality_for_initial_font_merging(0); + break; + case FIVE: + case SIX: + config.set_brotli_quality_for_initial_font_merging(9); + break; + case SEVEN: + case EIGHT: + default: + config.set_brotli_quality_for_initial_font_merging(11); + break; + } + + ApplyQualityLevelTo(quality, *config.mutable_base_heuristic_config()); + ApplyQualityLevelTo(quality, *config.mutable_base_cost_config()); + + for (auto& merge_group : 
*config.mutable_merge_groups()) { + ApplyQualityLevelTo(quality, merge_group); + } +} + absl::StatusOr AutoSegmenterConfig::GenerateConfig( - hb_face_t* face, std::optional primary_script) { + hb_face_t* face, std::optional primary_script, std::optional quality_level) { SegmenterConfig config; config.set_generate_table_keyed_segments(true); config.set_generate_feature_segments(true); - config.set_unmapped_glyph_handling(FIND_CONDITIONS); config.set_condition_analysis_mode(CLOSURE_AND_DEP_GRAPH); auto* base_plan = config.mutable_base_segmentation_plan(); base_plan->set_jump_ahead(2); base_plan->set_use_prefetch_lists(true); - config.mutable_ungrouped_config()->set_min_patch_size(2500); - // Collect codepoints auto freq_list = TRY(BuiltInFrequenciesList()); CodepointSet unicodes = FontHelper::ToCodepointsSet(face); uint32_t cp_count = unicodes.size(); + Quality quality = cp_count > 2000 ? MIN : MAX; + if (quality_level.has_value() && quality_level.value() >= ONE && quality_level.value() <= MAX) { + quality = static_cast(quality_level.value()); + } // Detect scripts by intersection with frequency data btree_set detected_scripts = DetectScripts(freq_list, unicodes); @@ -491,18 +630,6 @@ absl::StatusOr AutoSegmenterConfig::GenerateConfig( // (including accounting for pairs only within merge groups), and then select // the cutoffs and premerging to keep the number of brotli ops within a // specific range. - auto* base_cost = config.mutable_base_cost_config(); - base_cost->set_use_bigrams(true); - base_cost->set_min_group_size( - kMinimumGroupSize); // as recommended by the spec. 
- config.set_preprocess_merging_group_size_for_ungrouped(kMinimumGroupSize); - base_cost->set_optimization_cutoff_fraction(0.01); - - if (cp_count > 2000) { - config.set_brotli_quality(9); - } else { - config.set_brotli_quality(11); - } TRYV(ApplyPrimaryScript(freq_list, primary_script.value_or("Script_latin"), detected_scripts)); @@ -515,20 +642,15 @@ absl::StatusOr AutoSegmenterConfig::GenerateConfig( mg->set_name(ScriptName(script)); auto* cost = mg->mutable_cost_config(); - // TODO(garretrieger): use a heuristic to select probability threshold based - // on estimated number of brotli ops (assuming O(n^2) on codepoints in the - // group). - mg->set_preprocess_merging_group_size(kMinimumGroupSize); - mg->set_preprocess_merging_probability_threshold(0.001); - cost->set_built_in_freq_data_name(script); if (script == primary_script_file) { // TODO(garretrieger): customize these values based on the quality level cost->set_initial_font_merge_threshold(-60); - cost->set_initial_font_merge_probability_threshold(0.40); } } + ApplyQualityLevelTo(quality, config); + return config; } diff --git a/util/auto_segmenter_config.h b/util/auto_segmenter_config.h index 2874fd87..9c974a7d 100644 --- a/util/auto_segmenter_config.h +++ b/util/auto_segmenter_config.h @@ -21,7 +21,8 @@ class AutoSegmenterConfig { // Defaults to "Script_latin" if not provided. static absl::StatusOr GenerateConfig( hb_face_t* face, - std::optional primary_script = std::nullopt); + std::optional primary_script = std::nullopt, + std::optional quality_level = std::nullopt); // Returns the base script for a given language. // For example, "Language_fr" -> "Script_latin". 
diff --git a/util/auto_segmenter_config_test.cc b/util/auto_segmenter_config_test.cc index c756a592..6a11725e 100644 --- a/util/auto_segmenter_config_test.cc +++ b/util/auto_segmenter_config_test.cc @@ -89,53 +89,49 @@ TEST_F(AutoSegmenterConfigTest, Roboto_UnspecifiedPrimary) { ASSERT_EQ(config_string, R"(unmapped_glyph_handling: FIND_CONDITIONS generate_table_keyed_segments: true brotli_quality: 11 +brotli_quality_for_initial_font_merging: 11 +base_heuristic_config { + min_patch_size: 2500 +} base_cost_config { use_bigrams: true min_group_size: 4 - optimization_cutoff_fraction: 0.01 -} -ungrouped_config { - min_patch_size: 2500 + optimization_cutoff_fraction: 0.005 } preprocess_merging_group_size_for_ungrouped: 4 merge_groups { name: "Cyrillic" - preprocess_merging_group_size: 4 - preprocess_merging_probability_threshold: 0.001 + preprocess_merging_group_size: 1 cost_config { built_in_freq_data_name: "Script_cyrillic.riegeli" } } merge_groups { name: "Greek" - preprocess_merging_group_size: 4 - preprocess_merging_probability_threshold: 0.001 + preprocess_merging_group_size: 1 cost_config { built_in_freq_data_name: "Script_greek.riegeli" } } merge_groups { name: "Latin" - preprocess_merging_group_size: 4 - preprocess_merging_probability_threshold: 0.001 + preprocess_merging_group_size: 1 cost_config { built_in_freq_data_name: "Script_latin.riegeli" initial_font_merge_threshold: -60 - initial_font_merge_probability_threshold: 0.4 + initial_font_merge_probability_threshold: 0.25 } } merge_groups { name: "Symbols" - preprocess_merging_group_size: 4 - preprocess_merging_probability_threshold: 0.001 + preprocess_merging_group_size: 1 cost_config { built_in_freq_data_name: "Script_symbols.riegeli" } } merge_groups { name: "Fallback" - preprocess_merging_group_size: 4 - preprocess_merging_probability_threshold: 0.001 + preprocess_merging_group_size: 1 cost_config { built_in_freq_data_name: "fallback.riegeli" } @@ -275,5 +271,25 @@ TEST_F(AutoSegmenterConfigTest, 
LanguageMappingsExist) { } } +TEST_F(AutoSegmenterConfigTest, QualityLevelForcing) { + auto config_or = AutoSegmenterConfig::GenerateConfig( + face_.get(), std::nullopt, 1); + ASSERT_TRUE(config_or.ok()) << config_or.status(); + EXPECT_EQ(config_or->brotli_quality(), 0); + EXPECT_EQ(config_or->unmapped_glyph_handling(), MOVE_TO_INIT_FONT); + EXPECT_EQ(config_or->base_cost_config().use_bigrams(), false); + EXPECT_EQ(config_or->brotli_quality_for_initial_font_merging(), 0); + EXPECT_EQ(config_or->base_cost_config().optimization_cutoff_fraction(), 0.05); + + auto config_or_8 = AutoSegmenterConfig::GenerateConfig( + face_.get(), std::nullopt, 8); + ASSERT_TRUE(config_or_8.ok()) << config_or_8.status(); + EXPECT_EQ(config_or_8->brotli_quality(), 11); + EXPECT_EQ(config_or_8->unmapped_glyph_handling(), FIND_CONDITIONS); + EXPECT_EQ(config_or_8->base_cost_config().use_bigrams(), true); + EXPECT_EQ(config_or_8->brotli_quality_for_initial_font_merging(), 11); + EXPECT_EQ(config_or_8->base_cost_config().optimization_cutoff_fraction(), 0.005); +} + } // namespace } // namespace util diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc index 09c19cb3..fabe08af 100644 --- a/util/closure_glyph_keyed_segmenter_util.cc +++ b/util/closure_glyph_keyed_segmenter_util.cc @@ -1,9 +1,9 @@ #include #include -#include #include #include +#include #include "absl/container/btree_map.h" #include "absl/container/flat_hash_map.h" @@ -43,6 +43,9 @@ ABSL_FLAG( "Path to a text proto file containing the configuration for the segmenter. " "Should contain a single SegmenterConfig message."); +ABSL_FLAG(int, auto_config_quality, 0, + "The quality level to use when auto_config is enabled. A value of 0 means auto pick. 
Valid values are 1-8."); + ABSL_FLAG(bool, auto_config, false, "If set the segmenter configuration will be automatically generated " "based on the input font."); @@ -95,8 +98,12 @@ using util::SegmenterConfigUtil; static StatusOr LoadConfig(hb_face_t* font) { if (absl::GetFlag(FLAGS_auto_config)) { + std::optional quality_level = std::nullopt; + if (absl::GetFlag(FLAGS_auto_config_quality) > 0) { + quality_level = absl::GetFlag(FLAGS_auto_config_quality); + } return AutoSegmenterConfig::GenerateConfig( - font, absl::GetFlag(FLAGS_primary_script)); + font, absl::GetFlag(FLAGS_primary_script), quality_level); } FontData config_text = @@ -143,7 +150,7 @@ static Status Analysis(hb_face_t* font, group_index++; } - std::cerr << "total_cost_across_groups = " << overall_cost << std::endl; + std::cerr << "total_cost_across_groups = " << (uint64_t) overall_cost << std::endl; return absl::OkStatus(); } @@ -224,8 +231,13 @@ static Status Main(const std::vector args) { ClosureGlyphSegmenter segmenter( config.brotli_quality(), config.brotli_quality_for_initial_font_merging(), config.unmapped_glyph_handling(), config.condition_analysis_mode()); + + auto start_time = std::chrono::high_resolution_clock::now(); GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments( font.get(), init_segment, segments, merge_groups)); + auto end_time = std::chrono::high_resolution_clock::now(); + std::chrono::duration duration = end_time - start_time; + std::cerr << "CodepointToGlyphSegments took: " << duration.count() << " seconds" << std::endl; if (absl::GetFlag(FLAGS_output_segmentation_plan)) { SegmentationPlan plan = segmentation.ToSegmentationPlanProto(); diff --git a/util/generate_segmenter_config.cc b/util/generate_segmenter_config.cc index d0fa8133..7af04759 100644 --- a/util/generate_segmenter_config.cc +++ b/util/generate_segmenter_config.cc @@ -20,6 +20,9 @@ ABSL_FLAG(std::string, input_font, "in.ttf", ABSL_FLAG(std::string, primary_script, "Script_latin", "The primary 
script or language frequency data file to use."); +ABSL_FLAG(int, quality, 0, + "The quality level to use. A value of 0 means auto pick. Valid values are 1-8."); + using absl::Status; using common::hb_face_unique_ptr; using util::AutoSegmenterConfig; @@ -29,8 +32,13 @@ static Status Main(const std::vector args) { auto font_data = TRY(util::LoadFile(input_font_path.c_str())); hb_face_unique_ptr font = font_data.face(); + std::optional quality_level = std::nullopt; + if (absl::GetFlag(FLAGS_quality) > 0) { + quality_level = absl::GetFlag(FLAGS_quality); + } + auto config = TRY(AutoSegmenterConfig::GenerateConfig( - font.get(), absl::GetFlag(FLAGS_primary_script))); + font.get(), absl::GetFlag(FLAGS_primary_script), quality_level)); std::string output; if (!google::protobuf::TextFormat::PrintToString(config, &output)) { From dc6c3a8d92a0fc930c2b9cdac2c01f82924095a5 Mon Sep 17 00:00:00 2001 From: Garret Rieger Date: Fri, 6 Mar 2026 21:50:59 +0000 Subject: [PATCH 3/3] Add the auto segmenter config to font2ift. This allows font2ift to perform the full IFT encoding process: 1. Auto generate segmenter config. 2. Run segmenter. 3. Compile the font. If a segmentation plan is not supplied to font2ift it will then use the segmenter auto config and closure segmenter to generate one.
--- README.md | 112 ++++++++++++++++----- ift/encoder/closure_glyph_segmenter.cc | 33 ++++++ ift/encoder/closure_glyph_segmenter.h | 8 ++ util/BUILD | 20 ++++ util/auto_config_flags.cc | 15 +++ util/auto_config_flags.h | 11 ++ util/auto_segmenter_config.cc | 31 +++--- util/auto_segmenter_config.h | 5 + util/closure_glyph_keyed_segmenter_util.cc | 92 ++++------------- util/font2ift.cc | 76 ++++++++++---- util/segmenter_config_util.cc | 40 ++++++++ util/segmenter_config_util.h | 12 +++ 12 files changed, 327 insertions(+), 128 deletions(-) create mode 100644 util/auto_config_flags.cc create mode 100644 util/auto_config_flags.h diff --git a/README.md b/README.md index 6b2549ff..bd000300 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ script: ## Documentation The documents under [docs/experimental](docs/experimental) provide some more detailed designs of various aspects of the IFT encoder. Of note: -* [compiler.md](docs/experimental) +* [compiler.md](docs/experimental/compiler.md) * [closure_glyph_segmentation.md](docs/experimental/closure_glyph_segmentation.md) * [closure_glyph_segmentation_merging.md](docs/experimental/closure_glyph_segmentation_merging.md) * [closure_glyph_segmentation_complex_conditions.md](docs/experimental/closure_glyph_segmentation_complex_conditions.md) @@ -77,13 +77,81 @@ bazel run @hedron_compile_commands//:refresh_all Will generate a compile_commands.json file. -## Producing IFT Encoded Fonts +## Producing IFT Encoded Fonts (with Auto Config) -IFT encoded fonts are produced in two steps: -1. A segmentation plan is generated which specifies how the font file should be split up in the IFT encoding. -2. The IFT encoded font and patches are compiled by the Compiler sub module using the segmentation plan. +The simplest way to create IFT fonts is via the `font2ift` utility utilizing the auto configuration mode. +This is done by running the utility and not providing a segmentation plan. 
Example invocation: -### Generating Segmentation Plan +```bash +bazel run -c opt @ift_encoder//util:font2ift -- \ + --input_font="$HOME/fonts/myfont/MyFont.ttf" \ + --output_path=$HOME/fonts/myfont/ift/ \ + --output_font="MyFont-IFT.woff2" +``` + +This will analyze the input font, decide how to segment it, and then produce the final IFT encoded font +and patches. + +When utilizing auto config there are two optional flags which can be used to adjust the behaviour: +* `--auto_config_primary_script`: this tells the config generator which language/script the font is intended + to be used with. It has two effects: first the codepoints of the primary script are eligible to be moved + into the initial font. Second for scripts with large overlaps, such as CJK, primary script selects which + of the overlapping scripts to use frequency data from. Values refer to frequency data files in + [ift-encoder-data](https://github.com/w3c/ift-encoder-data/tree/main/data). Example values: "Script_bengali", + "Language_fr" + +* `--auto_config_quality`: This is analogous to a quality level in a compression library. It controls how much + effort is spent to improve the efficiency of the final IFT font. Values range from 1 to 8, where higher + values increase encoding times but typically result in a more efficient end IFT font (i.e. fewer bytes + transferred by clients using it). + +Example command line with optional flags: + +```bash +bazel run -c opt @ift_encoder//util:font2ift -- \ + --input_font="$HOME/fonts/NotoSansJP-Regular.otf" \ + --output_path=$HOME/fonts/ift/ \ + --output_font="NotoSansJP-Regular-IFT.woff2" \ + --auto_config_primary_script=Script_japanese \ + --auto_config_quality=3 +``` + +*Note: the auto configuration mode is still under development, in particular the auto selection of quality level +is currently quite simplistic.
It's expected to continue to evolve from its current state.* + +## Producing IFT Encoded Fonts (Advanced) + +Under the hood IFT font encoding happens in three stages: + +1. Generate or write a segmenter config for the font. +2. Generate a segmentation plan, which describes how the font is split into patches. Takes the segmenter config as an input. +3. Compile the final IFT encoded font following the segmentation plan. + +For more advanced use cases these steps can be performed individually. This allows the segmenter config +and segmentation plans to be fine tuned beyond what auto configuration is capable of. + +### Step 1: Generating a Segmenter Config + +There are two main options for generating a segmenter config: + +1. Write the config by hand, the segmenter is configured via an input configuration file using the + [segmenter_config.proto](util/segmenter_config.proto) schema, see the comments there for more details. + This option is useful when maximum control over segmentation parameters is needed, or custom frequency + data is being supplied. + +2. Auto generate the segmenter config using `util:generate_segmenter_config`. + + ``` + CC=clang bazel run //util:generate_segmenter_config -- \ + --quality=5 \ + --input_font=$HOME/MyFont.ttf > config.txtpb + ``` + + This analyzes the input font and tries to pick appropriate config values automatically. As discussed in + the previous "Producing IFT Encoded Fonts" section there is a configurable quality level. If needed + the auto generated config can be hand tweaked after generation. + +### Step 2: Generating Segmentation Plan Segmentation plans are in a [textproto format](https://protobuf.dev/reference/protobuf/textformat-spec/) using the [segmentation_plan.proto](util/segmentation_plan.proto) schema. See the comments in the schema file for more information. @@ -93,17 +161,9 @@ possible to write plans by hand, or develop new utilities to generate plans. In this repo 3 options are currently provided: -1.
`util/generate_table_keyed_config`: this utility generates the table keyed (extension segments that augment non - glyph data in the font) portion of a plan. Example execution: - - ```sh - bazel run -c opt util:generate_table_keyed_config -- \ - --font=$(pwd)/myfont.ttf \ - latin.txt cyrillic.txt greek.txt > table_keyed.txtpb - ``` - -2. `util/closure_glyph_keyed_segmenter_util`: this utility uses a subsetting closure based approach to generate a glyph - keyed segmentation plan (extension segments that augment glyph data). Example execution: +1. [Recommended] `util/closure_glyph_keyed_segmenter_util`: this utility uses a subsetting closure based approach + to generate a glyph keyed segmentation plan (extension segments that augment glyph data). It can optionally + generate the table keyed portion of the config as well. Example execution: ```sh bazel run -c opt util:closure_glyph_keyed_segmenter_util -- \ @@ -119,6 +179,15 @@ In this repo 3 options are currently provided: Note: this utility is under active development and still very experimental. See [the status section](docs/experimental/closure_glyph_segmentation.md#status) for more details. +2. `util/generate_table_keyed_config`: this utility generates the table keyed (extension segments that augment non + glyph data in the font) portion of a plan. Example execution: + + ```sh + bazel run -c opt util:generate_table_keyed_config -- \ + --font=$(pwd)/myfont.ttf \ + latin.txt cyrillic.txt greek.txt > table_keyed.txtpb + ``` + 3. `util/iftb2config`: this utility converts a segmentation obtained from the [binned incremental font transfer prototype](https://github.com/adobe/binned-ift-reference) into and equivalent segmentation plan. 
Example execution: @@ -128,23 +197,20 @@ In this repo 3 options are currently provided: bazel run util:iftb2config > segmentation_plan.txtpb ``` -If seperate glyph keyed and table keyed configs were generated using #1 and #2 they can then be combined into one +If separate glyph keyed and table keyed configs were generated using #1 and #2 they can then be combined into one complete plan by concatenating them: ```sh cat glyph_keyed.txtpb table_keyed.txtpb > segmentation_plan.txtpb ``` -Additional tools for generating encoder configs are planned to be added in the future. - For concrete examples of how to generate IFT fonts, see the [IFT Demo](https://github.com/garretrieger/ift-demo). In particular the [Makefile](https://github.com/garretrieger/ift-demo/blob/main/Makefile) and the [segmenter configs](https://github.com/garretrieger/ift-demo/tree/main/config) may be helpful. -### Generating an IFT Encoding +### Step 3: Generating an IFT Encoding -Once an segmentation plan has been created it can be combined with the target font to produce and incremental font and collection -of associated patches using the font2ift utility which is a wrapper around the compiler. Example execution: +Once a segmentation plan has been created it can be combined with the target font to produce an incremental font and collection of associated patches using the font2ift utility which is a wrapper around the compiler. 
Example execution: ```sh bazel -c opt run util:font2ift -- \ diff --git a/ift/encoder/closure_glyph_segmenter.cc b/ift/encoder/closure_glyph_segmenter.cc index a83d5daf..72227678 100644 --- a/ift/encoder/closure_glyph_segmenter.cc +++ b/ift/encoder/closure_glyph_segmenter.cc @@ -734,4 +734,37 @@ Status ClosureGlyphSegmenter::FallbackCost( return absl::OkStatus(); } +void ClosureGlyphSegmenter::AddTableKeyedSegments( + SegmentationPlan& plan, + const btree_map& merge_groups, + const std::vector& segments, + const SubsetDefinition& init_segment) { + std::vector table_keyed_segments; + for (const auto& [segment_ids, _] : merge_groups) { + SubsetDefinition new_segment; + for (uint32_t s : segment_ids) { + new_segment.Union(segments.at(s)); + } + new_segment.Subtract(init_segment); + table_keyed_segments.push_back(new_segment); + } + + uint32_t max_id = 0; + for (const auto& [id, _] : plan.segments()) { + if (id > max_id) { + max_id = id; + } + } + + uint32_t next_id = max_id + 1; + auto* plan_segments = plan.mutable_segments(); + for (const SubsetDefinition& def : table_keyed_segments) { + GlyphSegmentation::SubsetDefinitionToSegment(def, + (*plan_segments)[next_id]); + SegmentsProto* segment_ids = plan.add_non_glyph_segments(); + segment_ids->add_values(next_id); + next_id++; + } +} + } // namespace ift::encoder diff --git a/ift/encoder/closure_glyph_segmenter.h b/ift/encoder/closure_glyph_segmenter.h index 09338559..d29e074c 100644 --- a/ift/encoder/closure_glyph_segmenter.h +++ b/ift/encoder/closure_glyph_segmenter.h @@ -4,6 +4,7 @@ #include #include +#include "absl/container/btree_map.h" #include "absl/status/statusor.h" #include "ift/encoder/glyph_segmentation.h" #include "ift/encoder/merge_strategy.h" @@ -11,6 +12,7 @@ #include "ift/encoder/subset_definition.h" #include "ift/freq/probability_calculator.h" #include "util/common.pb.h" +#include "util/segmentation_plan.pb.h" #include "util/segmenter_config.pb.h" namespace ift::encoder { @@ -89,6 +91,12 @@ class 
ClosureGlyphSegmenter { uint32_t& fallback_glyphs_size, uint32_t& all_glyphs_size) const; + static void AddTableKeyedSegments( + SegmentationPlan& plan, + const absl::btree_map& merge_groups, + const std::vector& segments, + const SubsetDefinition& init_segment); + private: uint32_t brotli_quality_; uint32_t init_font_merging_brotli_quality_; diff --git a/util/BUILD b/util/BUILD index e46fec96..0dcb17d6 100644 --- a/util/BUILD +++ b/util/BUILD @@ -64,9 +64,15 @@ cc_binary( srcs = [ "font2ift.cc", ], + data = [ + "@ift_encoder_data//:freq_data", + ], deps = [ + ":auto_config_flags", + ":auto_segmenter_config", ":load_codepoints", ":segmentation_plan_cc_proto", + ":segmenter_config_util", "//common", "//ift", "//ift/encoder", @@ -76,6 +82,7 @@ cc_binary( "@abseil-cpp//absl/status:statusor", "@abseil-cpp//absl/strings", "@harfbuzz", + "//util:segmenter_config_cc_proto", ], ) @@ -103,6 +110,7 @@ cc_binary( "@ift_encoder_data//:freq_data", ], deps = [ + ":auto_config_flags", ":auto_segmenter_config", ":load_codepoints", ":segmentation_plan_cc_proto", @@ -138,6 +146,16 @@ cc_binary( ], ) +cc_library( + name = "auto_config_flags", + srcs = ["auto_config_flags.cc"], + hdrs = ["auto_config_flags.h"], + visibility = ["//visibility:public"], + deps = [ + "@abseil-cpp//absl/flags:flag", + ], +) + cc_library( name = "convert_iftb", srcs = [ @@ -203,10 +221,12 @@ cc_library( ], deps = [ ":load_codepoints", + ":segmentation_plan_cc_proto", ":segmenter_config_cc_proto", "//common", "//ift/encoder", "@abseil-cpp//absl/status:statusor", + "@harfbuzz", ], ) diff --git a/util/auto_config_flags.cc b/util/auto_config_flags.cc new file mode 100644 index 00000000..ab35f9f5 --- /dev/null +++ b/util/auto_config_flags.cc @@ -0,0 +1,15 @@ +#include "util/auto_config_flags.h" + +#include + +#include "absl/flags/flag.h" + +ABSL_FLAG(int, auto_config_quality, 0, + "The quality level to use when generating a segmenter config. A value of 0 " + "means auto pick. 
Valid values are 1-8."); + +ABSL_FLAG(std::string, auto_config_primary_script, "Script_latin", + "When auto_config is enabled this sets the primary script or " + "language frequency data file to use. " + "The primary script is eligible to have codepoints moved to the init font. " + "For CJK primary script can be used to specialize against a specific language/script."); diff --git a/util/auto_config_flags.h b/util/auto_config_flags.h new file mode 100644 index 00000000..4f158361 --- /dev/null +++ b/util/auto_config_flags.h @@ -0,0 +1,11 @@ +#ifndef UTIL_AUTO_CONFIG_FLAGS_H_ +#define UTIL_AUTO_CONFIG_FLAGS_H_ + +#include + +#include "absl/flags/declare.h" + +ABSL_DECLARE_FLAG(int, auto_config_quality); +ABSL_DECLARE_FLAG(std::string, auto_config_primary_script); + +#endif // UTIL_AUTO_CONFIG_FLAGS_H_ diff --git a/util/auto_segmenter_config.cc b/util/auto_segmenter_config.cc index 843404f0..b17b465a 100644 --- a/util/auto_segmenter_config.cc +++ b/util/auto_segmenter_config.cc @@ -2,7 +2,6 @@ #include #include -#include #include "absl/container/flat_hash_set.h" #include "absl/log/log.h" @@ -50,12 +49,6 @@ enum Quality { MAX = 8, // Alias for EIGHT }; -// TODO(garretrieger): define a very basic set of quality levels first (see next TODO), -// start with just a lowest and highest to set the upper and lower bounds for quality -// settings (maybe also a mid point). To begin use number of codepoints to select quality -// level. Do some testing on segmentation times at low and high to get a sense of -// how times are impacted. 
- // TODO(garretrieger): do something analagous to brotli quality levels // where we define a series of levels which correspond to a set of // values for the quality/performance tradeoff settings (including setting the @@ -291,7 +284,7 @@ StatusOr AutoSegmenterConfig::GetBaseScriptForLanguage( } static const auto* lang_to_script = - new std::unordered_map{ + new flat_hash_map { {"Language_af", "Script_latin"}, {"Language_ak", "Script_latin"}, {"Language_am", "Script_ethiopic"}, @@ -602,7 +595,7 @@ static void ApplyQualityLevelTo(Quality quality, SegmenterConfig& config) { } } -absl::StatusOr AutoSegmenterConfig::GenerateConfig( +StatusOr AutoSegmenterConfig::GenerateConfig( hb_face_t* face, std::optional primary_script, std::optional quality_level) { SegmenterConfig config; config.set_generate_table_keyed_segments(true); @@ -617,9 +610,22 @@ absl::StatusOr AutoSegmenterConfig::GenerateConfig( auto freq_list = TRY(BuiltInFrequenciesList()); CodepointSet unicodes = FontHelper::ToCodepointsSet(face); uint32_t cp_count = unicodes.size(); - Quality quality = cp_count > 2000 ? MIN : MAX; - if (quality_level.has_value() && quality_level.value() >= ONE && quality_level.value() <= MAX) { + + // TODO(garretrieger): more sophisticated scheme for auto picking quality level. + // roughly we want to estimate the expected cost of each quality level and pick + // based on that. 
+ Quality quality = THREE; + if (cp_count <= 1000) { + quality = MAX; + } else if (cp_count <= 3000) { + quality = SIX; + } + + if (quality_level.has_value() && quality_level.value() >= MIN && quality_level.value() <= MAX) { quality = static_cast(quality_level.value()); + VLOG(0) << "Using specified quality level for segmenting: " << quality; + } else { + VLOG(0) << "Quality level unspecified, auto picked: " << quality; } // Detect scripts by intersection with frequency data @@ -644,7 +650,6 @@ absl::StatusOr AutoSegmenterConfig::GenerateConfig( cost->set_built_in_freq_data_name(script); if (script == primary_script_file) { - // TODO(garretrieger): customize these values based on the quality level cost->set_initial_font_merge_threshold(-60); } } @@ -654,4 +659,4 @@ absl::StatusOr AutoSegmenterConfig::GenerateConfig( return config; } -} // namespace util +} // namespace util \ No newline at end of file diff --git a/util/auto_segmenter_config.h b/util/auto_segmenter_config.h index 9c974a7d..ba00a9a5 100644 --- a/util/auto_segmenter_config.h +++ b/util/auto_segmenter_config.h @@ -19,6 +19,11 @@ class AutoSegmenterConfig { // primary_script: an optional name of a script or language frequency data // file (e.g., "Script_cyrillic", "Language_fr"). // Defaults to "Script_latin" if not provided. + // + // quality_level: ranges from 1-8, sets the segmenting time to segmentation + // quality tradeoff. Lower values have shorter segmenting times, + // higher values have longer segmenting times but typically result + // in better segmentation quality. 
static absl::StatusOr GenerateConfig( hb_face_t* face, std::optional primary_script = std::nullopt, diff --git a/util/closure_glyph_keyed_segmenter_util.cc b/util/closure_glyph_keyed_segmenter_util.cc index fabe08af..3a26982e 100644 --- a/util/closure_glyph_keyed_segmenter_util.cc +++ b/util/closure_glyph_keyed_segmenter_util.cc @@ -23,6 +23,7 @@ #include "ift/encoder/merge_strategy.h" #include "ift/encoder/subset_definition.h" #include "ift/freq/unicode_frequencies.h" +#include "util/auto_config_flags.h" #include "util/auto_segmenter_config.h" #include "util/load_codepoints.h" #include "util/segmentation_plan.pb.h" @@ -39,20 +40,11 @@ ABSL_FLAG(std::string, input_font, "in.ttf", "Name of the font to convert to IFT."); ABSL_FLAG( - std::string, config, "config.textpb", + std::string, config, "auto", "Path to a text proto file containing the configuration for the segmenter. " - "Should contain a single SegmenterConfig message."); - -ABSL_FLAG(int, auto_config_quality, 0, - "The quality level to use when auto_config is enabled. A value of 0 means auto pick. Valid values are 1-8."); - -ABSL_FLAG(bool, auto_config, false, - "If set the segmenter configuration will be automatically generated " - "based on the input font."); - -ABSL_FLAG(std::string, primary_script, "Script_latin", - "When auto_config is enabled this sets the primary script or " - "language frequency data file to use."); + "Should contain a single SegmenterConfig message. 
If set to \"auto\", then " + "segmenter configuration will be automatically generated " + "based on the input font."); ABSL_FLAG(bool, output_segmentation_plan, false, "If set a segmentation plan representing the determined segmentation " @@ -97,13 +89,13 @@ using util::AutoSegmenterConfig; using util::SegmenterConfigUtil; static StatusOr LoadConfig(hb_face_t* font) { - if (absl::GetFlag(FLAGS_auto_config)) { + if (absl::GetFlag(FLAGS_config) == "auto") { std::optional quality_level = std::nullopt; if (absl::GetFlag(FLAGS_auto_config_quality) > 0) { quality_level = absl::GetFlag(FLAGS_auto_config_quality); } return AutoSegmenterConfig::GenerateConfig( - font, absl::GetFlag(FLAGS_primary_script), quality_level); + font, absl::GetFlag(FLAGS_auto_config_primary_script), quality_level); } FontData config_text = @@ -155,39 +147,6 @@ static Status Analysis(hb_face_t* font, return absl::OkStatus(); } -static void AddTableKeyedSegments( - SegmentationPlan& plan, - const btree_map& merge_groups, - const std::vector& segments, - const SubsetDefinition& init_segment) { - std::vector table_keyed_segments; - for (const auto& [segment_ids, _] : merge_groups) { - SubsetDefinition new_segment; - for (uint32_t s : segment_ids) { - new_segment.Union(segments.at(s)); - } - new_segment.Subtract(init_segment); - table_keyed_segments.push_back(new_segment); - } - - uint32_t max_id = 0; - for (const auto& [id, _] : plan.segments()) { - if (id > max_id) { - max_id = id; - } - } - - uint32_t next_id = max_id + 1; - auto* plan_segments = plan.mutable_segments(); - for (const SubsetDefinition& def : table_keyed_segments) { - GlyphSegmentation::SubsetDefinitionToSegment(def, - (*plan_segments)[next_id]); - SegmentsProto* segment_ids = plan.add_non_glyph_segments(); - segment_ids->add_values(next_id); - next_id++; - } -} - static Status OutputFallbackGlyphCount(hb_face_t* original_face, const ClosureGlyphSegmenter& segmenter, const GlyphSegmentation& segmentation) { @@ -216,49 +175,29 @@ 
static Status Main(const std::vector args) { SegmenterConfig config = TRY(LoadConfig(font.get())); SegmenterConfigUtil config_util( - absl::GetFlag(FLAGS_auto_config) ? "" : absl::GetFlag(FLAGS_config)); - - CodepointSet font_codepoints = FontHelper::ToCodepointsSet(font.get()); - btree_set font_features = FontHelper::GetFeatureTags(font.get()); - SubsetDefinition init_segment = - config_util.SegmentProtoToSubsetDefinition(config.initial_segment()); - - std::vector segments; - btree_map merge_groups = - TRY(config_util.ConfigToMergeGroups(config, font_codepoints, - font_features, segments)); - - ClosureGlyphSegmenter segmenter( - config.brotli_quality(), config.brotli_quality_for_initial_font_merging(), - config.unmapped_glyph_handling(), config.condition_analysis_mode()); + (absl::GetFlag(FLAGS_config) == "auto") ? "" : absl::GetFlag(FLAGS_config)); auto start_time = std::chrono::high_resolution_clock::now(); - GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments( - font.get(), init_segment, segments, merge_groups)); + auto result = TRY(config_util.RunSegmenter(font.get(), config)); auto end_time = std::chrono::high_resolution_clock::now(); std::chrono::duration duration = end_time - start_time; std::cerr << "CodepointToGlyphSegments took: " << duration.count() << " seconds" << std::endl; + GlyphSegmentation segmentation = std::move(result.segmentation); + SegmentationPlan plan = std::move(result.plan); + if (absl::GetFlag(FLAGS_output_segmentation_plan)) { - SegmentationPlan plan = segmentation.ToSegmentationPlanProto(); if (!absl::GetFlag(FLAGS_include_initial_codepoints_in_config)) { // Requested to not include init codepoints in the generated config. 
plan.clear_initial_codepoints(); } - if (config.generate_table_keyed_segments()) { - AddTableKeyedSegments(plan, merge_groups, segments, init_segment); - } - - SegmentationPlan combined = config.base_segmentation_plan(); - combined.MergeFrom(plan); - // TODO(garretrieger): assign a basic (single segment) table keyed config. // Later on the input to this util should include information on how the // segments should be grouped together for the table keyed portion of the // font. std::string config_string; - TextFormat::PrintToString(combined, &config_string); + TextFormat::PrintToString(plan, &config_string); std::cout << config_string; } else { // No config requested, just output a simplified plain text representation @@ -267,6 +206,9 @@ static Status Main(const std::vector args) { } if (absl::GetFlag(FLAGS_output_fallback_glyph_count)) { + ClosureGlyphSegmenter segmenter( + config.brotli_quality(), config.brotli_quality_for_initial_font_merging(), + config.unmapped_glyph_handling(), config.condition_analysis_mode()); TRYV(OutputFallbackGlyphCount(font.get(), segmenter, segmentation)); } @@ -275,7 +217,7 @@ static Status Main(const std::vector args) { } std::cerr << ">> Analysis" << std::endl; - return Analysis(font.get(), merge_groups, segmentation); + return Analysis(font.get(), result.merge_groups, segmentation); } int main(int argc, char** argv) { diff --git a/util/font2ift.cc b/util/font2ift.cc index eb2564cf..6c543cb9 100644 --- a/util/font2ift.cc +++ b/util/font2ift.cc @@ -9,6 +9,8 @@ #include "absl/container/flat_hash_map.h" #include "absl/flags/flag.h" #include "absl/flags/parse.h" +#include "absl/log/globals.h" +#include "absl/log/initialize.h" #include "absl/status/statusor.h" #include "absl/strings/str_cat.h" #include "common/axis_range.h" @@ -21,23 +23,30 @@ #include "ift/encoder/compiler.h" #include "ift/encoder/glyph_segmentation.h" #include "ift/encoder/subset_definition.h" +#include "util/auto_config_flags.h" +#include "util/auto_segmenter_config.h" 
#include "util/load_codepoints.h" #include "util/segmentation_plan.pb.h" +#include "util/segmenter_config.pb.h" +#include "util/segmenter_config_util.h" /* - * Utility that converts a standard font file into an IFT font file following a + * Utility that converts a standard font file into an IFT font file optionally following a * supplied segmentation plan. * * Configuration is provided as a textproto file following the * segmentation_plan.proto schema. + * + * If no configuration is supplied it will be auto generated. */ ABSL_FLAG(std::string, input_font, "in.ttf", "Name of the font to convert to IFT."); -ABSL_FLAG(std::string, plan, "", +ABSL_FLAG(std::string, plan, "auto", "Path to a plan file which is a textproto following the " - "segmentation_plan.proto schema."); + "segmentation_plan.proto schema. If set to \"auto\", then " + "segmentation plan will be automatically generated."); ABSL_FLAG(std::string, output_path, "./", "Path to write output files under (base font and patches)."); @@ -50,6 +59,10 @@ ABSL_FLAG(bool, woff2_encode, true, "in woff2 will be disabled when necessary to keep the woff2 encoding " "compatible with IFT."); +ABSL_FLAG( + int, verbosity, 0, + "Log verbosity level. 0 is least verbose, higher values are more verbose."); + using absl::btree_set; using absl::flat_hash_map; using absl::Status; @@ -68,6 +81,7 @@ using ift::encoder::Compiler; using ift::encoder::design_space_t; using ift::encoder::GlyphSegmentation; using ift::encoder::SubsetDefinition; +using util::AutoSegmenterConfig; // TODO(garretrieger): add check that all glyph patches have at least one // activation condition. 
@@ -258,22 +272,44 @@ Status ConfigureCompiler(SegmentationPlan plan, Compiler& compiler) { return absl::OkStatus(); } -int main(int argc, char** argv) { - auto args = absl::ParseCommandLine(argc, argv); +StatusOr CreateSegmentationPlan(hb_face_t* font) { + SegmentationPlan plan; + if (absl::GetFlag(FLAGS_plan).empty() || absl::GetFlag(FLAGS_plan) == "auto") { + std::cerr << ">> auto generating segmentation plan:" << std::endl; + std::optional quality_level = std::nullopt; + if (absl::GetFlag(FLAGS_auto_config_quality) > 0) { + quality_level = absl::GetFlag(FLAGS_auto_config_quality); + } + auto config = AutoSegmenterConfig::GenerateConfig( + font, absl::GetFlag(FLAGS_auto_config_primary_script), quality_level); + if (!config.ok()) { + return absl::InternalError(StrCat("Failed to generate config: ", config.status().message())); + } + util::SegmenterConfigUtil config_util(""); + auto result = config_util.RunSegmenter(font, *config); + if (!result.ok()) { + return absl::InternalError(StrCat("Failed to run segmenter: ", result.status().message())); + } + plan = std::move(result->plan); + } else { + auto config_text = util::LoadFile(absl::GetFlag(FLAGS_plan).c_str()); + if (!config_text.ok()) { + return absl::InternalError(StrCat("Failed to load config file: ", config_text.status().message())); + } - auto config_text = util::LoadFile(absl::GetFlag(FLAGS_plan).c_str()); - if (!config_text.ok()) { - std::cerr << "Failed to load config file: " << config_text.status() - << std::endl; - return -1; + if (!google::protobuf::TextFormat::ParseFromString(config_text->str(), + &plan)) { + return absl::InternalError("Failed to parse input config."); + } } + return plan; +} - SegmentationPlan plan; - if (!google::protobuf::TextFormat::ParseFromString(config_text->str(), - &plan)) { - std::cerr << "Failed to parse input config." 
<< std::endl; - return -1; - } +int main(int argc, char** argv) { + absl::SetStderrThreshold(absl::LogSeverityAtLeast::kInfo); + auto args = absl::ParseCommandLine(argc, argv); + absl::SetGlobalVLogLevel(absl::GetFlag(FLAGS_verbosity)); + absl::InitializeLog(); auto font = load_font(absl::GetFlag(FLAGS_input_font).c_str()); if (!font.ok()) { @@ -281,10 +317,16 @@ int main(int argc, char** argv) { return -1; } + auto plan = CreateSegmentationPlan(font->get()); + if (!plan.ok()) { + std::cerr << plan.status().message() << std::endl; + return -1; + } + Compiler compiler; compiler.SetFace(font->get()); - auto sc = ConfigureCompiler(plan, compiler); + auto sc = ConfigureCompiler(*plan, compiler); if (!sc.ok()) { std::cerr << "Failed to apply configuration to the encoder: " << sc << std::endl; diff --git a/util/segmenter_config_util.cc b/util/segmenter_config_util.cc index 0da4fe6d..b3320f21 100644 --- a/util/segmenter_config_util.cc +++ b/util/segmenter_config_util.cc @@ -2,8 +2,11 @@ #include +#include "common/font_helper.h" #include "common/int_set.h" #include "common/try.h" +#include "ift/encoder/closure_glyph_segmenter.h" +#include "ift/encoder/glyph_segmentation.h" #include "ift/encoder/merge_strategy.h" #include "ift/encoder/subset_definition.h" #include "ift/feature_registry/feature_registry.h" @@ -20,6 +23,8 @@ using ift::encoder::MergeStrategy; using ift::encoder::SubsetDefinition; using ift::feature_registry::DefaultFeatureTags; using ift::freq::UnicodeFrequencies; +using ift::encoder::ClosureGlyphSegmenter; +using ift::encoder::GlyphSegmentation; namespace util { @@ -277,4 +282,39 @@ SegmenterConfigUtil::ConfigToMergeGroups( return merge_groups; } +StatusOr SegmenterConfigUtil::RunSegmenter( + hb_face_t* face, const SegmenterConfig& config) { + CodepointSet font_codepoints = common::FontHelper::ToCodepointsSet(face); + btree_set font_features = common::FontHelper::GetFeatureTags(face); + SubsetDefinition init_segment = +
SegmentProtoToSubsetDefinition(config.initial_segment()); + + std::vector segments; + btree_map merge_groups = + TRY(ConfigToMergeGroups(config, font_codepoints, font_features, segments)); + + ClosureGlyphSegmenter segmenter( + config.brotli_quality(), config.brotli_quality_for_initial_font_merging(), + config.unmapped_glyph_handling(), config.condition_analysis_mode()); + + GlyphSegmentation segmentation = TRY(segmenter.CodepointToGlyphSegments( + face, init_segment, segments, merge_groups)); + + SegmentationPlan plan = segmentation.ToSegmentationPlanProto(); + + if (config.generate_table_keyed_segments()) { + ClosureGlyphSegmenter::AddTableKeyedSegments( + plan, merge_groups, segments, init_segment); + } + + SegmentationPlan combined = config.base_segmentation_plan(); + combined.MergeFrom(plan); + + return SegmentationResult{ + std::move(segmentation), + std::move(combined), + std::move(merge_groups), + }; +} + } // namespace util \ No newline at end of file diff --git a/util/segmenter_config_util.h b/util/segmenter_config_util.h index 6eb6d332..6c9ce50d 100644 --- a/util/segmenter_config_util.h +++ b/util/segmenter_config_util.h @@ -4,17 +4,29 @@ #include "absl/container/btree_map.h" #include "absl/status/statusor.h" #include "common/int_set.h" +#include "hb.h" +#include "ift/encoder/glyph_segmentation.h" #include "ift/encoder/merge_strategy.h" #include "ift/encoder/subset_definition.h" +#include "util/segmentation_plan.pb.h" #include "util/segmenter_config.pb.h" namespace util { +struct SegmentationResult { + ift::encoder::GlyphSegmentation segmentation; + SegmentationPlan plan; + absl::btree_map merge_groups; +}; + class SegmenterConfigUtil { public: SegmenterConfigUtil(std::string config_file_path) : config_file_path_(config_file_path) {} + absl::StatusOr RunSegmenter( + hb_face_t* face, const SegmenterConfig& config); + ift::encoder::SubsetDefinition SegmentProtoToSubsetDefinition( const SegmentProto& segment);