unicode-org · eggrobin · Apr 20, 2026 · Apr 14, 2026 · Apr 15, 2026 · Apr 20, 2026
diff --git a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java
@@ -9,12 +9,14 @@
 import com.ibm.icu.text.Transliterator;
 import com.ibm.icu.text.UnicodeSet;
 import java.io.BufferedReader;
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Base64;
 import java.util.Comparator;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -26,6 +28,7 @@
 import java.util.TreeMap;
 import java.util.stream.Collectors;
 import java.util.stream.StreamSupport;
+import java.util.zip.DeflaterOutputStream;
 import org.unicode.props.IndexUnicodeProperties;
 import org.unicode.props.UcdProperty;
 import org.unicode.props.UcdPropertyValues;
@@ -38,6 +41,8 @@
 
 public class Indexer {
 
+    private static final char RECORD_SEPARATOR = 0x001E; // Ends each StringIndexer record.
+
     private static Transliterator toHTML;
     private static String htmlRulesControls;
 
@@ -82,6 +87,27 @@ public class Indexer {
 
     private static int maxRSEntryCharacters = 0;
 
+    /** Interns strings in a single buffer; a string's index is the offset where it starts. */
+    private static class StringIndexer {
+        private final HashMap<String, Integer> offsets = new HashMap<>();
+        private final StringBuilder buffer = new StringBuilder();
+
+        public StringIndexer() {}
+
+        public int getStringIndex(String s) {
+            final Integer known = offsets.putIfAbsent(s, buffer.length());
+            if (known == null) { // First request for s: store it, then its terminator.
+                buffer.append(s).append(RECORD_SEPARATOR);
+            }
+            return known == null ? offsets.get(s) : known;
+        }
+
+        @Override
+        public String toString() {
+            return buffer.toString();
+        }
+    }
+
     static {
         String baseRules =
                 "'<' > '&lt;' ;"
@@ -113,8 +139,8 @@ public class Indexer {
     }
 
     private static class IndexEntry {
-        IndexEntry(String snippet, UnicodeProperty property) {
-            this.snippet = snippet;
+        IndexEntry(int snippetIndex, UnicodeProperty property) {
+            this.snippetIndex = snippetIndex;
             this.property = property;
             characters = new UnicodeSet();
         }
@@ -127,12 +153,12 @@ List<IndexSubEntry> subEntries() {
                         /* showName= */ property != NAME,
                         characters);
             } catch (Exception e) {
-                System.err.println("In entry for " + property.getName() + ": " + snippet);
+                System.err.println("In entry for " + property.getName() + ": " + snippetIndex);
                 throw e;
             }
         }
 
-        String snippet;
+        int snippetIndex;
         UnicodeProperty property;
         UnicodeSet characters;
         Map<String, UnicodeSet> relatedCharacters = new TreeMap<>();
@@ -258,11 +284,13 @@ public int compare(UnicodeProperty left, UnicodeProperty right) {
                 return left.getName().compareTo(right.getName());
             }
         }
-        // Property to property value to index entry.
-        Map<UnicodeProperty, Map<String, IndexEntry>> indexEntries =
+        final var allTheStrings = new StringIndexer();
+        // Property to snippet based on property value (as an index in allTheStrings) to index
+        // entry.
+        Map<UnicodeProperty, Map<Integer, IndexEntry>> indexEntries =
                 new TreeMap<>(new PropertyComparator());
-        // Lemma to snippet to position of the word in the snippet.
-        Map<String, Map<String, Integer>> wordIndex = new TreeMap<>();
+        // Lemma to snippet (as an index in allTheStrings) to position of the word in the snippet.
+        Map<String, Map<Integer, Integer>> wordIndex = new TreeMap<>();
         final var properties =
                 List.of(
                         BLOCK,
@@ -292,10 +320,9 @@ public int compare(UnicodeProperty left, UnicodeProperty right) {
                     } else if (prop == NAME) {
                         snippet = snippet.replace(Utility.hex(cp), "#");
                     }
-                    // Copy the snippet to a final variable for use in the λ.
-                    final String indexSnippet = snippet;
+                    final int snippetIndex = allTheStrings.getStringIndex(snippet);
                     propertyIndex
-                            .computeIfAbsent(snippet, k -> new IndexEntry(indexSnippet, prop))
+                            .computeIfAbsent(snippetIndex, k -> new IndexEntry(k, prop))
                             .characters
                             .add(cp);
                     // Override word breaking of ' and - in appropriate contexts so that
@@ -313,11 +340,11 @@ public int compare(UnicodeProperty left, UnicodeProperty right) {
                             String lemma = lemmatize(word);
                             wordIndex
                                     .computeIfAbsent(fold(word), k -> new TreeMap<>())
-                                    .putIfAbsent(snippet, start);
+                                    .putIfAbsent(snippetIndex, start);
                             if (!lemma.equals(fold(word))) {
                                 wordIndex
                                         .computeIfAbsent(lemma, k -> new TreeMap<>())
-                                        .putIfAbsent(snippet, start);
+                                        .putIfAbsent(snippetIndex, start);
                             }
                         }
                     }
@@ -327,18 +354,20 @@ public int compare(UnicodeProperty left, UnicodeProperty right) {
                 System.out.println("Indexed plane " + cp / 0x10000);
             }
         }
+        final int bettyIndex = allTheStrings.getStringIndex("Betty");
+        final int theIndex = allTheStrings.getStringIndex("the");
         indexEntries
                 .get(BLOCK)
-                .computeIfAbsent("Betty", k -> new IndexEntry(k, BLOCK))
+                .computeIfAbsent(bettyIndex, k -> new IndexEntry(k, BLOCK))
                 .characters
                 .add(BOOP);
         indexEntries
                 .get(BLOCK)
-                .computeIfAbsent("the", k -> new IndexEntry(k, BLOCK))
+                .computeIfAbsent(theIndex, k -> new IndexEntry(k, BLOCK))
                 .characters
                 .add(DOOD);
-        wordIndex.computeIfAbsent("betty", k -> new TreeMap<>()).putIfAbsent("Betty", 0);
-        wordIndex.computeIfAbsent("the", k -> new TreeMap<>()).putIfAbsent("the", 0);
+        wordIndex.computeIfAbsent("betty", k -> new TreeMap<>()).putIfAbsent(bettyIndex, 0);
+        wordIndex.computeIfAbsent("the", k -> new TreeMap<>()).putIfAbsent(theIndex, 0);
 
         System.out.println("Radicals…");
         final var radicalSets = getRadicalSets();
@@ -379,61 +408,101 @@ public int compare(UnicodeProperty left, UnicodeProperty right) {
                 }
                 css.close();
             } else if (htmlLine.contains("JS HERE")) {
-                file.println("let wordIndex = new Map([");
+                // No pretty-printing in the loops that print these two maps; each space or newline
+                // here enlarges charindex.html by hundreds of kilobytes.  These are not suitable
+                // for human consumption anyway, since anything readable is turned into indices in
+                // allTheStrings.
+                file.print("let wordIndex = new Map([");
                 System.out.println("wordIndex...");
                 {
                     int i = 0;
                     for (var wordAndSnippets : wordIndex.entrySet()) {
                         if (++i % 1000 == 0) {
                             System.out.println(i + "/" + wordIndex.size() + "...");
                         }
-                        file.println(
-                                "    ['"
+                        file.print(
+                                "['"
                                         + wordAndSnippets.getKey().replace("'", "\\'")
-                                        + "', new Map([");
-                        for (var snippetAndPosition : wordAndSnippets.getValue().entrySet()) {
-                            file.println(
-                                    "      ['"
-                                            + snippetAndPosition.getKey().replace("'", "\\'")
-                                            + "', "
-                                            + snippetAndPosition.getValue()
-                                            + "],");
-                        }
-                        file.println("])],");
+                                        + "',new Map([");
+                        // Stream and collect for the innermost map to avoid trailing commas, for
+                        // size.
+                        file.print(
+                                wordAndSnippets.getValue().entrySet().stream()
+                                        .map(
+                                                snippetAndPosition ->
+                                                        "["
+                                                                + snippetAndPosition.getKey()
+                                                                + ","
+                                                                + snippetAndPosition.getValue()
+                                                                + "]")
+                                        .collect(Collectors.joining(",")));
+                        file.print("])],");
                     }
                 }
                 file.println("]);");
                 System.out.println("indexEntries...");
-                file.println("let indexEntries = new Map([");
+                file.print("let indexEntries = new Map([");
                 for (var property : properties) {
                     System.out.println(property.getName() + "...");
                     final var propertyIndex = indexEntries.get(property);
-                    file.println("  ['" + property.getName() + "', new Map([");
+                    file.print("['" + property.getName() + "',new Map([");
                     int i = 0;
                     for (var indexEntry : propertyIndex.values()) {
                         if (++i % 1000 == 0) {
                             System.out.println(i + "/" + propertyIndex.size() + "...");
                         }
-                        file.println("    ['" + indexEntry.snippet.replace("'", "\\'") + "', {");
-                        file.println(
-                                "       html: \""
-                                        + indexEntry.toHTML().replace("\"", "\\\"")
-                                        + "\",");
-                        file.println("       characters: [");
-                        for (var range : indexEntry.coveredCharacters().ranges()) {
-                            file.println(
-                                    "         [0x"
-                                            + Utility.hex(range.codepoint)
-                                            + ", 0x"
-                                            + Utility.hex(range.codepointEnd)
-                                            + "],");
-                        }
-                        file.println("      ],");
-                        file.println("    }],");
+                        final int htmlIndex = allTheStrings.getStringIndex(indexEntry.toHTML());
+                        file.print("[" + indexEntry.snippetIndex + ",{");
+                        file.print("html:" + htmlIndex + ",");
+                        file.print("characters:[");
+                        // Stream and collect for the innermost array to avoid trailing commas, for
+                        // size.
+                        file.print(
+                                indexEntry
+                                        .coveredCharacters()
+                                        .rangeStream()
+                                        .map(
+                                                range ->
+                                                        // Code points in decimal without
+                                                        // zero-padding for size.
+                                                        "["
+                                                                + range.codepoint
+                                                                + (range.codepointEnd
+                                                                                != range.codepoint
+                                                                        ? "," + range.codepointEnd
+                                                                        : "")
+                                                                + "]")
+                                        .collect(Collectors.joining(",")));
+                        file.print("]}],");
                     }
-                    file.println("  ])],");
+                    file.print("])],");
                 }
                 file.println("]);");
+                file.println("let bettyIndex = " + bettyIndex + ";");
+                file.println("let theIndex = " + theIndex + ";");
+                final var compressed = new ByteArrayOutputStream();
+                final var compressor = new DeflaterOutputStream(compressed);
+                final var uncompressed = allTheStrings.toString().getBytes("UTF-8");
+                compressor.write(uncompressed);
+                compressor.close();
+                final var compressedBytes = compressed.toByteArray();
+                System.out.println(
+                        "Strings compressed from "
+                                + (uncompressed.length >> 20)
+                                + " MiB to "
+                                + (compressedBytes.length >> 10)
+                                + " kiB ("
+                                + 100 * compressedBytes.length / uncompressed.length
+                                + "%)");
+                System.out.println(
+                        "Compressed payload is "
+                                + compressedBytes.length
+                                + " bytes, first byte is "
+                                + Byte.toUnsignedInt(compressedBytes[0]));
+                file.println(
+                        "let allTheStringsCompressed = '"
+                                + Base64.getEncoder().encodeToString(compressedBytes)
+                                + "'");
                 final var js =
                         new BufferedReader(new FileReader(new File(resources + "charindex.js")));
                 for (String jsLine = js.readLine(); jsLine != null; jsLine = js.readLine()) {