From ada735e4f3929ddbaaf57ef3117490a45ca7f599 Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 15 Apr 2026 01:04:33 +0200 Subject: [PATCH 1/3] A more compact charindex --- .../java/org/unicode/text/tools/Indexer.java | 153 ++++++++++++------ .../org/unicode/text/tools/charindex.js | 104 +++++++----- 2 files changed, 167 insertions(+), 90 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java index eddfa21e6..26a7cfcbb 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java @@ -9,12 +9,14 @@ import com.ibm.icu.text.Transliterator; import com.ibm.icu.text.UnicodeSet; import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileReader; import java.io.IOException; import java.io.PrintStream; import java.util.ArrayList; import java.util.Arrays; +import java.util.Base64; import java.util.Comparator; import java.util.HashMap; import java.util.HashSet; @@ -26,6 +28,7 @@ import java.util.TreeMap; import java.util.stream.Collectors; import java.util.stream.StreamSupport; +import java.util.zip.DeflaterOutputStream; import org.unicode.props.IndexUnicodeProperties; import org.unicode.props.UcdProperty; import org.unicode.props.UcdPropertyValues; @@ -38,6 +41,8 @@ public class Indexer { + private static final char RECORD_SEPARATOR = 0x001E; + private static Transliterator toHTML; private static String htmlRulesControls; @@ -113,8 +118,8 @@ public class Indexer { } private static class IndexEntry { - IndexEntry(String snippet, UnicodeProperty property) { - this.snippet = snippet; + IndexEntry(int snippetIndex, UnicodeProperty property) { + this.snippetIndex = snippetIndex; this.property = property; characters = new UnicodeSet(); } @@ -127,12 +132,12 @@ List subEntries() { /* showName= */ property != NAME, characters); } catch (Exception e) { - System.err.println("In entry for " + property.getName() + ": " + snippet); + System.err.println("In entry for " + property.getName() + ": " + snippetIndex); throw e; } } - String snippet; + int snippetIndex; UnicodeProperty property; UnicodeSet characters; Map relatedCharacters = new TreeMap<>(); @@ -258,11 +263,14 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { return left.getName().compareTo(right.getName()); } } - // Property to property value to index entry. - Map> indexEntries = + final StringBuilder allTheStrings = new StringBuilder(); + final HashMap stringIndices = new HashMap<>(); + // Property to snippet based on property value (as an index in allTheStrings) to index + // entry. + Map> indexEntries = new TreeMap<>(new PropertyComparator()); - // Lemma to snippet to position of the word in the snippet. - Map> wordIndex = new TreeMap<>(); + // Lemma to snippet (as an index in allTheStrings) to position of the word in the snippet. + Map> wordIndex = new TreeMap<>(); final var properties = List.of( BLOCK, @@ -292,10 +300,14 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { } else if (prop == NAME) { snippet = snippet.replace(Utility.hex(cp), "#"); } - // Copy the snippet to a final variable for use in the λ. - final String indexSnippet = snippet; + final int snippetIndex = + stringIndices.getOrDefault(snippet, allTheStrings.length()); + if (snippetIndex == allTheStrings.length()) { + allTheStrings.append(snippet).append(RECORD_SEPARATOR); + stringIndices.put(snippet, snippetIndex); + } propertyIndex - .computeIfAbsent(snippet, k -> new IndexEntry(indexSnippet, prop)) + .computeIfAbsent(snippetIndex, k -> new IndexEntry(k, prop)) .characters .add(cp); // Override word breaking of ' and - in appropriate contexts so that @@ -313,11 +325,11 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { String lemma = lemmatize(word); wordIndex .computeIfAbsent(fold(word), k -> new TreeMap<>()) - .putIfAbsent(snippet, start); + .putIfAbsent(snippetIndex, start); if (!lemma.equals(fold(word))) { wordIndex .computeIfAbsent(lemma, k -> new TreeMap<>()) - .putIfAbsent(snippet, start); + .putIfAbsent(snippetIndex, start); } } } @@ -327,18 +339,22 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { System.out.println("Indexed plane " + cp / 0x10000); } } + final int bettyIndex = allTheStrings.length(); + allTheStrings.append("Betty").append(RECORD_SEPARATOR); + final int theIndex = allTheStrings.length(); + allTheStrings.append("the").append(RECORD_SEPARATOR); indexEntries .get(BLOCK) - .computeIfAbsent("Betty", k -> new IndexEntry(k, BLOCK)) + .computeIfAbsent(bettyIndex, k -> new IndexEntry(k, BLOCK)) .characters .add(BOOP); indexEntries .get(BLOCK) - .computeIfAbsent("the", k -> new IndexEntry(k, BLOCK)) + .computeIfAbsent(theIndex, k -> new IndexEntry(k, BLOCK)) .characters .add(DOOD); - wordIndex.computeIfAbsent("betty", k -> new TreeMap<>()).putIfAbsent("Betty", 0); - wordIndex.computeIfAbsent("the", k -> new TreeMap<>()).putIfAbsent("the", 0); + wordIndex.computeIfAbsent("betty", k -> new TreeMap<>()).putIfAbsent(bettyIndex, 0); + wordIndex.computeIfAbsent("the", k -> new TreeMap<>()).putIfAbsent(theIndex, 0); System.out.println("Radicals…"); final var radicalSets = getRadicalSets(); @@ -379,7 +395,11 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { } css.close(); } else if (htmlLine.contains("JS HERE")) { - file.println("let wordIndex = new Map(["); + // No pretty-printing in the loops that print these two maps; each space or newline + // here enlarges charindex.html by hundreds of kilobytes. These are not suitable + // for human consumption anyway, since anything readable is turned into indices in + // allTheStrings. + file.print("let wordIndex = new Map(["); System.out.println("wordIndex..."); { int i = 0; @@ -387,53 +407,88 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { if (++i % 1000 == 0) { System.out.println(i + "/" + wordIndex.size() + "..."); } - file.println( - " ['" + file.print( + "['" + wordAndSnippets.getKey().replace("'", "\\'") - + "', new Map(["); - for (var snippetAndPosition : wordAndSnippets.getValue().entrySet()) { - file.println( - " ['" - + snippetAndPosition.getKey().replace("'", "\\'") - + "', " - + snippetAndPosition.getValue() - + "],"); - } - file.println("])],"); + + "',new Map(["); + // Stream and collect for the innermost map to avoid trailing commas, for + // size. + file.print( + wordAndSnippets.getValue().entrySet().stream() + .map( + snippetAndPosition -> + "[" + + snippetAndPosition.getKey() + + "," + + snippetAndPosition.getValue() + + "]") + .collect(Collectors.joining(","))); + file.print("])],"); } } file.println("]);"); System.out.println("indexEntries..."); - file.println("let indexEntries = new Map(["); + file.print("let indexEntries = new Map(["); for (var property : properties) { System.out.println(property.getName() + "..."); final var propertyIndex = indexEntries.get(property); - file.println(" ['" + property.getName() + "', new Map(["); + file.print("['" + property.getName() + "',new Map(["); int i = 0; for (var indexEntry : propertyIndex.values()) { if (++i % 1000 == 0) { System.out.println(i + "/" + propertyIndex.size() + "..."); } - file.println(" ['" + indexEntry.snippet.replace("'", "\\'") + "', {"); - file.println( - " html: \"" - + indexEntry.toHTML().replace("\"", "\\\"") - + "\","); - file.println(" characters: ["); - for (var range : indexEntry.coveredCharacters().ranges()) { - file.println( - " [0x" - + Utility.hex(range.codepoint) - + ", 0x" - + Utility.hex(range.codepointEnd) - + "],"); - } - file.println(" ],"); - file.println(" }],"); + final int htmlIndex = allTheStrings.length(); + allTheStrings.append(indexEntry.toHTML()).append(RECORD_SEPARATOR); + file.print("[" + indexEntry.snippetIndex + ",{"); + file.print("html:" + htmlIndex + ","); + file.print("characters:["); + // Stream and collect for the innermost array to avoid trailing commas, for + // size. + file.print( + indexEntry + .coveredCharacters() + .rangeStream() + .map( + range -> + // Code points in decimal without + // zero-padding for size. + "[" + + range.codepoint + + "," + + range.codepointEnd + + "]") + .collect(Collectors.joining(","))); + file.print("]}],"); } - file.println(" ])],"); + file.print("])],"); } file.println("]);"); + file.println("let bettyIndex = " + bettyIndex + ";"); + file.println("let theIndex = " + theIndex + ";"); + final var compressed = new ByteArrayOutputStream(); + final var compressor = new DeflaterOutputStream(compressed); + final var uncompressed = allTheStrings.toString().getBytes("UTF-8"); + compressor.write(uncompressed); + compressor.close(); + final var compressedBytes = compressed.toByteArray(); + System.out.println( + "Strings compressed from " + + (uncompressed.length >> 20) + + " MiB to " + + (compressedBytes.length >> 10) + + " kiB (" + + 100 * compressedBytes.length / uncompressed.length + + "%)"); + System.out.println( + "Compressed payload is " + + compressedBytes.length + + " bytes, first byte is " + + Byte.toUnsignedInt(compressedBytes[0])); + file.println( + "let allTheStringsCompressed = '" + + Base64.getEncoder().encodeToString(compressedBytes) + + "'"); final var js = new BufferedReader(new FileReader(new File(resources + "charindex.js"))); for (String jsLine = js.readLine(); jsLine != null; jsLine = js.readLine()) { diff --git a/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js b/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js index 4adbcdf44..eca405322 100644 --- a/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js +++ b/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js @@ -1,15 +1,28 @@ -// Lemma to snippet to position of the word in the snippet. -/**@type {Map>}*/ +// Lemma to snippet (compressed) to position of the word in the snippet. +/**@type {Map>}*/ let wordIndex/*= GENERATED LINE*/; -// Property name to snippet to index entry. -/**@type {Map>}*/ +// Property name to snippet (compressed) to index entry; the html is compressed. +/**@type {Map>}*/ let indexEntries/*= GENERATED LINE*/; +/**@type {number}*/ +let bettyIndex/*= GENERATED LINE*/; +/**@type {number}*/ +let theIndex/*= GENERATED LINE*/; +/**@type {string}*/ +let allTheStringsCompressed/*= GENERATED LINE*/; +let decompressor = new DecompressionStream("deflate"); +/**@type {string}*/ +var allTheStrings; +new Response( + new Blob([Uint8Array.fromBase64(allTheStringsCompressed)]) + .stream().pipeThrough(decompressor)) + .text().then(s => allTheStrings = s); -/**@type {Map}*/ +/**@type {Map}*/ let characterNames = new Map(); -/**@type {Map<[number, number], {property: string, snippet: string}>}*/ +/**@type {Map<[number, number], {property: string, snippetIndex: number}>}*/ let radicalStrokeRanges = new Map(); -/**@type {Map<[number, number], string>}*/ +/**@type {Map<[number, number], number>}*/ let characterNameRanges = new Map(); let maxResults = 100; @@ -18,9 +31,9 @@ for (let [property, propertyIndex] of indexEntries) { if (!property.endsWith("RSUnicode") && property !== "kSEAL_Rad") { continue; } - for (let [snippet, entry] of propertyIndex) { + for (let [snippetIndex, entry] of propertyIndex) { for (let range of entry.characters) { - radicalStrokeRanges.set(range, {property, snippet}); + radicalStrokeRanges.set(range, {property, snippetIndex}); } } } @@ -40,6 +53,12 @@ for (let [name, entry] of indexEntries.get("Name_Alias")) { } } +function getString(/**@type {number}*/ start) { + let RECORD_SEPARATOR = "\x1E"; + let limit = allTheStrings.indexOf(RECORD_SEPARATOR, start); + return allTheStrings.substring(start, limit); +} + function updateQuery(event) { if(event.key === 'Enter') { let newURL = window.location.protocol + "//" + window.location.host + window.location.pathname @@ -76,47 +95,49 @@ function search(/**@type {string}*/ query) { var covered = []; /**@type {string[]}*/ var result = []; - /**@type {Set}*/ - var resultSnippets = new Set(wordIndex.get(foldedQuery[0])?.keys() ?? []); + /**@type {Set}*/ + var resultSnippetIndices = new Set(wordIndex.get(foldedQuery[0])?.keys() ?? []); let firstLemmata = [foldedQuery[0]]; - if (resultSnippets.size === 0 && foldedQuery.length == 1) { + if (resultSnippetIndices.size === 0 && foldedQuery.length == 1) { let prefix = fold(queryWords.at(-1)); - for (let [completion, leaves] of wordIndex) { + for (let [completion, snippets] of wordIndex) { if (completion.startsWith(prefix)) { firstLemmata.push(completion); - resultSnippets = resultSnippets.union(leaves); + resultSnippetIndices = resultSnippetIndices.union(snippets); } } } for (var i = 1; i < foldedQuery.length; ++i) { var rhs = new Set(wordIndex.get(foldedQuery[i])?.keys() ?? []); - let intersection = resultSnippets.intersection(rhs); + let intersection = resultSnippetIndices.intersection(rhs); if (intersection.size === 0 && i == foldedQuery.length - 1) { let prefix = fold(queryWords.at(-1)); - for (let [completion, leaves] of wordIndex) { + for (let [completion, snippets] of wordIndex) { if (completion.startsWith(prefix)) { - rhs = rhs.union(leaves); + rhs = rhs.union(snippets); } } - resultSnippets = resultSnippets.intersection(rhs); + resultSnippetIndices = resultSnippetIndices.intersection(rhs); } else { - resultSnippets = intersection; + resultSnippetIndices = intersection; } } let pivots = firstLemmata.map(l => wordIndex.get(l)).filter(x => !!x); - let getPivot = (/**@type {string}*/s) => pivots.map(p => p.get(s)).filter(x => x !== undefined)[0]; + let getPivot = (/**@type {number}*/s) => pivots.map(p => p.get(s)).filter(x => x !== undefined)[0]; let collator = new Intl.Collator("en"); - resultSnippets = Array.from(resultSnippets).sort( + let sortKeys = new Map(Array.from(resultSnippetIndices).map( + i => { + let snippet = getString(i); + return [i, snippet.substring(getPivot(i)) + ' \uFFFE ' + + snippet.substring(0, getPivot(i))]; + })); + let sortedSnippetIndices = Array.from(resultSnippetIndices).sort( (left, right) => collator.compare( - left.substring(getPivot(left)) + - ' \uFFFE ' + - left.substring(0, getPivot(left)), - right.substring(getPivot(right)) + - ' \uFFFE ' + - right.substring(0, getPivot(right)))); + sortKeys.get(left), + sortKeys.get(right))); for (let propertyIndex of indexEntries.values()) { - for (let snippet of resultSnippets) { - let entry = propertyIndex.get(snippet); + for (let snippetIndex of sortedSnippetIndices) { + let entry = propertyIndex.get(snippetIndex); if (!entry) { continue; } @@ -126,9 +147,10 @@ function search(/**@type {string}*/ query) { } rangeCount += entrySet.length; covered = covered.concat(entrySet); - let pivot = getPivot(snippet); + let pivot = getPivot(snippetIndex); + let snippet = getString(snippetIndex); let tail = snippet.substring(pivot); - result.push(entry.html.replace( + result.push(getString(entry.html).replace( "[RESULT TEXT]", "" + @@ -156,17 +178,17 @@ function search(/**@type {string}*/ query) { var name = characterNames.get(cp); var rs = null; if (!name) { - for (let [[first, last], {property, snippet}] of radicalStrokeRanges) { + for (let [[first, last], {property, snippetIndex}] of radicalStrokeRanges) { if (first <= cp && cp <= last) { - rs = {property, snippet}; + rs = {property, snippetIndex}; break; } } if (rs) { - rangeCount += indexEntries.get(rs.property).get(rs.snippet).characters.length; + rangeCount += indexEntries.get(rs.property).get(rs.snippetIndex).characters.length; result.push( - indexEntries.get(rs.property).get(rs.snippet).html.replace( - "[RESULT TEXT]", toHTML(rs.snippet))); + getString(indexEntries.get(rs.property).get(rs.snippetIndex).html).replace( + "[RESULT TEXT]", toHTML(getString(rs.snippetIndex)))); } else { for (let [[first, last], n] of characterNameRanges) { if (first <= cp && cp <= last) { @@ -179,20 +201,20 @@ function search(/**@type {string}*/ query) { if (name) { rangeCount += 1; result.push( - (indexEntries.get("Name").get(name) ?? - indexEntries.get("Name_Alias").get(name)).html.replace( - "[RESULT TEXT]", toHTML(name))); + getString(indexEntries.get("Name").get(name) ?? + indexEntries.get("Name_Alias").get(name).html).replace( + "[RESULT TEXT]", toHTML(getString(name)))); } } if (/^boop$/i.test(query)) { rangeCount += 1; result.push( - indexEntries.get("Block").get("Betty").html.replace( + getString(indexEntries.get("Block").get(bettyIndex).html).replace( "[RESULT TEXT]", toHTML("Betty"))); } else if (/^dood$/i.test(query)) { rangeCount += 1; result.push( - indexEntries.get("Block").get("the").html.replace( + getString(indexEntries.get("Block").get(theIndex).html).replace( "[RESULT TEXT]", toHTML("the"))); } } From 78f09073f1c32e4acad048fe4c50e49b3444d7ee Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Wed, 15 Apr 2026 03:39:48 +0200 Subject: [PATCH 2/3] Save another 433 kiB --- .../java/org/unicode/text/tools/Indexer.java | 6 ++++-- .../org/unicode/text/tools/charindex.js | 21 +++++++++++-------- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java index 26a7cfcbb..81b7f745f 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java @@ -455,8 +455,10 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { // zero-padding for size. "[" + range.codepoint - + "," - + range.codepointEnd + + (range.codepointEnd + != range.codepoint + ? "," + range.codepointEnd + : "") + "]") .collect(Collectors.joining(","))); file.print("]}],"); diff --git a/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js b/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js index eca405322..3171fa324 100644 --- a/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js +++ b/unicodetools/src/main/resources/org/unicode/text/tools/charindex.js @@ -2,7 +2,7 @@ /**@type {Map>}*/ let wordIndex/*= GENERATED LINE*/; // Property name to snippet (compressed) to index entry; the html is compressed. -/**@type {Map>}*/ +/**@type {Map>}*/ let indexEntries/*= GENERATED LINE*/; /**@type {number}*/ let bettyIndex/*= GENERATED LINE*/; @@ -39,7 +39,7 @@ for (let [property, propertyIndex] of indexEntries) { } for (let [name, entry] of indexEntries.get("Name")) { - if (entry.characters[0][0] == entry.characters[0][1]) { + if (entry.characters[0][0] == entry.characters[0].at(-1)) { characterNames.set(entry.characters[0][0], name); } else { for (let range of entry.characters) { @@ -227,7 +227,8 @@ function toHTML(/**@type {string}*/ plain) { .replaceAll(">", ">") } -function superset(/**@type {[number, number][]}*/left, /**@type {[number, number][]}*/right) { +function superset(/**@type {([number, number]|[number])[]}*/left, + /**@type {([number, number]|[number])[]}*/right) { var remaining = right.slice(); for (containingRange of left) { remaining = remaining.flatMap(r => rangeMinus(r, containingRange)); @@ -238,7 +239,8 @@ function superset(/**@type {[number, number][]}*/left, /**@type {[number, number return true; } -function rangeMinus(/**@type {[number, number]}*/left, /**@type {[number, number]}*/right) { +function rangeMinus(/**@type {[number, number]|[number]}*/left, + /**@type {[number, number]|[number]}*/right) { let intersection = rangeIntersection(left, right); if (intersection === left || intersection === right) { return []; @@ -250,16 +252,17 @@ function rangeMinus(/**@type {[number, number]}*/left, /**@type {[number, number if (left[0] < intersection[0]) { result.push([left[0], intersection[0] - 1]); } - if (left[1] > intersection[1]) { - result.push([intersection[1] + 1, left[1] - 1]); + if (left.at(-1) > intersection.at(-1)) { + result.push([intersection.at(-1) + 1, left.at(-1) - 1]); } return result; } } -function rangeIntersection(/**@type {[number, number]}*/left, /**@type {[number, number]}*/right) { - let [leftStart, leftEnd] = left; - let [rightStart, rightEnd] = right; +function rangeIntersection(/**@type {[number, number]|[number]}*/left, + /**@type {[number, number]|[number]}*/right) { + let [leftStart, leftEnd] = [left[0], left.at(-1)]; + let [rightStart, rightEnd] = [right[0], right.at(-1)]; if (leftEnd < rightStart || rightEnd < leftStart) { return null; } else { From 07ad58533aba5f07b31db8ff25830ab949b494cb Mon Sep 17 00:00:00 2001 From: Robin Leroy Date: Mon, 20 Apr 2026 17:54:09 +0200 Subject: [PATCH 3/3] helper class --- .../java/org/unicode/text/tools/Indexer.java | 40 ++++++++++++------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java index 81b7f745f..712334b57 100644 --- a/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java +++ b/unicodetools/src/main/java/org/unicode/text/tools/Indexer.java @@ -87,6 +87,27 @@ public class Indexer { private static int maxRSEntryCharacters = 0; + private static class StringIndexer { + public StringIndexer() {} + + public int getStringIndex(String s) { + int result = stringIndices.getOrDefault(s, allTheStrings.length()); + if (result == allTheStrings.length()) { + allTheStrings.append(s).append(RECORD_SEPARATOR); + stringIndices.put(s, result); + } + return result; + } + + @Override + public String toString() { + return allTheStrings.toString(); + } + + private final HashMap stringIndices = new HashMap<>(); + private final StringBuilder allTheStrings = new StringBuilder(); + } + static { String baseRules = "'<' > '<' ;" @@ -263,8 +284,7 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { return left.getName().compareTo(right.getName()); } } - final StringBuilder allTheStrings = new StringBuilder(); - final HashMap stringIndices = new HashMap<>(); + final var allTheStrings = new StringIndexer(); // Property to snippet based on property value (as an index in allTheStrings) to index // entry. Map> indexEntries = @@ -300,12 +320,7 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { } else if (prop == NAME) { snippet = snippet.replace(Utility.hex(cp), "#"); } - final int snippetIndex = - stringIndices.getOrDefault(snippet, allTheStrings.length()); - if (snippetIndex == allTheStrings.length()) { - allTheStrings.append(snippet).append(RECORD_SEPARATOR); - stringIndices.put(snippet, snippetIndex); - } + final int snippetIndex = allTheStrings.getStringIndex(snippet); propertyIndex .computeIfAbsent(snippetIndex, k -> new IndexEntry(k, prop)) .characters @@ -339,10 +354,8 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { System.out.println("Indexed plane " + cp / 0x10000); } } - final int bettyIndex = allTheStrings.length(); - allTheStrings.append("Betty").append(RECORD_SEPARATOR); - final int theIndex = allTheStrings.length(); - allTheStrings.append("the").append(RECORD_SEPARATOR); + final int bettyIndex = allTheStrings.getStringIndex("Betty"); + final int theIndex = allTheStrings.getStringIndex("the"); indexEntries .get(BLOCK) .computeIfAbsent(bettyIndex, k -> new IndexEntry(k, BLOCK)) @@ -438,8 +451,7 @@ public int compare(UnicodeProperty left, UnicodeProperty right) { if (++i % 1000 == 0) { System.out.println(i + "/" + propertyIndex.size() + "..."); } - final int htmlIndex = allTheStrings.length(); - allTheStrings.append(indexEntry.toHTML()).append(RECORD_SEPARATOR); + final int htmlIndex = allTheStrings.getStringIndex(indexEntry.toHTML()); file.print("[" + indexEntry.snippetIndex + ",{"); file.print("html:" + htmlIndex + ","); file.print("characters:[");