-
Notifications
You must be signed in to change notification settings - Fork 35
Fix modified UTF-8 string encoding/decoding #42
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
PhilGlass
wants to merge
2
commits into
madisp:main
Choose a base branch
from
PhilGlass:fix_string_decoding
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,39 +24,25 @@ | |
| import com.google.common.primitives.UnsignedBytes; | ||
|
|
||
| import java.nio.ByteBuffer; | ||
| import java.nio.charset.Charset; | ||
|
|
||
| /** Provides utilities to decode/encode a String packed in an arsc resource file. */ | ||
| public final class ResourceString { | ||
|
|
||
| /** Type of {@link ResourceString} to encode / decode. */ | ||
| public enum Type { | ||
| UTF8(UTF_8), | ||
| UTF16(UTF_16LE); | ||
|
|
||
| private final Charset charset; | ||
|
|
||
| Type(Charset charset) { | ||
| this.charset = charset; | ||
| } | ||
|
|
||
| public Charset charset() { | ||
| return charset; | ||
| } | ||
| UTF8, UTF16 | ||
| } | ||
|
|
||
| private ResourceString() {} // Private constructor | ||
|
|
||
| /** | ||
| * Given a buffer and an offset into the buffer, returns a String. The {@code offset} is the | ||
| * 0-based byte offset from the start of the buffer where the string resides. This should be the | ||
| * location in memory where the string's character count, followed by its byte count, and then | ||
| * followed by the actual string is located. | ||
| * 0-based byte offset from the start of the buffer where the string resides. How this data is | ||
| * interpreted depends on the string's {@code type}. | ||
|
Comment on lines
-52
to
+41
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. These comments were only correct for UTF-8 strings. |
||
| * | ||
| * <p>Here's an example UTF-8-encoded string of ab©: | ||
| * <pre> | ||
| * 03 04 61 62 C2 A9 00 | ||
| * ^ Offset should be here | ||
| * </pre> | ||
| * | ||
| * @param buffer The buffer containing the string to decode. | ||
|
|
@@ -65,24 +51,34 @@ private ResourceString() {} // Private constructor | |
| * @return The decoded string. | ||
| */ | ||
| public static String decodeString(ByteBuffer buffer, int offset, Type type) { | ||
| int length; | ||
| int characterCount = decodeLength(buffer, offset, type); | ||
| offset += computeLengthOffset(characterCount, type); | ||
| // UTF-8 strings have 2 lengths: the number of characters, and then the encoding length. | ||
| // UTF-16 strings, however, only have 1 length: the number of characters. | ||
| // Both UTF-8 and UTF-16 strings begin with the length in UTF-16 code units (= 2-byte units). | ||
| // This is ignored when decoding a UTF-8 string, but we need to read it anyway to adjust our | ||
| // offset. | ||
| // | ||
| // See: https://cs.android.com/android/platform/superproject/main/+/main:frameworks/base/libs/androidfw/StringPool.cpp;l=364-427;drc=1d6d8ac9feb221f47692250647269f3753bdee60 | ||
| // See: https://cs.android.com/android/platform/superproject/main/+/main:frameworks/base/libs/androidfw/ResourceTypes.cpp;l=952-957;drc=61197364367c9e404c7da6900658f1b16c42d0da | ||
| int utf16CodeUnits = decodeLength(buffer, offset, type); | ||
| offset += computeLengthOffset(utf16CodeUnits, type); | ||
| if (type == Type.UTF8) { | ||
| length = decodeLength(buffer, offset, type); | ||
| offset += computeLengthOffset(length, type); | ||
| // For a UTF-8 string the next value is the length in UTF-8 code units (= 1-byte units). | ||
| int utf8CodeUnits = decodeLength(buffer, offset, type); | ||
| offset += computeLengthOffset(utf8CodeUnits, type); | ||
|
|
||
| // Strings in .arsc files are encoded as modified UTF-8, not regular UTF-8, so we need to | ||
| // convert between them. | ||
| // | ||
| // See: https://cs.android.com/android/platform/superproject/main/+/main:frameworks/base/libs/androidfw/Util.cpp;l=210-215;drc=a577514789fc241abe15f793a66f19d6431f7769 | ||
| // See: https://docs.oracle.com/en/java/javase/23/docs/api/java.base/java/io/DataInput.html#modified-utf-8 | ||
| byte[] utf8 = modifiedUtf8ToUtf8(buffer, offset, utf8CodeUnits); | ||
| return new String(utf8, UTF_8); | ||
| } else { | ||
| length = characterCount * 2; | ||
| int lengthBytes = utf16CodeUnits * 2; | ||
| return new String(buffer.array(), offset, lengthBytes, UTF_16LE); | ||
| } | ||
| return new String(buffer.array(), offset, length, type.charset()); | ||
| } | ||
|
|
||
| /** | ||
| * Encodes a string in either UTF-8 or UTF-16 and returns the bytes of the encoded string. | ||
| * Strings are prefixed by 2 values. The first is the number of characters in the string. | ||
| * The second is the encoding length (number of bytes in the string). | ||
| * | ||
| * <p>Here's an example UTF-8-encoded string of ab©: | ||
| * <pre>03 04 61 62 C2 A9 00</pre> | ||
|
|
@@ -92,15 +88,36 @@ public static String decodeString(ByteBuffer buffer, int offset, Type type) { | |
| * @return The encoded string. | ||
| */ | ||
| public static byte[] encodeString(String str, Type type) { | ||
| byte[] bytes = str.getBytes(type.charset()); | ||
| // The extra 5 bytes is for metadata (character count + byte count) and the NULL terminator. | ||
| byte[] bytes; | ||
| if (type == Type.UTF8) { | ||
| // Strings in .arsc files are encoded as modified UTF-8, not regular UTF-8, so we need to | ||
| // convert between them. | ||
| // | ||
| // See: https://cs.android.com/android/platform/superproject/main/+/main:frameworks/base/libs/androidfw/StringPool.cpp;l=367;drc=1d6d8ac9feb221f47692250647269f3753bdee60 | ||
| // See: https://docs.oracle.com/en/java/javase/23/docs/api/java.base/java/io/DataInput.html#modified-utf-8 | ||
| bytes = utf8ToModifiedUtf8(str.getBytes(UTF_8)); | ||
| } else { | ||
| bytes = str.getBytes(UTF_16LE); | ||
| } | ||
|
|
||
| // +5 bytes is for the length(s) (2+2 bytes for UTF-8, 4 bytes for UTF-16) and a null | ||
| // terminator. | ||
| ByteArrayDataOutput output = ByteStreams.newDataOutput(bytes.length + 5); | ||
| encodeLength(output, str.length(), type); | ||
| if (type == Type.UTF8) { // Only UTF-8 strings have the encoding length. | ||
|
|
||
| // Both UTF-8 and UTF-16 strings begin with the length in UTF-16 code units (= 2-byte units), | ||
| // which is what String.length() returns. | ||
| // | ||
| // See: https://cs.android.com/android/platform/superproject/main/+/main:frameworks/base/libs/androidfw/StringPool.cpp;l=364-427;drc=1d6d8ac9feb221f47692250647269f3753bdee60 | ||
| // See: https://cs.android.com/android/platform/superproject/main/+/main:frameworks/base/libs/androidfw/ResourceTypes.cpp;l=952-957;drc=61197364367c9e404c7da6900658f1b16c42d0da | ||
| int utf16CodeUnits = str.length(); | ||
| encodeLength(output, utf16CodeUnits, type); | ||
| if (type == Type.UTF8) { | ||
| // For a UTF-8 string the next value is the length in UTF-8 code units (= 1-byte units). | ||
| encodeLength(output, bytes.length, type); | ||
| } | ||
| // Next is the string's bytes. | ||
| output.write(bytes); | ||
| // NULL-terminate the string | ||
| // Then finally a null terminator. | ||
| if (type == Type.UTF8) { | ||
| output.write(0); | ||
| } else { | ||
|
|
@@ -161,4 +178,155 @@ private static int decodeLengthUTF16(ByteBuffer buffer, int offset) { | |
| } | ||
| return length; | ||
| } | ||
|
|
||
| // Converts modified UTF-8 to standard UTF-8. Modified UTF-8 differs from standard UTF-8 in two | ||
| // ways: | ||
| // | ||
| // 1. 4-byte sequences are not used. Instead, supplementary characters (code points above U+FFFF, | ||
| // outside the BMP) are encoded as a 3-byte surrogate pair. | ||
| // 2. null (U+0000) is encoded as the 2-byte sequence 0xC080 instead of the 1-byte sequence 0x00. | ||
| // | ||
| // All other characters use the same encoding in both formats. | ||
| // | ||
| // Based on fbjni's modifiedUTF8ToUTF8. | ||
| // | ||
| // See: https://docs.oracle.com/en/java/javase/23/docs/api/java.base/java/io/DataInput.html#modified-utf-8 | ||
| // See: https://github.com/facebookincubator/fbjni/blob/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L172 | ||
| private static byte[] modifiedUtf8ToUtf8(ByteBuffer modifiedUtf8, int offset, int len) { | ||
| // Modified UTF-8 is never shorter than the equivalent UTF-8 (surrogate pairs shrink from 6 | ||
| // bytes to 4 bytes, nulls shrink from 2 bytes to 1 byte), so this buffer will always be big | ||
| // enough. | ||
| byte[] utf8 = new byte[len]; | ||
| int modifiedIndex = 0; | ||
| int utf8Index = 0; | ||
|
|
||
| while (modifiedIndex < len) { | ||
| if (len >= modifiedIndex + 6 | ||
| && (modifiedUtf8.get(offset + modifiedIndex) & 0xFF) == 0xED | ||
| && ((modifiedUtf8.get(offset + modifiedIndex + 1) & 0xFF) & 0xF0) == 0xA0 | ||
| && (modifiedUtf8.get(offset + modifiedIndex + 3) & 0xFF) == 0xED | ||
| && ((modifiedUtf8.get(offset + modifiedIndex + 4) & 0xFF) & 0xF0) == 0xB0) { | ||
| // Supplementary characters encoded as a 3-byte surrogate pair become a 4-byte sequence. | ||
| int highSurrogate = decode3ByteUtf8(modifiedUtf8, offset + modifiedIndex); | ||
| int lowSurrogate = decode3ByteUtf8(modifiedUtf8, offset + modifiedIndex + 3); | ||
| int codePoint = 0x10000 + (((highSurrogate & 0x3FF) << 10) | (lowSurrogate & 0x3FF)); | ||
| encode4ByteUtf8(codePoint, utf8, utf8Index); | ||
| modifiedIndex += 6; | ||
| utf8Index += 4; | ||
| } else if (len >= modifiedIndex + 2 | ||
| && (modifiedUtf8.get(offset + modifiedIndex) & 0xFF) == 0xC0 | ||
| && (modifiedUtf8.get(offset + modifiedIndex + 1) & 0xFF) == 0x80) { | ||
| // Nulls (U+0000) encoded as a 2-byte sequence become a 1-byte sequence. | ||
| utf8[utf8Index] = 0; | ||
| modifiedIndex += 2; | ||
| utf8Index++; | ||
| } else { | ||
| // Everything else is unchanged. | ||
| utf8[utf8Index] = modifiedUtf8.get(offset + modifiedIndex); | ||
| modifiedIndex++; | ||
| utf8Index++; | ||
| } | ||
| } | ||
|
|
||
| byte[] result = new byte[utf8Index]; | ||
| System.arraycopy(utf8, 0, result, 0, utf8Index); | ||
| return result; | ||
| } | ||
|
|
||
| // See: https://github.com/facebookincubator/fbjni/blame/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L42 | ||
| private static int decode3ByteUtf8(ByteBuffer in, int offset) { | ||
| return ((in.get(offset) & 0x0F) << 12) | ||
| | ((in.get(offset + 1) & 0x3F) << 6) | ||
| | (in.get(offset + 2) & 0x3F); | ||
| } | ||
|
|
||
| // See: https://github.com/facebookincubator/fbjni/blame/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L46 | ||
| private static void encode4ByteUtf8(int codePoint, byte[] out, int offset) { | ||
| out[offset] = (byte) (0xF0 | (codePoint >> 18)); | ||
| out[offset + 1] = (byte) (0x80 | ((codePoint >> 12) & 0x3F)); | ||
| out[offset + 2] = (byte) (0x80 | ((codePoint >> 6) & 0x3F)); | ||
| out[offset + 3] = (byte) (0x80 | (codePoint & 0x3F)); | ||
| } | ||
|
|
||
| // Converts standard UTF-8 to modified UTF-8. Modified UTF-8 differs from standard UTF-8 in two | ||
| // ways: | ||
| // | ||
| // 1. 4-byte sequences are not used. Instead, supplementary characters (code points above U+FFFF, | ||
| // outside the BMP) are encoded as a 3-byte surrogate pair. | ||
| // 2. null (U+0000) is encoded as 0xC080 (2 bytes) instead of 0x00 (1 byte). | ||
| // | ||
| // All other characters use the same encoding in both formats. | ||
| // | ||
| // Based on fbjni's utf8ToModifiedUTF8. | ||
| // | ||
| // See: https://docs.oracle.com/en/java/javase/23/docs/api/java.base/java/io/DataInput.html#modified-utf-8 | ||
| // See: https://github.com/facebookincubator/fbjni/blob/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L106 | ||
| private static byte[] utf8ToModifiedUtf8(byte[] utf8) { | ||
| byte[] modified = new byte[modifiedUtf8Length(utf8)]; | ||
| int utf8Index = 0; | ||
| int modifiedIndex = 0; | ||
|
|
||
| while (utf8Index < utf8.length) { | ||
| if (utf8Index + 4 <= utf8.length && isFourByteUtf8Encoding(utf8[utf8Index])) { | ||
| // Supplementary characters encoded as a 4-byte sequence become a 3-byte surrogate pair. | ||
| int codePoint = ((utf8[utf8Index] & 0x07) << 18) | ||
| | ((utf8[utf8Index + 1] & 0x3F) << 12) | ||
| | ((utf8[utf8Index + 2] & 0x3F) << 6) | ||
| | (utf8[utf8Index + 3] & 0x3F); | ||
| int highSurrogate = ((codePoint - 0x10000) >> 10) | 0xD800; | ||
| int lowSurrogate = ((codePoint - 0x10000) & 0x3FF) | 0xDC00; | ||
| encode3ByteUtf8(highSurrogate, modified, modifiedIndex); | ||
| encode3ByteUtf8(lowSurrogate, modified, modifiedIndex + 3); | ||
| utf8Index += 4; | ||
| modifiedIndex += 6; | ||
| } else if (utf8[utf8Index] == 0) { | ||
| // Nulls (U+0000) encoded as a 1-byte sequence become a 2-byte sequence. | ||
| modified[modifiedIndex] = (byte) 0xC0; | ||
| modified[modifiedIndex + 1] = (byte) 0x80; | ||
| utf8Index++; | ||
| modifiedIndex += 2; | ||
| } else { | ||
| // Everything else is unchanged. | ||
| modified[modifiedIndex] = utf8[utf8Index]; | ||
| utf8Index++; | ||
| modifiedIndex++; | ||
| } | ||
| } | ||
|
|
||
| return modified; | ||
| } | ||
|
|
||
| // See: https://github.com/facebookincubator/fbjni/blob/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L61 | ||
| private static int modifiedUtf8Length(byte[] utf8) { | ||
| int modifiedUtf8Length = 0; | ||
| int index = 0; | ||
| while (index < utf8.length) { | ||
| if (index + 4 <= utf8.length && isFourByteUtf8Encoding(utf8[index])) { | ||
| // 4-byte sequences expand from 4 to 6 bytes. | ||
| modifiedUtf8Length += 6; | ||
| index += 4; | ||
| } else if (utf8[index] == 0) { | ||
| // Null (U+0000) expands from 1 to 2 bytes. | ||
| modifiedUtf8Length += 2; | ||
| index += 1; | ||
| } else { | ||
| // Everything else stays the same size. | ||
| modifiedUtf8Length += 1; | ||
| index += 1; | ||
| } | ||
| } | ||
| return modifiedUtf8Length; | ||
| } | ||
|
|
||
| // See: https://github.com/facebookincubator/fbjni/blob/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L58 | ||
| private static boolean isFourByteUtf8Encoding(byte b) { | ||
| return (b & 0xF8) == 0xF0; | ||
| } | ||
|
|
||
| // See: https://github.com/facebookincubator/fbjni/blob/caacce89ac0c494034e8c36fd0ab0d6fce951785/cxx/fbjni/detail/utf8.cpp#L32 | ||
| private static void encode3ByteUtf8(int codePoint, byte[] out, int offset) { | ||
| out[offset] = (byte) (0xE0 | (codePoint >> 12)); | ||
| out[offset + 1] = (byte) (0x80 | ((codePoint >> 6) & 0x3F)); | ||
| out[offset + 2] = (byte) (0x80 | (codePoint & 0x3F)); | ||
| } | ||
| } | ||
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an API change, so I can revert it (+ deprecate?) if you'd rather maintain strict compatibility. But there's no standard
MUTF-8 charset I could use to make `charset()` return something correct, and I'd be surprised if it were used outside this library.