diff --git a/src/main/java/io/legado/app/utils/EncodingDetect.kt b/src/main/java/io/legado/app/utils/EncodingDetect.kt index f89f565e..6525bbca 100644 --- a/src/main/java/io/legado/app/utils/EncodingDetect.kt +++ b/src/main/java/io/legado/app/utils/EncodingDetect.kt @@ -1,11 +1,10 @@ package io.legado.app.utils +import android.text.TextUtils import io.legado.app.lib.icu4j.CharsetDetector import org.jsoup.Jsoup import java.io.File import java.io.FileInputStream -import java.nio.charset.StandardCharsets -import java.util.* /** * 自动获取文件的编码 @@ -13,28 +12,38 @@ import java.util.* @Suppress("MemberVisibilityCanBePrivate", "unused") object EncodingDetect { - fun getHtmlEncode(bytes: ByteArray): String? { + private val headTagRegex = "(?i)[\\s\\S]*?".toRegex() + private val headOpenBytes = "".toByteArray() + private val headCloseBytes = "".toByteArray() + + fun getHtmlEncode(bytes: ByteArray): String { try { - val doc = Jsoup.parse(String(bytes, StandardCharsets.UTF_8)) + var head: String? = null + val startIndex = bytes.indexOf(headOpenBytes) + if (startIndex > -1) { + val endIndex = bytes.indexOf(headCloseBytes, startIndex) + if (endIndex > -1) { + head = String(bytes.copyOfRange(startIndex, endIndex + headCloseBytes.size)) + } + } + val doc = Jsoup.parseBodyFragment(head ?: headTagRegex.find(String(bytes))!!.value) val metaTags = doc.getElementsByTag("meta") var charsetStr: String for (metaTag in metaTags) { charsetStr = metaTag.attr("charset") - if (!charsetStr.isEmpty()) { + if (!TextUtils.isEmpty(charsetStr)) { return charsetStr } - val content = metaTag.attr("content") val httpEquiv = metaTag.attr("http-equiv") - if (httpEquiv.lowercase(Locale.getDefault()) == "content-type") { - charsetStr = if (content.lowercase(Locale.getDefault()).contains("charset")) { - content.substring( - content.lowercase(Locale.getDefault()) - .indexOf("charset") + "charset=".length - ) + if (httpEquiv.equals("content-type", true)) { + val content = metaTag.attr("content") + val idx = content.indexOf("charset=", ignoreCase = true) + charsetStr = if (idx > -1) { + content.substring(idx + "charset=".length) } else { - content.substring(content.lowercase(Locale.getDefault()).indexOf(";") + 1) + content.substringAfter(";") } - if (!charsetStr.isEmpty()) { + if (!TextUtils.isEmpty(charsetStr)) { return charsetStr } } @@ -75,4 +84,4 @@ object EncodingDetect { } return byteArray } -} \ No newline at end of file +}