diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index 80073e53fb..986770a26c 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -5,14 +5,21 @@ updates:
directory: /
schedule:
interval: weekly
+ cooldown:
+ default-days: 12
ignore:
# Jetty 9.x needed for JDK8 compatibility; it still receives security updates. Only used in tests.
- dependency-name: "org.eclipse.jetty:jetty-server"
update-types: ["version-update:semver-major"]
- dependency-name: "org.eclipse.jetty:jetty-servlet"
update-types: ["version-update:semver-major"]
+ # Et tu, junit? Keep us on 5, as 6 has min JDK17 - https://docs.junit.org/6.0.0-RC3/release-notes/#release-notes-6.0.0-M1
+ - dependency-name: "org.junit.jupiter:junit-jupiter"
+ update-types: ["version-update:semver-major"]
- package-ecosystem: github-actions
directory: /
schedule:
interval: weekly
+ cooldown:
+ default-days: 12
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0737dae6e1..f91448634f 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -10,15 +10,15 @@ jobs:
matrix:
os: [ubuntu-latest, windows-latest, macOS-latest]
# choosing to run a reduced set of LTS, current, and next, to balance coverage and execution time
- java: [8, 17, 21]
+ java: [8, 17, 25]
fail-fast: false
name: Test JDK ${{ matrix.java }}, ${{ matrix.os }}
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up JDK ${{ matrix.java }}
- uses: actions/setup-java@v4
+ uses: actions/setup-java@v5
with:
java-version: ${{ matrix.java }}
distribution: 'zulu'
diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml
index 4228bdad80..27c5142db4 100644
--- a/.github/workflows/cifuzz.yml
+++ b/.github/workflows/cifuzz.yml
@@ -19,7 +19,7 @@ jobs:
dry-run: false
language: jvm
- name: Upload Crash
- uses: actions/upload-artifact@v4
+ uses: actions/upload-artifact@v6
if: failure() && steps.build.outcome == 'success'
with:
name: artifacts
diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
index 9ad4905964..b98fc33133 100644
--- a/.github/workflows/codeql.yml
+++ b/.github/workflows/codeql.yml
@@ -12,19 +12,19 @@ jobs:
name: "CodeQL"
steps:
- name: Checkout
- uses: actions/checkout@v4
+ uses: actions/checkout@v6
- name: Set up JDK
- uses: actions/setup-java@v4
+ uses: actions/setup-java@v5
with:
java-version: 17
distribution: 'temurin'
cache: 'maven'
- name: CodeQL Initialization
- uses: github/codeql-action/init@v3
+ uses: github/codeql-action/init@v4
with:
languages: java
queries: +security-and-quality
- name: Autobuild
- uses: github/codeql-action/autobuild@v3
+ uses: github/codeql-action/autobuild@v4
- name: CodeQL Analysis
- uses: github/codeql-action/analyze@v3
+ uses: github/codeql-action/analyze@v4
diff --git a/CHANGES.md b/CHANGES.md
index b12d9d7246..a34c124c2b 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,20 +1,69 @@
# jsoup Changelog
-## 1.21.2 (PENDING)
+## 1.22.2 (PENDING)
+
+### Bug Fixes
+* Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459)
+
+## 1.22.1 (2026-Jan-01)
+
+### Improvements
+* Added support for using the `re2j` regular expression engine for regex-based CSS selectors (e.g. `[attr~=regex]`, `:matches(regex)`), which ensures linear-time performance for regex evaluation. This allows safer handling of arbitrary user-supplied query regexes. To enable, add the `com.google.re2j` dependency to your classpath, e.g.:
+```xml
+
+ com.google.re2j
+ re2j
+ 1.8
+
+ ```
+ (If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) You can confirm that the re2j engine has been enabled correctly by calling `org.jsoup.helper.Regex.usingRe2j()`. [#2407](https://github.com/jhy/jsoup/pull/2407)
+
+* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser's configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396)
+* Added a configurable maximum parser depth (to limit the number of open elements on stack) to both HTML and XML parsers. The HTML parser now defaults to a depth of 512 to match browser behavior, and protect against unbounded stack growth, while the XML parser keeps unlimited depth by default, but can opt into a limit via `org.jsoup.parser.Parser#setMaxDepth`. [#2421](https://github.com/jhy/jsoup/issues/2421)
+* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403)
+* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041)
+
+### Changes
+* Set a removal schedule of jsoup 1.24.1 for previously deprecated APIs.
+
+### Bug Fixes
+* Previously cached child `Elements` of an `Element` were not correctly invalidated in `Node#replaceWith(Node)`, which could lead to incorrect results when subsequently calling `Element#children()`. [#2391](https://github.com/jhy/jsoup/issues/2391)
+* Attribute selector values are now compared literally without trimming. Previously, jsoup trimmed whitespace from selector values and from element attribute values, which could cause mismatches with browser behavior (e.g. `[attr=" foo "]`). Now matches align with the CSS specification and browser engines. [#2380](https://github.com/jhy/jsoup/issues/2380)
+* When using the JDK HttpClient, any system default proxy (`ProxySelector.getDefault()`) was ignored. Now, the system proxy is used if a per-request proxy is not set. [#2388](https://github.com/jhy/jsoup/issues/2388), [#2390](https://github.com/jhy/jsoup/pull/2390)
+* A `ValidationException` could be thrown in the adoption agency algorithm with particularly broken input. Now logged as a parse error. [#2393](https://github.com/jhy/jsoup/issues/2393)
+* Null characters in the HTML body were not consistently removed; and in foreign content were not correctly replaced. [#2395](https://github.com/jhy/jsoup/issues/2395)
+* An `IndexOutOfBoundsException` could be thrown when parsing a body fragment with crafted input. Now logged as a parse error. [#2397](https://github.com/jhy/jsoup/issues/2397), [#2406](https://github.com/jhy/jsoup/issues/2406)
+* When using StructuralEvaluators (e.g., a `parent child` selector) across many retained threads, their memoized results could also be retained, increasing memory use. These results are now cleared immediately after use, reducing overall memory consumption. [#2411](https://github.com/jhy/jsoup/issues/2411)
+* Cloning a `Parser` now preserves any custom `TagSet` applied to the parser. [#2422](https://github.com/jhy/jsoup/issues/2422), [#2423](https://github.com/jhy/jsoup/pull/2423)
+* Custom tags marked as `Tag.Void` now parse and serialize like the built-in void elements: they no longer consume following content, and the XML serializer emits the expected self-closing form. [#2425](https://github.com/jhy/jsoup/issues/2425)
+* The ` ` element is once again classified as an inline tag (`Tag.isBlock() == false`), matching common developer expectations and its role as phrasing content in HTML, while pretty-printing and text extraction continue to treat it as a line break in the rendered output. [#2387](https://github.com/jhy/jsoup/issues/2387), [#2439](https://github.com/jhy/jsoup/issues/2439)
+* Fixed an intermittent truncation issue when fetching and parsing remote documents via `Jsoup.connect(url).get()`. On responses without a charset header, the initial charset sniff could sometimes (depending on buffering / `available()` behavior) be mistaken for end-of-stream and a partial parse reused, dropping trailing content. [#2448](https://github.com/jhy/jsoup/issues/2448)
+* `TagSet` copies no longer mutate their template during lazy lookups, preventing cross-thread `ConcurrentModificationException` when parsing with shared sessions. [#2453](https://github.com/jhy/jsoup/pull/2453)
+* Fixed parsing of `` `foreignObject` content nested within a ``, which could incorrectly move the HTML subtree outside the SVG. [#2452](https://github.com/jhy/jsoup/issues/2452)
+
+### Internal Changes
+* Deprecated internal helper `org.jsoup.internal.Functions` (for removal in v1.23.1). This was previously used to support older Android API levels without full `java.util.function` coverage; jsoup now requires core library desugaring so this indirection is no longer necessary. [#2412](https://github.com/jhy/jsoup/pull/2412)
+
+## 1.21.2 (2025-Aug-25)
### Changes
* Deprecated internal (yet visible) methods `Normalizer#normalize(String, bool)` and `Attribute#shouldCollapseAttribute(Document.OutputSettings)`. These will be removed in a future version.
+* Deprecated `Connection#sslSocketFactory(SSLSocketFactory)` in favor of the new `Connection#sslContext(SSLContext)`. Using `sslSocketFactory` will force the use of the legacy `HttpUrlConnection` implementation, which does not support HTTP/2. [#2370](https://github.com/jhy/jsoup/pull/2370)
### Improvements
* When pretty-printing, if there are consecutive text nodes (via DOM manipulation), the non-significant whitespace between them will be collapsed. [#2349](https://github.com/jhy/jsoup/pull/2349).
* Updated `Connection.Response#statusMessage()` to return a simple loggable string message (e.g. "OK") when using the `HttpClient` implementation, which doesn't otherwise return any server-set status message. [#2356](https://github.com/jhy/jsoup/issues/2346)
* `Attributes#size()` and `Attributes#isEmpty()` now exclude any internal attributes (such as user data) from their count. This aligns with the attributes' serialized output and iterator. [#2369](https://github.com/jhy/jsoup/pull/2369)
+* Added `Connection#sslContext(SSLContext)` to provide a custom SSL (TLS) context to requests, supporting both the `HttpClient` and the legacy `HttUrlConnection` implementations. [#2370](https://github.com/jhy/jsoup/pull/2370)
+* Performance optimizations for DOM manipulation methods including when repeatedly removing an element's first child (`element.child(0).remove()`, and when using `Parser#parseBodyFragement()` to parse a large number of direct children. [#2373](https://github.com/jhy/jsoup/pull/2373).
### Bug Fixes
* When parsing from an InputStream and a multibyte character happened to straddle a buffer boundary, the stream would not be completely read. [#2353](https://github.com/jhy/jsoup/issues/2353).
* In `NodeTraversor`, if a last child element was removed during the `head()` call, the parent would be visited twice. [#2355](https://github.com/jhy/jsoup/issues/2355).
* Cloning an Element that has an Attributes object would add an empty internal user-data attribute to that clone, which would cause unexpected results for `Attributes#size()` and `Attributes#isEmpty()`. [#2356](https://github.com/jhy/jsoup/issues/2356)
* In a multithreaded application where multiple threads are calling `Element#children()` on the same element concurrently, a race condition could happen when the method was generating the internal child element cache (a filtered view of its child nodes). Since concurrent reads of DOM objects should be threadsafe without external synchronization, this method has been updated to execute atomically. [#2366](https://github.com/jhy/jsoup/issues/2366)
+* When parsing HTML with svg:script elements in SVG elements, don't enter the Text insertion mode, but continue to parse as foreign content. Otherwise, misnested HTML could then cause an IndexOutOfBoundsException. [#2374](https://github.com/jhy/jsoup/issues/2374)
+* Malformed HTML could throw an IndexOutOfBoundsException during the adoption agency. [#2377](https://github.com/jhy/jsoup/pull/2377).
## 1.21.1 (2025-Jun-23)
diff --git a/jitpack.yml b/jitpack.yml
new file mode 100644
index 0000000000..7800d9ee8d
--- /dev/null
+++ b/jitpack.yml
@@ -0,0 +1,6 @@
+before_install:
+ - sdk install java 21.0.2-open
+ - sdk use java 21.0.2-open
+ - sdk install maven
+install:
+ - mvn clean install -Djapicmp.skip=true -DskipTests
diff --git a/pom.xml b/pom.xml
index 7b92058312..707cfd3d00 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
org.jsoup
jsoup
- 1.21.2-SNAPSHOT
+ 1.22.2-SNAPSHOT
https://jsoup.org/
jsoup is a Java library that simplifies working with real-world HTML and XML. It offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM API methods, CSS, and xpath selectors. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers.
2009
@@ -33,7 +33,7 @@
UTF-8
- 9.4.57.v20241219
+ 9.4.58.v20250814
@@ -41,7 +41,7 @@
org.apache.maven.plugins
maven-compiler-plugin
- 3.14.0
+ 3.15.0
UTF-8
false
@@ -66,7 +66,7 @@
org.codehaus.mojo
animal-sniffer-maven-plugin
- 1.24
+ 1.27
api-java8
@@ -133,7 +133,7 @@
org.apache.maven.plugins
maven-javadoc-plugin
- 3.11.2
+ 3.12.0
none
8
@@ -151,7 +151,7 @@
org.apache.maven.plugins
maven-source-plugin
- 3.3.1
+ 3.4.0
org/jsoup/examples/**
@@ -169,7 +169,7 @@
org.apache.maven.plugins
maven-jar-plugin
- 3.4.2
+ 3.5.0
@@ -240,16 +240,16 @@
org.apache.maven.plugins
maven-resources-plugin
- 3.3.1
+ 3.4.0
maven-release-plugin
- 3.1.1
+ 3.3.1
org.apache.maven.plugins
maven-surefire-plugin
- 3.5.3
+ 3.5.4
-Xss640k
@@ -257,7 +257,7 @@
maven-failsafe-plugin
- 3.5.3
+ 3.5.4
@@ -275,14 +275,14 @@
com.github.siom79.japicmp
japicmp-maven-plugin
- 0.23.1
+ 0.25.4
org.jsoup
jsoup
- 1.21.1
+ 1.21.2
jar
@@ -291,7 +291,7 @@
false
true
true
-
+
@@ -316,8 +316,21 @@
+
+ org.sonatype.central
+ central-publishing-maven-plugin
+ 0.10.0
+ true
+
+ central
+
+
+
+ src/main/resources
+ false
+
./
META-INF/jsoup/
@@ -329,19 +342,6 @@
-
-
- sonatype-nexus-snapshots
- Sonatype Nexus Snapshots
- https://oss.sonatype.org/content/repositories/snapshots
-
-
- sonatype-nexus-staging
- Nexus Release Repository
- https://oss.sonatype.org/service/local/staging/deploy/maven2/
-
-
-
@@ -381,6 +381,27 @@
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.6.1
+
+
+ add-java11-test-source
+ generate-test-sources
+
+ add-test-source
+
+
+
+ ${project.basedir}/src/test/java11
+
+
+
+
+
+
org.apache.maven.plugins
maven-compiler-plugin
@@ -412,23 +433,6 @@
true
-
-
-
- testcompile-java-11
- test-compile
-
- testCompile
-
-
- 11
-
-
- ${project.basedir}/src/test/java11
-
-
-
-
@@ -480,7 +484,7 @@
maven-failsafe-plugin
- 3.5.3
+ 3.5.4
@@ -501,7 +505,7 @@
org.junit.jupiter
junit-jupiter
- 5.13.3
+ 5.14.2
test
@@ -509,7 +513,7 @@
com.google.code.gson
gson
- 2.13.1
+ 2.13.2
test
@@ -544,6 +548,15 @@
1.0.0
provided
+
+
+
+ com.google.re2j
+ re2j
+ 1.8
+ true
+ compile
+
diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java
index ed095f44d6..5d5c2a0140 100644
--- a/src/main/java/org/jsoup/Connection.java
+++ b/src/main/java/org/jsoup/Connection.java
@@ -6,6 +6,8 @@
import org.jsoup.parser.StreamParser;
import org.jspecify.annotations.Nullable;
+import javax.net.ssl.HostnameVerifier;
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import java.io.BufferedInputStream;
import java.io.IOException;
@@ -39,7 +41,7 @@ the lifetime of the Connection object. A socket connection is only made at the p
#execute()}, {@link #get()}, or {@link #post()}), and the server's response consumed.
For multi-threaded implementations, it is important to use a {@link #newRequest()} for each request. The session may
be shared across concurrent threads, but a not a specific request.
- HTTP/2 support: On JDK/JRE 11 and above, requests use {@link java.net.http.HttpClient}, which supports
+
HTTP/2 support: On JVM 11 and above, requests use {@link java.net.http.HttpClient}, which supports
HTTP/2. To use the legacy {@link java.net.HttpURLConnection} instead, set
System.setProperty("jsoup.useHttpClient", "false").
*/
@@ -150,7 +152,7 @@ default Connection newRequest(URL url) {
The default timeout is 30 seconds (30,000 millis). A timeout of zero is treated as an infinite timeout.
This timeout specifies the combined maximum duration of the connection time and the time to read
the full response.
- Implementation note: when this Connection is backed by HttpURLConnection (rather than HttpClient, as used in JRE/JDK 11+), this timeout is implemented by setting both the socket connect and read timeouts to half of the specified value.
+ Implementation note: when this Connection is backed by HttpURLConnection (rather than HttpClient, as used in JVM 11+), this timeout is implemented by setting both the socket connect and read timeouts to half of the specified value.
@param millis number of milliseconds (thousandths of a second) before timing out connects or reads.
@return this Connection, for chaining
@@ -210,12 +212,38 @@ default Connection newRequest(URL url) {
Connection ignoreContentType(boolean ignoreContentType);
/**
- * Set custom SSL socket factory
- * @param sslSocketFactory custom SSL socket factory
- * @return this Connection, for chaining
+ Set a custom SSL socket factory for HTTPS connections.
+ Note: if set, the legacy HttpURLConnection will be used instead of the JVM's
+ HttpClient.
+
+ @param sslSocketFactory SSL socket factory
+ @return this Connection, for chaining
+ @see #sslContext(SSLContext)
+ @deprecated use {@link #sslContext(SSLContext)} instead; will be removed in jsoup 1.24.1.
*/
+ @Deprecated
Connection sslSocketFactory(SSLSocketFactory sslSocketFactory);
+ /**
+ Set a custom SSL context for HTTPS connections.
+ Note: when using the legacy HttpURLConnection, only the SSLSocketFactory from the
+ context will be used.
+
+ @param sslContext SSL context
+ @return this Connection, for chaining
+ @since 1.21.2
+ */
+ default Connection sslContext(SSLContext sslContext) {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Set a custom hostname verifier to verify the hostname during handshake
+ * @param hostnameVerifier hostname verifier
+ * @return this Connection, for chaining
+ */
+ Connection hostnameVerifier(HostnameVerifier hostnameVerifier);
+
/**
* Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the
* request body for POSTs. A request may have multiple values of the same name.
@@ -769,11 +797,53 @@ interface Request extends Base {
@Nullable SSLSocketFactory sslSocketFactory();
/**
- * Set a custom SSL socket factory.
- * @param sslSocketFactory SSL socket factory
+ Set a custom SSL socket factory for HTTPS connections.
+ Note: if set, the legacy HttpURLConnection will be used instead of the JVM's
+ HttpClient.
+
+ @param sslSocketFactory SSL socket factory
+ @see #sslContext(SSLContext)
+ @deprecated use {@link #sslContext(SSLContext)} instead; will be removed in jsoup 1.24.1.
*/
+ @Deprecated
void sslSocketFactory(SSLSocketFactory sslSocketFactory);
+ /**
+ Get the current custom SSL context, if any.
+
+ @return custom SSL context if set, null otherwise
+ @since 1.21.2
+ */
+ @Nullable
+ default SSLContext sslContext() {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ Set a custom SSL context for HTTPS connections.
+ Note: when using the legacy HttpURLConnection, only the SSLSocketFactory from the
+ context will be used.
+
+ @param sslContext SSL context
+ @return this Request, for chaining
+ @since 1.21.2
+ */
+ default Request sslContext(SSLContext sslContext) {
+ throw new UnsupportedOperationException();
+ }
+
+ /**
+ * Get the current hostname verifier, if any.
+ * @return hostname verifier if set, null otherwise
+ */
+ @Nullable HostnameVerifier hostnameVerifier();
+
+ /**
+ * Set a custom hostname verifier to verify the hostname during handshake
+ * @param hostnameVerifier hostname verifier
+ */
+ void hostnameVerifier(HostnameVerifier hostnameVerifier);
+
/**
* Add a data parameter to the request
* @param keyval data to add.
@@ -983,7 +1053,7 @@ default Response readFully() throws IOException {
* Calling {@link #body() } or {@link #bodyAsBytes()} has the same effect.
* @return this response, for chaining
* @throws UncheckedIOException if an IO exception occurs during buffering.
- * @deprecated use {@link #readFully()} instead (for the checked exception). Will be removed in a future version.
+ * @deprecated use {@link #readFully()} instead (for the checked exception). Will be removed in jsoup 1.24.1.
*/
@Deprecated
Response bufferUp();
diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java
index 87c76a3ca2..4124fe4fd0 100644
--- a/src/main/java/org/jsoup/helper/DataUtil.java
+++ b/src/main/java/org/jsoup/helper/DataUtil.java
@@ -248,6 +248,7 @@ static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String
if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8
int origMax = input.max();
input.max(firstReadBufferSize);
+ input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff
input.mark(firstReadBufferSize);
input.allowClose(false); // ignores closes during parse, in case we need to rewind
try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize
diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java
index c0a6fdf8fc..9b5860d298 100644
--- a/src/main/java/org/jsoup/helper/HttpConnection.java
+++ b/src/main/java/org/jsoup/helper/HttpConnection.java
@@ -11,6 +11,8 @@
import org.jsoup.parser.StreamParser;
import org.jspecify.annotations.Nullable;
+import javax.net.ssl.HostnameVerifier;
+import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import java.io.BufferedInputStream;
import java.io.BufferedReader;
@@ -223,6 +225,18 @@ public Connection sslSocketFactory(SSLSocketFactory sslSocketFactory) {
return this;
}
+ @Override
+ public Connection sslContext(SSLContext sslContext) {
+ req.sslContext(sslContext);
+ return this;
+ }
+
+ @Override
+ public Connection hostnameVerifier(HostnameVerifier hostnameVerifier) {
+ req.hostnameVerifier(hostnameVerifier);
+ return this;
+ }
+
@Override
public Connection data(String key, String filename, InputStream inputStream) {
req.data(KeyVal.create(key, filename, inputStream));
@@ -618,11 +632,13 @@ public static class Request extends HttpConnection.Base impl
private boolean parserDefined = false; // called parser(...) vs initialized in ctor
private String postDataCharset = DataUtil.defaultCharsetName;
private @Nullable SSLSocketFactory sslSocketFactory;
+ @Nullable SSLContext sslContext;
private CookieManager cookieManager;
@Nullable RequestAuthenticator authenticator;
private @Nullable Progress responseProgress;
private final ReentrantLock executing = new ReentrantLock(); // detects and warns if same request used concurrently
+ private @Nullable HostnameVerifier hostnameVerifier;
Request() {
super();
@@ -652,6 +668,7 @@ public static class Request extends HttpConnection.Base impl
parser = copy.parser.newInstance(); // parsers and their tree-builders maintain state, so need a fresh copy
parserDefined = copy.parserDefined;
sslSocketFactory = copy.sslSocketFactory; // these are all synchronized so safe to share
+ sslContext = copy.sslContext;
cookieManager = copy.cookieManager;
authenticator = copy.authenticator;
responseProgress = copy.responseProgress;
@@ -724,6 +741,25 @@ public void sslSocketFactory(SSLSocketFactory sslSocketFactory) {
this.sslSocketFactory = sslSocketFactory;
}
+ @Override @Nullable
+ public SSLContext sslContext() {
+ return sslContext;
+ }
+
+ @Override
+ public Connection.Request sslContext(SSLContext sslContext) {
+ this.sslContext = sslContext;
+ return this;
+ }
+
+ public HostnameVerifier hostnameVerifier() {
+ return hostnameVerifier;
+ }
+
+ public void hostnameVerifier(HostnameVerifier hostnameVerifier) {
+ this.hostnameVerifier = hostnameVerifier;
+ }
+
@Override
public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) {
this.ignoreHttpErrors = ignoreHttpErrors;
diff --git a/src/main/java/org/jsoup/helper/Re2jRegex.java b/src/main/java/org/jsoup/helper/Re2jRegex.java
new file mode 100644
index 0000000000..b9e65bed60
--- /dev/null
+++ b/src/main/java/org/jsoup/helper/Re2jRegex.java
@@ -0,0 +1,48 @@
+package org.jsoup.helper;
+
+/**
+ re2j-backed Regex implementation; must only be touched when re2j is on the classpath.
+ */
+final class Re2jRegex extends Regex {
+ private static final java.util.regex.Pattern unused = java.util.regex.Pattern.compile("");
+
+ private final com.google.re2j.Pattern re2jPattern;
+
+ private Re2jRegex(com.google.re2j.Pattern re2jPattern) {
+ super(unused);
+ this.re2jPattern = re2jPattern;
+ }
+
+ public static Regex compile(String regex) {
+ try {
+ return new Re2jRegex(com.google.re2j.Pattern.compile(regex));
+ } catch (RuntimeException e) {
+ throw new ValidationException("Pattern syntax error: " + e.getMessage());
+ } catch (OutOfMemoryError | StackOverflowError e) { // defensive check on regex to normalize exception
+ throw new ValidationException("Pattern complexity error: " + e.getMessage());
+ }
+ }
+
+ @Override
+ public Matcher matcher(CharSequence input) {
+ return new Re2jMatcher(re2jPattern.matcher(input));
+ }
+
+ @Override
+ public String toString() {
+ return re2jPattern.toString();
+ }
+
+ private static final class Re2jMatcher implements Matcher {
+ private final com.google.re2j.Matcher delegate;
+
+ Re2jMatcher(com.google.re2j.Matcher delegate) {
+ this.delegate = delegate;
+ }
+
+ @Override
+ public boolean find() {
+ return delegate.find();
+ }
+ }
+}
diff --git a/src/main/java/org/jsoup/helper/Regex.java b/src/main/java/org/jsoup/helper/Regex.java
new file mode 100644
index 0000000000..7b157ddc90
--- /dev/null
+++ b/src/main/java/org/jsoup/helper/Regex.java
@@ -0,0 +1,119 @@
+package org.jsoup.helper;
+
+import org.jsoup.internal.SharedConstants;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+/**
+ A regular expression abstraction. Allows jsoup to optionally use the re2j regular expression engine (linear time)
+ instead of the JDK's backtracking regex implementation.
+
+ If the {@code com.google.re2j} library is found on the classpath, by default it will be used. You can override this
+ by setting {@code -Djsoup.useRe2j=false} to explicitly disable, and use the JDK regex engine.
+
+ (Currently this a simplified implementation for jsoup's specific use; can extend as required.)
+ */
+public class Regex {
+ private static final boolean hasRe2j = hasRe2j();
+
+ private final Pattern jdkPattern;
+
+ Regex(Pattern jdkPattern) {
+ this.jdkPattern = jdkPattern;
+ }
+
+ /**
+ Compile a regex, using re2j if enabled and available; otherwise JDK regex.
+
+ @param regex the regex to compile
+ @return the compiled regex
+ @throws ValidationException if the regex is invalid
+ */
+ public static Regex compile(String regex) {
+ if (usingRe2j()) {
+ return Re2jRegex.compile(regex);
+ }
+
+ try {
+ return new Regex(Pattern.compile(regex));
+ } catch (PatternSyntaxException e) {
+ throw new ValidationException("Pattern syntax error: " + e.getMessage());
+ }
+ }
+
+ /** Wraps an existing JDK Pattern (for API compat); doesn't switch */
+ public static Regex fromPattern(Pattern pattern) {
+ return new Regex(pattern);
+ }
+
+ /**
+ Checks if re2j is available (on classpath) and enabled (via system property).
+ @return true if re2j is available and enabled
+ */
+ public static boolean usingRe2j() {
+ return hasRe2j && wantsRe2j();
+ }
+
+ static boolean wantsRe2j() {
+ return Boolean.parseBoolean(System.getProperty(SharedConstants.UseRe2j, "true"));
+ }
+
+ static void wantsRe2j(boolean use) {
+ System.setProperty(SharedConstants.UseRe2j, Boolean.toString(use));
+ }
+
+ static boolean hasRe2j() {
+ try {
+ Class> re2 = Class.forName("com.google.re2j.Pattern", false, Regex.class.getClassLoader()); // check if re2j is in classpath
+ try {
+ // if it is, and we are on JVM9+, we need to dork around with modules, because re2j doesn't publish a module name.
+ // done via reflection so we can still run on JVM 8.
+ // todo remove if re2j publishes as a module
+ Class> moduleCls = Class.forName("java.lang.Module");
+ Method getModule = Class.class.getMethod("getModule");
+ Object jsoupMod = getModule.invoke(Regex.class);
+ Object re2Mod = getModule.invoke(re2);
+ boolean reads = (boolean) moduleCls.getMethod("canRead", moduleCls).invoke(jsoupMod, re2Mod);
+ if (!reads) moduleCls.getMethod("addReads", moduleCls).invoke(jsoupMod, re2Mod);
+ } catch (ClassNotFoundException ignore) {
+ // jvm8 - no Module class; so we can use as-is
+ }
+ return true;
+ } catch (ClassNotFoundException e) {
+ return false; // no re2j
+ } catch (ReflectiveOperationException e) {
+ // unexpectedly couldn’t wire modules on 9+; return false to avoid IllegalAccessError later
+ System.err.println("Warning: (bug? please report) couldn't access re2j from jsoup due to modules: " + e);
+ return false;
+ }
+ }
+
+ public Matcher matcher(CharSequence input) {
+ return new JdkMatcher(jdkPattern.matcher(input));
+ }
+
+ @Override
+ public String toString() {
+ return jdkPattern.toString();
+ }
+
+ public interface Matcher {
+ boolean find();
+ }
+
+ private static final class JdkMatcher implements Matcher {
+ private final java.util.regex.Matcher delegate;
+
+ JdkMatcher(java.util.regex.Matcher delegate) {
+ this.delegate = delegate;
+ }
+
+ @Override
+ public boolean find() {
+ return delegate.find();
+ }
+ }
+}
diff --git a/src/main/java/org/jsoup/helper/RequestDispatch.java b/src/main/java/org/jsoup/helper/RequestDispatch.java
index fc73e7c99f..e79f3c41a3 100644
--- a/src/main/java/org/jsoup/helper/RequestDispatch.java
+++ b/src/main/java/org/jsoup/helper/RequestDispatch.java
@@ -9,7 +9,7 @@
import java.lang.reflect.Constructor;
/**
- Handles requests using either HttpClient (available in JDK 11+) or HttpURLConnection. During initialization, the
+ Handles requests using either HttpClient (available in JVM 11+) or HttpURLConnection. During initialization, the
HttpClientExecutor class is used if it can be instantiated, unless the system property
{@link SharedConstants#UseHttpClient} is explicitly set to {@code false}.
*/
@@ -32,6 +32,10 @@ class RequestDispatch {
static RequestExecutor get(Request request, @Nullable Response previousResponse) {
boolean useHttpClient = Boolean.parseBoolean(System.getProperty(SharedConstants.UseHttpClient, "true"));
+
+ if (request.sslSocketFactory() != null) // downgrade if a socket factory is set, as it can't be supplied to the HttpClient
+ useHttpClient = false;
+
if (useHttpClient && clientConstructor != null) {
try {
return clientConstructor.newInstance(request, previousResponse);
diff --git a/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java b/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java
index 9164c69d41..c2d6cabfc1 100644
--- a/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java
+++ b/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java
@@ -1,7 +1,6 @@
package org.jsoup.helper;
import org.jsoup.Connection;
-import org.jsoup.internal.Functions;
import org.jspecify.annotations.Nullable;
import javax.net.ssl.HttpsURLConnection;
@@ -90,8 +89,15 @@ private static HttpURLConnection createConnection(HttpConnection.Request req) th
conn.setConnectTimeout(req.timeout());
conn.setReadTimeout(req.timeout() / 2); // gets reduced after connection is made and status is read
- if (req.sslSocketFactory() != null && conn instanceof HttpsURLConnection)
- ((HttpsURLConnection) conn).setSSLSocketFactory(req.sslSocketFactory());
+ if (conn instanceof HttpsURLConnection) {
+ HttpsURLConnection scon = (HttpsURLConnection) conn;
+ if (req.sslContext != null)
+ scon.setSSLSocketFactory(req.sslContext.getSocketFactory());
+ else if (req.sslSocketFactory() != null)
+ scon.setSSLSocketFactory(req.sslSocketFactory());
+ if (req.hostnameVerifier() != null)
+ scon.setHostnameVerifier(req.hostnameVerifier());
+ }
if (req.authenticator != null)
AuthenticationHandler.handler.enable(req.authenticator, conn); // removed in finally
if (req.method().hasBody())
@@ -118,7 +124,7 @@ private static LinkedHashMap> createHeaderMap(HttpURLConnec
if (key == null || val == null)
continue; // skip http1.1 line
- final List vals = headers.computeIfAbsent(key, Functions.listFunction());
+ final List vals = headers.computeIfAbsent(key, k -> new java.util.ArrayList<>());
vals.add(val);
}
return headers;
diff --git a/src/main/java/org/jsoup/helper/Validate.java b/src/main/java/org/jsoup/helper/Validate.java
index cc8dcaf342..d8e29d6e44 100644
--- a/src/main/java/org/jsoup/helper/Validate.java
+++ b/src/main/java/org/jsoup/helper/Validate.java
@@ -48,7 +48,7 @@ public static void notNull(@Nullable Object obj, String msg) {
* @param obj nullable object to cast to not-null
* @return the object, or throws an exception if it is null
* @throws ValidationException if the object is null
- * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead
+ * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead; will be removed in jsoup 1.24.1
*/
@Deprecated
public static Object ensureNotNull(@Nullable Object obj) {
@@ -65,7 +65,7 @@ public static Object ensureNotNull(@Nullable Object obj) {
* @param args the arguments to the msg
* @return the object, or throws an exception if it is null
* @throws ValidationException if the object is null
- * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead
+ * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead; will be removed in jsoup 1.24.1
*/
@Deprecated
public static Object ensureNotNull(@Nullable Object obj, String msg, Object... args) {
diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java
index cf8361805c..c3238a3fab 100644
--- a/src/main/java/org/jsoup/internal/ControllableInputStream.java
+++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java
@@ -20,19 +20,19 @@
// reimplemented from ConstrainableInputStream for JDK21 - extending BufferedInputStream will pin threads during read
public class ControllableInputStream extends FilterInputStream {
private final SimpleBufferedInput buff; // super.in, but typed as SimpleBufferedInput
- private int maxSize;
- private long startTime;
- private long timeout = 0; // optional max time of request
- private int remaining;
- private int markPos;
- private boolean interrupted;
- private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser
+ private int maxSize; // logical cap exposed to callers (0 == unlimited)
+ private long startTime; // start time for timeout checks, nanos
+ private long timeout = 0; // optional max time of request
+ private int remaining; // how many bytes may still be returned to caller under the current cap
+ private int markPos; // logical readPos snapshot for InputStream.mark/reset (not a buffer cursor)
+ private boolean interrupted; // true if Thread.interrupted() was detected, used to latch interrupted state
+ private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser
// if we are tracking progress, will have the expected content length, progress callback, connection
private @Nullable Progress> progress;
private @Nullable Object progressContext;
- private int contentLength = -1;
- private int readPos = 0; // amount read; can be reset()
+ private int contentLength = -1; // expected content length for progress; -1 == unknown
+ private int readPos = 0; // amount read; can be reset()
private ControllableInputStream(SimpleBufferedInput in, int maxSize) {
super(in);
@@ -85,6 +85,7 @@ public int read(byte[] b, int off, int len) throws IOException {
if (capped && len > remaining)
len = remaining; // don't read more than desired, even if available
+ buff.capRemaining(capped ? remaining : Integer.MAX_VALUE);
while (true) { // loop trying to read until we get some data or hit the overall timeout, if we have one
if (expired())
@@ -95,7 +96,9 @@ public int read(byte[] b, int off, int len) throws IOException {
if (read == -1) { // completed
contentLength = readPos;
} else {
- remaining -= read;
+ if (capped && read > 0) {
+ remaining -= read; // track bytes returned to the caller
+ }
readPos += read;
}
emitProgress();
@@ -107,6 +110,11 @@ public int read(byte[] b, int off, int len) throws IOException {
}
}
+ @Override
+ public boolean markSupported() {
+ return true;
+ }
+
/**
* Reads this inputstream to a ByteBuffer. The supplied max may be less than the inputstream's max, to support
* reading just the first bytes.
@@ -145,15 +153,24 @@ public static ByteBuffer readToByteBuffer(InputStream in, int max) throws IOExce
@SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs
@Override public void reset() throws IOException {
- super.reset();
- remaining = maxSize - markPos;
+ if (markPos < 0) throw new IOException("Resetting to invalid mark");
+ buff.rewindToMark();
+ buff.clearMark();
+ if (maxSize != 0) {
+ remaining = maxSize - markPos;
+ buff.capRemaining(remaining);
+ } else {
+ remaining = 0;
+ buff.capRemaining(Integer.MAX_VALUE);
+ }
readPos = markPos; // readPos is used for progress emits
+ markPos = -1;
}
@SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs
@Override public void mark(int readlimit) {
- super.mark(readlimit);
- markPos = maxSize - remaining;
+ markPos = readPos;
+ buff.setMark();
}
/**
@@ -165,6 +182,10 @@ public boolean baseReadFully() {
return buff.baseReadFully();
}
+ public void resetFullyRead() {
+ buff.resetFullyRead();
+ }
+
/**
Get the max size of this stream (how far at most will be read from the underlying stream)
* @return the max size
@@ -175,7 +196,9 @@ public int max() {
public void max(int newMax) {
remaining += newMax - maxSize; // update remaining to reflect the difference in the new maxsize
+ if (remaining < 0) remaining = 0;
maxSize = newMax;
+ buff.capRemaining(newMax == 0 ? Integer.MAX_VALUE : remaining);
}
public void allowClose(boolean allowClose) {
diff --git a/src/main/java/org/jsoup/internal/Functions.java b/src/main/java/org/jsoup/internal/Functions.java
index 40227d8417..3d5d636416 100644
--- a/src/main/java/org/jsoup/internal/Functions.java
+++ b/src/main/java/org/jsoup/internal/Functions.java
@@ -11,8 +11,10 @@
/**
* An internal class containing functions for use with {@link Map#computeIfAbsent(Object, Function)}.
+ * @deprecated for removal in jsoup 1.23.1. Replace usages with direct constructor references / lambdas.
*/
@SuppressWarnings({"rawtypes", "unchecked"})
+@Deprecated
public final class Functions {
private static final Function ListFunction = key -> new ArrayList<>();
private static final Function SetFunction = key -> new HashSet<>();
diff --git a/src/main/java/org/jsoup/internal/Normalizer.java b/src/main/java/org/jsoup/internal/Normalizer.java
index 3659e14956..9fe85df336 100644
--- a/src/main/java/org/jsoup/internal/Normalizer.java
+++ b/src/main/java/org/jsoup/internal/Normalizer.java
@@ -23,7 +23,8 @@ public static String normalize(final String input) {
/**
If a string literal, just lower case the string; otherwise lower-case and trim.
- @deprecated internal function; will be removed in a future version.
+ @deprecated internal helper; replace with {@link #lowerCase(String)} for no-trim, or {@link #normalize(String)} for trim + lowercase.
+ Will be removed in jsoup 1.24.1.
*/
@Deprecated
public static String normalize(final String input, boolean isStringLiteral) {
diff --git a/src/main/java/org/jsoup/internal/SharedConstants.java b/src/main/java/org/jsoup/internal/SharedConstants.java
index baff2a36b6..8be99cb73b 100644
--- a/src/main/java/org/jsoup/internal/SharedConstants.java
+++ b/src/main/java/org/jsoup/internal/SharedConstants.java
@@ -21,5 +21,7 @@ public final class SharedConstants {
public static final String UseHttpClient = "jsoup.useHttpClient";
+ public static final String UseRe2j = "jsoup.useRe2j"; // enables use of the re2j regular expression engine when true and it's on the classpath
+
private SharedConstants() {}
}
diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
index c76dedc7d4..dafbd3af4a 100644
--- a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
+++ b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java
@@ -17,11 +17,12 @@
class SimpleBufferedInput extends FilterInputStream {
static final int BufferSize = DefaultBufferSize;
static final SoftPool BufferPool = new SoftPool<>(() -> new byte[BufferSize]);
+ private int capRemaining = Integer.MAX_VALUE; // how many bytes we are allowed to pull from the underlying stream
private byte @Nullable [] byteBuf; // the byte buffer; recycled via SoftPool. Created in fill if required
private int bufPos;
private int bufLength;
- private int bufMark = -1;
+ private int bufMark = -1; // mark set by ControllableInputStream; -1 when unset
private boolean inReadFully = false; // true when the underlying inputstream has been read fully
SimpleBufferedInput(@Nullable InputStream in) {
@@ -50,12 +51,6 @@ public int read(byte[] dest, int offset, int desiredLen) throws IOException {
int bufAvail = bufLength - bufPos;
if (bufAvail <= 0) { // can't serve from the buffer
- if (!inReadFully && bufMark < 0) {
- // skip creating / copying into a local buffer; just pass through
- int read = in.read(dest, offset, desiredLen);
- closeIfDone(read);
- return read;
- }
fill();
bufAvail = bufLength - bufPos;
}
@@ -76,38 +71,25 @@ private void fill() throws IOException {
byteBuf = BufferPool.borrow();
}
- if (bufMark < 0) { // no mark, can lose buffer (assumes we've read to bufLen)
- bufPos = 0;
- } else if (bufPos >= BufferSize) { // no room left in buffer
- if (bufMark > 0) { // can throw away early part of the buffer
- int size = bufPos - bufMark;
- System.arraycopy(byteBuf, bufMark, byteBuf, 0, size);
- bufPos = size;
- bufMark = 0;
- } else { // invalidate mark
- bufMark = -1;
- bufPos = 0;
- }
- }
+ compact();
bufLength = bufPos;
- int read = in.read(byteBuf, bufPos, byteBuf.length - bufPos);
+ int toRead = Math.min(byteBuf.length - bufPos, capRemaining);
+ if (toRead <= 0) return;
+ int read = in.read(byteBuf, bufPos, toRead);
if (read > 0) {
bufLength = read + bufPos;
- while (byteBuf.length - bufLength > 0) { // read in more if we have space, without blocking
+ capRemaining -= read;
+ while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without blocking
if (in.available() < 1) break;
- read = in.read(byteBuf, bufLength, byteBuf.length - bufLength);
+ toRead = Math.min(byteBuf.length - bufLength, capRemaining);
+ if (toRead <= 0) break;
+ read = in.read(byteBuf, bufLength, toRead);
if (read <= 0) break;
bufLength += read;
+ capRemaining -= read;
}
}
- closeIfDone(read);
- }
-
- private void closeIfDone(int read) throws IOException {
- if (read == -1) {
- inReadFully = true;
- super.close(); // close underlying stream immediately; frees resources a little earlier
- }
+ if (read == -1) inReadFully = true;
}
byte[] getBuf() {
@@ -123,30 +105,55 @@ boolean baseReadFully() {
return inReadFully;
}
- @Override
- public int available() throws IOException {
- if (byteBuf != null && bufLength - bufPos > 0)
- return bufLength - bufPos; // doesn't include those in.available(), but mostly used as a block test
- return inReadFully ? 0 : in.available();
+ void resetFullyRead() {
+ if (in != null) // for null-wrapped streams, leave as fully read to avoid fill() on a null input
+ inReadFully = false;
}
- @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced
@Override
- public void mark(int readlimit) {
- if (readlimit > BufferSize) {
- throw new IllegalArgumentException("Read-ahead limit is greater than buffer size");
+ public int available() throws IOException {
+ int buffered = (byteBuf != null) ? (bufLength - bufPos) : 0;
+ if (buffered > 0) {
+ return buffered; // doesn't include those in.available(), but mostly used as a block test
}
+ int avail = inReadFully ? 0 : in.available();
+ return avail;
+ }
+
+ void capRemaining(int newRemaining) {
+ capRemaining = Math.max(0, newRemaining);
+ }
+
+ void setMark() {
bufMark = bufPos;
}
- @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced
- @Override
- public void reset() throws IOException {
+ void rewindToMark() throws IOException {
if (bufMark < 0)
throw new IOException("Resetting to invalid mark");
bufPos = bufMark;
}
+ void clearMark() {
+ bufMark = -1;
+ }
+
+ private void compact() {
+ if (byteBuf == null || bufPos == 0) return;
+ int keepFrom = bufMark >= 0 ? bufMark : bufPos;
+ if (keepFrom <= 0) return;
+
+ int remaining = bufLength - keepFrom;
+ if (remaining > 0) {
+ System.arraycopy(byteBuf, keepFrom, byteBuf, 0, remaining);
+ }
+ bufLength = remaining;
+ bufPos -= keepFrom;
+ if (bufMark >= 0) {
+ bufMark -= keepFrom;
+ }
+ }
+
@Override
public void close() throws IOException {
if (in != null) super.close();
diff --git a/src/main/java/org/jsoup/internal/StringUtil.java b/src/main/java/org/jsoup/internal/StringUtil.java
index a953a86ffa..50c650e4c4 100644
--- a/src/main/java/org/jsoup/internal/StringUtil.java
+++ b/src/main/java/org/jsoup/internal/StringUtil.java
@@ -148,8 +148,8 @@ public static String padding(int width, int maxPaddingWidth) {
* @param string string to test
* @return if string is blank
*/
- public static boolean isBlank(final String string) {
- if (string == null || string.length() == 0)
+ public static boolean isBlank(@Nullable String string) {
+ if (string == null || string.isEmpty())
return true;
int l = string.length();
diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java
index d90ed7fa2b..a99f1dd153 100644
--- a/src/main/java/org/jsoup/nodes/Attribute.java
+++ b/src/main/java/org/jsoup/nodes/Attribute.java
@@ -202,13 +202,13 @@ static void html(String key, @Nullable String val, QuietAppendable accum, Docume
htmlNoValidate(key, val, accum, out);
}
- /** @deprecated internal method and will be removed in a future version */
+ /** @deprecated internal method; use {@link #html(String, String, QuietAppendable, Document.OutputSettings)} with {@link org.jsoup.internal.QuietAppendable#wrap(Appendable)} instead. Will be removed in jsoup 1.24.1. */
@Deprecated
protected void html(Appendable accum, Document.OutputSettings out) throws IOException {
html(key, val, accum, out);
}
- /** @deprecated internal method and will be removed in a future version */
+ /** @deprecated internal method; use {@link #html(String, String, QuietAppendable, Document.OutputSettings)} with {@link org.jsoup.internal.QuietAppendable#wrap(Appendable)} instead. Will be removed in jsoup 1.24.1. */
@Deprecated
protected static void html(String key, @Nullable String val, Appendable accum, Document.OutputSettings out) throws IOException {
html(key, val, QuietAppendable.wrap(accum), out);
@@ -306,7 +306,7 @@ protected static boolean isDataAttribute(String key) {
*
* @param out output settings
* @return Returns whether collapsible or not
- * @deprecated internal method and will be removed in a future version
+ * @deprecated internal method; use {@link #shouldCollapseAttribute(String, String, Document.OutputSettings)} instead. Will be removed in jsoup 1.24.1.
*/
@Deprecated
protected final boolean shouldCollapseAttribute(Document.OutputSettings out) {
diff --git a/src/main/java/org/jsoup/nodes/Attributes.java b/src/main/java/org/jsoup/nodes/Attributes.java
index 9fcf033ec5..eb400729a5 100644
--- a/src/main/java/org/jsoup/nodes/Attributes.java
+++ b/src/main/java/org/jsoup/nodes/Attributes.java
@@ -159,7 +159,7 @@ public Attributes put(String key, @Nullable String value) {
if (i != NotFound)
vals[i] = value;
else
- add(key, value);
+ addObject(key, value);
return this;
}
@@ -183,6 +183,13 @@ Map userData() {
return userData;
}
+ /**
+ Check if these attributes have any user data associated with them.
+ */
+ boolean hasUserData() {
+ return hasKey(SharedConstants.UserDataKey);
+ }
+
/**
Get an arbitrary user-data object by key.
* @param key case-sensitive key to the object.
@@ -193,7 +200,7 @@ Map userData() {
@Nullable
public Object userData(String key) {
Validate.notNull(key);
- if (!hasKey(SharedConstants.UserDataKey)) return null; // no user data exists
+ if (!hasUserData()) return null; // no user data exists
Map userData = userData();
return userData.get(key);
}
@@ -225,7 +232,7 @@ void putIgnoreCase(String key, @Nullable String value) {
keys[i] = key;
}
else
- add(key, value);
+ addObject(key, value);
}
/**
@@ -365,7 +372,7 @@ public void addAll(Attributes incoming) {
if (needsPut)
put(attr);
else
- add(attr.getKey(), attr.getValue());
+ addObject(attr.getKey(), attr.getValue());
}
}
diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java
index 49051bcc45..db25382f95 100644
--- a/src/main/java/org/jsoup/nodes/Document.java
+++ b/src/main/java/org/jsoup/nodes/Document.java
@@ -220,7 +220,10 @@ public void title(String title) {
@return new element
*/
public Element createElement(String tagName) {
- return new Element(parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri());
+ return new Element(
+ parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase),
+ searchUpForAttribute(this, BaseUriKey)
+ );
}
@Override
diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java
index 36119e74dd..3e8388027f 100644
--- a/src/main/java/org/jsoup/nodes/Element.java
+++ b/src/main/java/org/jsoup/nodes/Element.java
@@ -3,6 +3,7 @@
import org.jsoup.helper.Validate;
import org.jsoup.internal.Normalizer;
import org.jsoup.internal.QuietAppendable;
+import org.jsoup.helper.Regex;
import org.jsoup.internal.StringUtil;
import org.jsoup.parser.ParseSettings;
import org.jsoup.parser.Parser;
@@ -50,7 +51,7 @@ public class Element extends Node implements Iterable {
private static final List EmptyChildren = Collections.emptyList();
private static final NodeList EmptyNodeList = new NodeList(0);
private static final Pattern ClassSplit = Pattern.compile("\\s+");
- private static final String BaseUriKey = Attributes.internalKey("baseUri");
+ static final String BaseUriKey = Attributes.internalKey("baseUri");
Tag tag;
NodeList childNodes;
@Nullable Attributes attributes; // field is nullable but all methods for attributes are non-null
@@ -87,8 +88,7 @@ public Element(Tag tag, @Nullable String baseUri, @Nullable Attributes attribute
childNodes = EmptyNodeList;
this.attributes = attributes;
this.tag = tag;
- if (baseUri != null)
- this.setBaseUri(baseUri);
+ if (!StringUtil.isBlank(baseUri)) this.setBaseUri(baseUri);
}
/**
@@ -130,17 +130,19 @@ public Attributes attributes() {
@Override
public String baseUri() {
- return searchUpForAttribute(this, BaseUriKey);
+ String baseUri = searchUpForAttribute(this, BaseUriKey);
+ return baseUri != null ? baseUri : "";
}
- private static String searchUpForAttribute(final Element start, final String key) {
+ @Nullable
+ static String searchUpForAttribute(final Element start, final String key) {
Element el = start;
while (el != null) {
if (el.attributes != null && el.attributes.hasKey(key))
return el.attributes.get(key);
el = el.parent();
}
- return "";
+ return null;
}
@Override
@@ -355,7 +357,18 @@ public Elements parents() {
* @see #childNode(int)
*/
public Element child(int index) {
- return childElementsList().get(index);
+ Validate.isTrue(index >= 0, "Index must be >= 0");
+ List cached = cachedChildren();
+ if (cached != null) return cached.get(index);
+ // otherwise, iter on elementChild; saves creating list
+ int size = childNodes.size();
+ for (int i = 0, e = 0; i < size; i++) { // direct iter is faster than chasing firstElSib, nextElSibd
+ Node node = childNodes.get(i);
+ if (node instanceof Element) {
+ if (e++ == index) return (Element) node;
+ }
+ }
+ throw new IndexOutOfBoundsException("No child at index: " + index);
}
/**
@@ -370,7 +383,8 @@ public Element child(int index) {
* @see #child(int)
*/
public int childrenSize() {
- return childElementsList().size();
+ if (childNodeSize() == 0) return 0;
+ return childElementsList().size(); // gets children into cache; faster subsequent child(i) if unmodified
}
/**
@@ -406,8 +420,9 @@ List childElementsList() {
private static final String childElsMod = "jsoup.childElsMod";
/** returns the cached child els, if they exist, and the modcount of our childnodes matches the stashed modcount */
- private @Nullable List cachedChildren() {
- Map userData = attributes().userData();
+ @Nullable List cachedChildren() {
+ if (attributes == null || !attributes.hasUserData()) return null; // don't create empty userdata
+ Map userData = attributes.userData();
//noinspection unchecked
WeakReference> ref = (WeakReference>) userData.get(childElsKey);
if (ref != null) {
@@ -872,10 +887,7 @@ public Element insertChildren(int index, Collection extends Node> children) {
int currentSize = childNodeSize();
if (index < 0) index += currentSize +1; // roll around
Validate.isTrue(index >= 0 && index <= currentSize, "Insert position out of bounds.");
-
- ArrayList nodes = new ArrayList<>(children);
- Node[] nodeArray = nodes.toArray(new Node[0]);
- addChildren(index, nodeArray);
+ addChildren(index, children.toArray(new Node[0]));
return this;
}
@@ -1054,9 +1066,9 @@ public Element after(Node node) {
@Override
public Element empty() {
// Detach each of the children -> parent links:
- for (Node child : childNodes) {
- child.parentNode = null;
- }
+ int size = childNodes.size();
+ for (int i = 0; i < size; i++)
+ childNodes.get(i).parentNode = null;
childNodes.clear();
return this;
}
@@ -1236,10 +1248,10 @@ private static int indexInList(Element search, List eleme
@since 1.15.2
*/
public @Nullable Element firstElementChild() {
- Node child = firstChild();
- while (child != null) {
- if (child instanceof Element) return (Element) child;
- child = child.nextSibling();
+ int size = childNodes.size();
+ for (int i = 0; i < size; i++) {
+ Node node = childNodes.get(i);
+ if (node instanceof Element) return (Element) node;
}
return null;
}
@@ -1252,10 +1264,9 @@ private static int indexInList(Element search, List eleme
@since 1.15.2
*/
public @Nullable Element lastElementChild() {
- Node child = lastChild();
- while (child != null) {
- if (child instanceof Element) return (Element) child;
- child = child.previousSibling();
+ for (int i = childNodes.size() - 1; i >= 0; i--) {
+ Node node = childNodes.get(i);
+ if (node instanceof Element) return (Element) node;
}
return null;
}
@@ -1394,7 +1405,6 @@ public Elements getElementsByAttributeValueContaining(String key, String match)
*/
public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) {
return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
-
}
/**
@@ -1404,13 +1414,13 @@ public Elements getElementsByAttributeValueMatching(String key, Pattern pattern)
* @return elements that have attributes matching this regular expression
*/
public Elements getElementsByAttributeValueMatching(String key, String regex) {
- Pattern pattern;
+ Regex pattern;
try {
- pattern = Pattern.compile(regex);
+ pattern = Regex.compile(regex);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
}
- return getElementsByAttributeValueMatching(key, pattern);
+ return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this);
}
/**
@@ -1479,13 +1489,13 @@ public Elements getElementsMatchingText(Pattern pattern) {
* @see Element#text()
*/
public Elements getElementsMatchingText(String regex) {
- Pattern pattern;
+ Regex pattern;
try {
- pattern = Pattern.compile(regex);
+ pattern = Regex.compile(regex);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
}
- return getElementsMatchingText(pattern);
+ return Collector.collect(new Evaluator.Matches(pattern), this);
}
/**
@@ -1505,13 +1515,13 @@ public Elements getElementsMatchingOwnText(Pattern pattern) {
* @see Element#ownText()
*/
public Elements getElementsMatchingOwnText(String regex) {
- Pattern pattern;
+ Regex pattern;
try {
- pattern = Pattern.compile(regex);
+ pattern = Regex.compile(regex);
} catch (PatternSyntaxException e) {
throw new IllegalArgumentException("Pattern syntax error: " + regex, e);
}
- return getElementsMatchingOwnText(pattern);
+ return Collector.collect(new Evaluator.MatchesOwn(pattern), this);
}
/**
@@ -2066,12 +2076,36 @@ public Element filter(NodeFilter nodeFilter) {
}
static final class NodeList extends ArrayList {
+ /** Tracks if the children have valid sibling indices. We only need to reindex on siblingIndex() demand. */
+ boolean validChildren = true;
+
public NodeList(int size) {
super(size);
}
+ /** The modCount is used to invalidate the cached element children. */
int modCount() {
return this.modCount;
}
+
+ void incrementMod() {
+ this.modCount++;
+ }
+ }
+
+ void reindexChildren() {
+ final int size = childNodes.size();
+ for (int i = 0; i < size; i++) {
+ childNodes.get(i).setSiblingIndex(i);
+ }
+ childNodes.validChildren = true;
+ }
+
+ void invalidateChildren() {
+ childNodes.validChildren = false;
+ }
+
+ boolean hasValidChildren() {
+ return childNodes.validChildren;
}
}
diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java
index e0a4bfe83b..9b39564351 100644
--- a/src/main/java/org/jsoup/nodes/Node.java
+++ b/src/main/java/org/jsoup/nodes/Node.java
@@ -366,8 +366,12 @@ public Node root() {
* @return the Document associated with this Node, or null if there is no such Document.
*/
public @Nullable Document ownerDocument() {
- Node root = root();
- return (root instanceof Document) ? (Document) root : null;
+ Node node = this;
+ while (node != null) {
+ if (node instanceof Document) return (Document) node;
+ node = node.parentNode;
+ }
+ return null;
}
/**
@@ -386,7 +390,7 @@ public void remove() {
* @see #after(String)
*/
public Node before(String html) {
- addSiblingHtml(siblingIndex, html);
+ addSiblingHtml(siblingIndex(), html);
return this;
}
@@ -403,7 +407,7 @@ public Node before(Node node) {
// if the incoming node is a sibling of this, remove it first so siblingIndex is correct on add
if (node.parentNode == parentNode) node.remove();
- parentNode.addChildren(siblingIndex, node);
+ parentNode.addChildren(siblingIndex(), node);
return this;
}
@@ -414,7 +418,7 @@ public Node before(Node node) {
* @see #before(String)
*/
public Node after(String html) {
- addSiblingHtml(siblingIndex + 1, html);
+ addSiblingHtml(siblingIndex() + 1, html);
return this;
}
@@ -431,7 +435,7 @@ public Node after(Node node) {
// if the incoming node is a sibling of this, remove it first so siblingIndex is correct on add
if (node.parentNode == parentNode) node.remove();
- parentNode.addChildren(siblingIndex + 1, node);
+ parentNode.addChildren(siblingIndex() + 1, node);
return this;
}
@@ -505,7 +509,7 @@ public Node wrap(String html) {
public @Nullable Node unwrap() {
Validate.notNull(parentNode);
Node firstChild = firstChild();
- parentNode.addChildren(siblingIndex, this.childNodesAsArray());
+ parentNode.addChildren(siblingIndex(), this.childNodesAsArray());
this.remove();
return firstChild;
@@ -547,19 +551,24 @@ protected void replaceChild(Node out, Node in) {
if (in.parentNode != null)
in.parentNode.removeChild(in);
- final int index = out.siblingIndex;
+ final int index = out.siblingIndex();
ensureChildNodes().set(index, in);
- assert this instanceof Element;
in.parentNode = (Element) this;
in.setSiblingIndex(index);
out.parentNode = null;
+
+ ((Element) this).childNodes.incrementMod(); // as mod count not changed in set(), requires explicit update, to invalidate the child element cache
}
protected void removeChild(Node out) {
Validate.isTrue(out.parentNode == this);
- final int index = out.siblingIndex;
- ensureChildNodes().remove(index);
- reindexChildren(index);
+ Element el = (Element) this;
+ if (el.hasValidChildren()) // can remove by index
+ ensureChildNodes().remove(out.siblingIndex);
+ else
+ ensureChildNodes().remove(out); // iterates, but potentially not every one
+
+ el.invalidateChildren();
out.parentNode = null;
}
@@ -575,10 +584,9 @@ protected void addChildren(Node... children) {
}
protected void addChildren(int index, Node... children) {
+ // todo clean up all these and use the list, not the var array. just need to be careful when iterating the incoming (as we are removing as we go)
Validate.notNull(children);
- if (children.length == 0) {
- return;
- }
+ if (children.length == 0) return;
final List nodes = ensureChildNodes();
// fast path - if used as a wrap (index=0, children = child[0].parent.children - do inplace
@@ -595,7 +603,6 @@ protected void addChildren(int index, Node... children) {
}
}
if (sameList) { // moving, so OK to empty firstParent and short-circuit
- boolean wasEmpty = childNodeSize() == 0;
firstParent.empty();
nodes.addAll(index, Arrays.asList(children));
i = children.length;
@@ -603,8 +610,7 @@ protected void addChildren(int index, Node... children) {
while (i-- > 0) {
children[i].parentNode = (Element) this;
}
- if (!(wasEmpty && children[0].siblingIndex == 0)) // skip reindexing if we just moved
- reindexChildren(index);
+ ((Element) this).invalidateChildren();
return;
}
}
@@ -614,22 +620,13 @@ protected void addChildren(int index, Node... children) {
reparentChild(child);
}
nodes.addAll(index, Arrays.asList(children));
- reindexChildren(index);
+ ((Element) this).invalidateChildren();
}
protected void reparentChild(Node child) {
child.setParentNode(this);
}
- private void reindexChildren(int start) {
- final int size = childNodeSize();
- if (size == 0) return;
- final List childNodes = ensureChildNodes();
- for (int i = start; i < size; i++) {
- childNodes.get(i).setSiblingIndex(i);
- }
- }
-
/**
Retrieves this node's sibling nodes. Similar to {@link #childNodes() node.parent.childNodes()}, but does not
include this node (a node is not a sibling of itself).
@@ -656,10 +653,12 @@ public List siblingNodes() {
return null; // root
final List siblings = parentNode.ensureChildNodes();
- final int index = siblingIndex+1;
- if (siblings.size() > index)
- return siblings.get(index);
- else
+ final int index = siblingIndex() + 1;
+ if (siblings.size() > index) {
+ Node node = siblings.get(index);
+ assert (node.siblingIndex == index); // sanity test that invalidations haven't missed
+ return node;
+ } else
return null;
}
@@ -671,7 +670,7 @@ public List siblingNodes() {
if (parentNode == null)
return null; // root
- if (siblingIndex > 0)
+ if (siblingIndex() > 0)
return parentNode.ensureChildNodes().get(siblingIndex-1);
else
return null;
@@ -684,6 +683,9 @@ public List siblingNodes() {
* @see org.jsoup.nodes.Element#elementSiblingIndex()
*/
public int siblingIndex() {
+ if (parentNode != null && !parentNode.childNodes.validChildren)
+ parentNode.reindexChildren();
+
return siblingIndex;
}
@@ -900,7 +902,7 @@ public String toString() {
return outerHtml();
}
- /** @deprecated internal method moved into Printer; will be removed in a future version */
+ /** @deprecated internal method moved into Printer; will be removed in jsoup 1.24.1. */
@Deprecated
protected void indent(Appendable accum, int depth, Document.OutputSettings out) throws IOException {
accum.append('\n').append(StringUtil.padding(depth * out.indentAmount(), out.maxPaddingWidth()));
@@ -1006,7 +1008,7 @@ protected Node doClone(@Nullable Node parent) {
}
clone.parentNode = (Element) parent; // can be null, to create an orphan split
- clone.siblingIndex = parent == null ? 0 : siblingIndex;
+ clone.siblingIndex = parent == null ? 0 : siblingIndex();
// if not keeping the parent, shallowClone the ownerDocument to preserve its settings
if (parent == null && !(this instanceof Document)) {
Document doc = ownerDocument();
diff --git a/src/main/java/org/jsoup/nodes/Printer.java b/src/main/java/org/jsoup/nodes/Printer.java
index 20b3266df1..6b83fdbfe2 100644
--- a/src/main/java/org/jsoup/nodes/Printer.java
+++ b/src/main/java/org/jsoup/nodes/Printer.java
@@ -159,6 +159,7 @@ boolean isBlockEl(@Nullable Node node) {
if (node == null) return false;
if (node instanceof Element) {
Element el = (Element) node;
+ if (el.nameIs("br")) return true; // give a newline; actually an inline tag
return el.isBlock() ||
(!el.tag.isKnownTag() && (el.parentNode instanceof Document || hasChildBlocks(el)));
}
diff --git a/src/main/java/org/jsoup/nodes/PseudoTextElement.java b/src/main/java/org/jsoup/nodes/PseudoTextElement.java
index d6f0f9b4ce..9ceb41507c 100644
--- a/src/main/java/org/jsoup/nodes/PseudoTextElement.java
+++ b/src/main/java/org/jsoup/nodes/PseudoTextElement.java
@@ -6,7 +6,8 @@
/**
* Represents a {@link TextNode} as an {@link Element}, to enable text nodes to be selected with
* the {@link org.jsoup.select.Selector} {@code :matchText} syntax.
- * @deprecated use {@link Element#selectNodes(String, Class)} instead, with selector of ::textnode and class TextNode.
+ * @deprecated use {@link Element#selectNodes(String, Class)} instead, with selector of ::textnode and class TextNode;
+ * will be removed in jsoup 1.24.1.
*/
@Deprecated
public class PseudoTextElement extends Element {
diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
index f539de93eb..fc873eb298 100644
--- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java
@@ -36,7 +36,7 @@ public class HtmlTreeBuilder extends TreeBuilder {
"annotation-xml", "mi", "mn", "mo", "ms", "mtext"
};
static final String[]TagSearchInScopeSvg = new String[] {
- "desc", "foreignObject", "title"
+ "desc", "foreignobject", "title" // note normalized to lowercase to match other scope searches; will preserve input case as appropriate
};
static final String[] TagSearchList = new String[]{"ol", "ul"};
@@ -60,7 +60,9 @@ public class HtmlTreeBuilder extends TreeBuilder {
"button", "fieldset", "input", "keygen", "object", "output", "select", "textarea"
};
- public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages
+ /** @deprecated Not used anymore; configure parser depth via {@link Parser#setMaxDepth(int)}. Will be removed in jsoup 1.24.1. */
+ @Deprecated
+ public static final int MaxScopeSearchDepth = 100;
private HtmlTreeBuilderState state; // the current state
private HtmlTreeBuilderState originalState; // original / marked state
@@ -306,9 +308,9 @@ void error(HtmlTreeBuilderState state) {
Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) {
// dedupe and normalize the attributes:
Attributes attributes = startTag.attributes;
- if (!forcePreserveCase)
- attributes = settings.normalizeAttributes(attributes);
if (attributes != null && !attributes.isEmpty()) {
+ if (!forcePreserveCase)
+ settings.normalizeAttributes(attributes);
int dupes = attributes.deduplicate(settings);
if (dupes > 0) {
error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName);
@@ -332,7 +334,9 @@ Element insertElementFor(final Token.StartTag startTag) {
if (startTag.isSelfClosing()) {
Tag tag = el.tag();
tag.setSeenSelfClose(); // can infer output if in xml syntax
- if (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())) {
+ if (tag.isEmpty()) {
+ // treated as empty below; nothing further
+ } else if (tag.isKnownTag() && tag.isSelfClosing()) {
// ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state
tokeniser.transition(TokeniserState.Data); // handles , otherwise needs breakout steps from script data
tokeniser.emit(emptyEnd.reset().name(el.tagName())); // ensure we get out of whatever state we are in. emitted for yielded processing
@@ -342,6 +346,10 @@ Element insertElementFor(final Token.StartTag startTag) {
}
}
+ if (el.tag().isEmpty()) {
+ pop(); // custom void tags behave like built-in voids (no children, not left on the stack); known empty go via insertEmpty
+ }
+
return el;
}
@@ -386,6 +394,8 @@ FormElement insertFormElement(Token.StartTag startTag, boolean onStack, boolean
* @param el the Element to insert and make the current element
*/
private void doInsertElement(Element el) {
+ enforceStackDepthLimit();
+
if (formElement != null && el.tag().namespace.equals(NamespaceHtml) && StringUtil.inSorted(el.normalName(), TagFormListed))
formElement.addElement(el); // connect form controls to their form element
@@ -407,8 +417,20 @@ void insertCommentNode(Token.Comment token) {
onNodeInserted(node);
}
- /** Inserts the provided character token into the current element. */
+ /** Inserts the provided character token into the current element. Any nulls in the data will be removed. */
void insertCharacterNode(Token.Character characterToken) {
+ insertCharacterNode(characterToken, false);
+ }
+
+ /**
+ Inserts the provided character token into the current element. The tokenizer will have already raised precise character errors.
+
+ @param characterToken the character token to insert
+ @param replace if true, replaces any null chars in the data with the replacement char (U+FFFD). If false, removes
+ null chars.
+ */
+ void insertCharacterNode(Token.Character characterToken, boolean replace) {
+ characterToken.normalizeNulls(replace);
Element el = currentElement(); // will be doc if no current element; allows for whitespace to be inserted into the doc root object (not on the stack)
insertCharacterToElement(characterToken, el);
}
@@ -480,6 +502,20 @@ boolean removeFromStack(Element el) {
return false;
}
+ @Override
+ void onStackPrunedForDepth(Element element) {
+ // handle other effects of popping to keep state correct
+ if (element == headElement) headElement = null;
+ if (element == formElement) setFormElement(null);
+ removeFromActiveFormattingElements(element);
+ if (element.nameIs("template")) {
+ clearFormattingElementsToLastMarker();
+ if (templateModeSize() > 0)
+ popTemplateMode();
+ resetInsertionMode();
+ }
+ }
+
/** Pops the stack until the given HTML element is removed. */
@Nullable
Element popStackToClose(String elName) {
@@ -546,8 +582,8 @@ private void clearStackToContext(String... nodeNames) {
@return the Element immediately above the supplied element, or null if there is no such element.
*/
@Nullable Element aboveOnStack(Element el) {
- assert onStack(el);
- for (int pos = stack.size() -1; pos >= 0; pos--) {
+ if (!onStack(el)) return null;
+ for (int pos = stack.size() -1; pos > 0; pos--) {
Element next = stack.get(pos);
if (next == el) {
return stack.get(pos-1);
@@ -558,8 +594,13 @@ private void clearStackToContext(String... nodeNames) {
void insertOnStackAfter(Element after, Element in) {
int i = stack.lastIndexOf(after);
- Validate.isTrue(i != -1);
- stack.add(i+1, in);
+ if (i == -1) {
+ error("Did not find element on stack to insert after");
+ stack.add(in);
+ // may happen on particularly malformed inputs during adoption
+ } else {
+ stack.add(i+1, in);
+ }
}
void replaceOnStack(Element out, Element in) {
@@ -676,9 +717,8 @@ private boolean inSpecificScope(String targetName, String[] baseTypes, String[]
private boolean inSpecificScope(String[] targetNames, String[] baseTypes, @Nullable String[] extraTypes) {
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
final int bottom = stack.size() -1;
- final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
// don't walk too far up the tree
- for (int pos = bottom; pos >= top; pos--) {
+ for (int pos = bottom; pos >= 0; pos--) {
Element el = stack.get(pos);
String elName = el.normalName();
// namespace checks - arguments provided are always in html ns, with this bolt-on for math and svg:
@@ -734,17 +774,12 @@ boolean inSelectScope(String targetName) {
if (!inSorted(elName, TagSearchSelectScope)) // all elements except
return false;
}
- Validate.fail("Should not be reachable");
- return false;
+ return false; // nothing left on stack
}
/** Tests if there is some element on the stack that is not in the provided set. */
boolean onStackNot(String[] allowedTags) {
- final int bottom = stack.size() -1;
- final int top = bottom > MaxScopeSearchDepth ? bottom - MaxScopeSearchDepth : 0;
- // don't walk too far up the tree
-
- for (int pos = bottom; pos >= top; pos--) {
+ for (int pos = stack.size() - 1; pos >= 0; pos--) {
final String elName = stack.get(pos).normalName();
if (!inSorted(elName, allowedTags))
return true;
diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
index 75db959c0f..6abe964715 100644
--- a/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
+++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilderState.java
@@ -286,15 +286,12 @@ private boolean anythingElse(Token t, HtmlTreeBuilder tb) {
switch (t.type) {
case Character: {
Token.Character c = t.asCharacter();
- if (c.getData().equals(nullString)) {
- tb.error(this);
- return false;
- } else if (tb.framesetOk() && isWhitespace(c)) { // don't check if whitespace if frames already closed
+ if (tb.framesetOk() && isWhitespace(c)) { // don't check if whitespace if frames already closed
tb.reconstructFormattingElements();
tb.insertCharacterNode(c);
} else {
tb.reconstructFormattingElements();
- tb.insertCharacterNode(c);
+ tb.insertCharacterNode(c); // strips nulls
tb.framesetOk(false);
}
break;
@@ -382,7 +379,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
case "body":
tb.error(this);
stack = tb.getStack();
- if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nameIs("body")) || tb.onStack("template")) {
+ if (stack.size() < 2 || (stack.size() > 2 && !stack.get(1).nameIs("body")) || tb.onStack("template")) {
// only in fragment case
return false; // ignore
} else {
@@ -395,7 +392,7 @@ private boolean inBodyStartTag(Token t, HtmlTreeBuilder tb) {
case "frameset":
tb.error(this);
stack = tb.getStack();
- if (stack.size() == 1 || (stack.size() > 2 && !stack.get(1).nameIs("body"))) {
+ if (stack.size() < 2|| (stack.size() > 2 && !stack.get(1).nameIs("body"))) {
// only in fragment case
return false; // ignore
} else if (!tb.framesetOk()) {
@@ -924,7 +921,7 @@ private boolean inBodyEndTagAdoption(Token t, HtmlTreeBuilder tb) {
} else {
el = tb.aboveOnStack(el);
}
- if (el == null) {
+ if (el == null || el.nameIs("body")) {
tb.error(this); // shouldn't be able to hit
break;
}
@@ -945,6 +942,11 @@ private boolean inBodyEndTagAdoption(Token t, HtmlTreeBuilder tb) {
}
// 6. [Create an element for the token] for which the element node was created, in the [HTML namespace], with commonAncestor as the intended parent; replace the entry for node in the [list of active formatting elements] with an entry for the new element, replace the entry for node in the [stack of open elements] with an entry for the new element, and let node be the new element.
+ if (!tb.onStack(el)) { // stale formatting element; cannot adopt/replace
+ tb.error(this);
+ tb.removeFromActiveFormattingElements(el);
+ break; // exit inner loop; proceed with step 14 using current lastEl
+ }
Element replacement = new Element(tb.tagFor(el.nodeName(), el.normalName(), tb.defaultNamespace(), ParseSettings.preserveCase), tb.getBaseUri());
tb.replaceActiveFormattingElement(el, replacement);
tb.replaceOnStack(el, replacement);
@@ -1110,13 +1112,7 @@ boolean anythingElse(Token t, HtmlTreeBuilder tb) {
InTableText {
@Override boolean process(Token t, HtmlTreeBuilder tb) {
if (t.type == Token.TokenType.Character) {
- Token.Character c = t.asCharacter();
- if (c.getData().equals(nullString)) {
- tb.error(this);
- return false;
- } else {
- tb.addPendingTableCharacters(c);
- }
+ tb.addPendingTableCharacters(t.asCharacter()); // gets to insertCharacterNode, which strips nulls
} else {
// insert gathered table text into the correct element:
if (tb.getPendingTableCharacters().size() > 0) {
@@ -1449,13 +1445,7 @@ private void closeCell(HtmlTreeBuilder tb) {
switch (t.type) {
case Character:
- Token.Character c = t.asCharacter();
- if (c.getData().equals(nullString)) {
- tb.error(this);
- return false;
- } else {
- tb.insertCharacterNode(c);
- }
+ tb.insertCharacterNode(t.asCharacter());
break;
case Comment:
tb.insertCommentNode(t.asComment());
@@ -1485,7 +1475,11 @@ else if (name.equals("option")) {
tb.error(this);
if (!tb.inSelectScope("select"))
return false; // frag
- tb.processEndTag("select");
+ // spec says close select then reprocess; leads to recursion. iter directly:
+ do {
+ tb.popStackToClose("select");
+ tb.resetInsertionMode();
+ } while (tb.inSelectScope("select")); // collapse invalid nested selects
return tb.process(start);
} else if (name.equals("script") || name.equals("template")) {
return tb.process(t, InHead);
@@ -1693,7 +1687,7 @@ else if (name.equals("col")) {
return false;
}
} else if (t.isEndTag() && t.asEndTag().normalName().equals("frameset")) {
- if (tb.currentElementIs("html")) { // frag
+ if (!tb.currentElementIs("frameset")) { // spec checks if el is html; deviate to confirm we are about to pop the frameset el
tb.error(this);
return false;
} else {
@@ -1781,12 +1775,10 @@ else if (name.equals("col")) {
switch (t.type) {
case Character:
Token.Character c = t.asCharacter();
- if (c.getData().equals(nullString))
- tb.error(this);
- else if (HtmlTreeBuilderState.isWhitespace(c))
+ if (HtmlTreeBuilderState.isWhitespace(c))
tb.insertCharacterNode(c);
else {
- tb.insertCharacterNode(c);
+ tb.insertCharacterNode(c, true); // replace nulls
tb.framesetOk(false);
}
break;
@@ -1820,8 +1812,6 @@ else if (HtmlTreeBuilderState.isWhitespace(c))
tb.tokeniser.transition(TokeniserState.ScriptData);
else
tb.tokeniser.transition(textState);
- tb.markInsertionMode();
- tb.transition(Text);
}
break;
diff --git a/src/main/java/org/jsoup/parser/ParseSettings.java b/src/main/java/org/jsoup/parser/ParseSettings.java
index 5555223655..b31e5f8cce 100644
--- a/src/main/java/org/jsoup/parser/ParseSettings.java
+++ b/src/main/java/org/jsoup/parser/ParseSettings.java
@@ -75,11 +75,10 @@ public String normalizeAttribute(String name) {
return name;
}
- @Nullable Attributes normalizeAttributes(@Nullable Attributes attributes) {
- if (attributes != null && !preserveAttributeCase) {
+ void normalizeAttributes(Attributes attributes) {
+ if (!preserveAttributeCase) {
attributes.normalize();
}
- return attributes;
}
/** Returns the normal name that a Tag will have (trimmed and lower-cased) */
diff --git a/src/main/java/org/jsoup/parser/Parser.java b/src/main/java/org/jsoup/parser/Parser.java
index a171692bc8..5dd8eb7fd1 100644
--- a/src/main/java/org/jsoup/parser/Parser.java
+++ b/src/main/java/org/jsoup/parser/Parser.java
@@ -30,6 +30,7 @@ public class Parser implements Cloneable {
private boolean trackPosition = false;
private @Nullable TagSet tagSet;
private final ReentrantLock lock = new ReentrantLock();
+ private int maxDepth;
/**
* Create a new Parser, using the specified TreeBuilder
@@ -39,6 +40,7 @@ public Parser(TreeBuilder treeBuilder) {
this.treeBuilder = treeBuilder;
settings = treeBuilder.defaultSettings();
errors = ParseErrorList.noTracking();
+ maxDepth = treeBuilder.defaultMaxDepth();
}
/**
@@ -60,6 +62,8 @@ private Parser(Parser copy) {
errors = new ParseErrorList(copy.errors); // only copies size, not contents
settings = new ParseSettings(copy.settings);
trackPosition = copy.trackPosition;
+ maxDepth = copy.maxDepth;
+ tagSet = new TagSet(copy.tagSet());
}
/**
@@ -194,6 +198,28 @@ public ParseSettings settings() {
return settings;
}
+ /**
+ Set the parser's maximum stack depth (maximum number of open elements). When reached, new open elements will be
+ removed to prevent excessive nesting. Defaults to 512 for the HTML parser, and unlimited for the XML
+ parser.
+
+ @param maxDepth maximum parser depth; must be >= 1
+ @return this Parser, for chaining
+ */
+ public Parser setMaxDepth(int maxDepth) {
+ Validate.isTrue(maxDepth >= 1, "maxDepth must be >= 1");
+ this.maxDepth = maxDepth;
+ return this;
+ }
+
+ /**
+ * Get the maximum parser depth (maximum number of open elements).
+ * @return the current max parser depth
+ */
+ public int getMaxDepth() {
+ return maxDepth;
+ }
+
/**
Set a custom TagSet to use for this Parser. This allows you to define your own tags, and control how they are
parsed. For example, you can set a tag to preserve whitespace, or to be treated as a block tag.
@@ -295,26 +321,41 @@ public static Document parseBodyFragment(String bodyHtml, String baseUri) {
Document doc = Document.createShell(baseUri);
Element body = doc.body();
List nodeList = parseFragment(bodyHtml, body, baseUri);
- Node[] nodes = nodeList.toArray(new Node[0]); // the node list gets modified when re-parented
- for (int i = nodes.length - 1; i > 0; i--) {
- nodes[i].remove();
- }
- for (Node node : nodes) {
- body.appendChild(node);
- }
+ body.appendChildren(nodeList);
return doc;
}
/**
- * Utility method to unescape HTML entities from a string
- * @param string HTML escaped string
- * @param inAttribute if the string is to be escaped in strict mode (as attributes are)
- * @return an unescaped string
+ Utility method to unescape HTML entities from a string.
+ To track errors while unescaping, use
+ {@link #unescape(String, boolean)} with a Parser instance that has error tracking enabled.
+
+ @param string HTML escaped string
+ @param inAttribute if the string is to be escaped in strict mode (as attributes are)
+ @return an unescaped string
+ @see #unescape(String, boolean)
*/
public static String unescapeEntities(String string, boolean inAttribute) {
- Parser parser = Parser.htmlParser();
- parser.treeBuilder.initialiseParse(new StringReader(string), "", parser);
- Tokeniser tokeniser = new Tokeniser(parser.treeBuilder);
+ Validate.notNull(string);
+ if (string.indexOf('&') < 0) return string; // nothing to unescape
+ return Parser.htmlParser().unescape(string, inAttribute);
+ }
+
+ /**
+ Utility method to unescape HTML entities from a string, using this {@code Parser}'s configuration (for example, to
+ collect errors while unescaping).
+
+ @param string HTML escaped string
+ @param inAttribute if the string is to be escaped in strict mode (as attributes are)
+ @return an unescaped string
+ @see #setTrackErrors(int)
+ @see #unescapeEntities(String, boolean)
+ */
+ public String unescape(String string, boolean inAttribute) {
+ Validate.notNull(string);
+ if (string.indexOf('&') < 0) return string; // nothing to unescape
+ this.treeBuilder.initialiseParse(new StringReader(string), "", this);
+ Tokeniser tokeniser = new Tokeniser(this.treeBuilder);
return tokeniser.unescapeEntities(inAttribute);
}
@@ -335,6 +376,6 @@ public static Parser htmlParser() {
* @return a new simple XML parser.
*/
public static Parser xmlParser() {
- return new Parser(new XmlTreeBuilder());
+ return new Parser(new XmlTreeBuilder()).setMaxDepth(Integer.MAX_VALUE);
}
}
diff --git a/src/main/java/org/jsoup/parser/StreamParser.java b/src/main/java/org/jsoup/parser/StreamParser.java
index a4d8be3326..2e8827e435 100644
--- a/src/main/java/org/jsoup/parser/StreamParser.java
+++ b/src/main/java/org/jsoup/parser/StreamParser.java
@@ -47,8 +47,13 @@ Iterator interface. Elements returned will be complete with all their children,
stream consumers will throw an {@link java.io.UncheckedIOException} if the underlying Reader errors during read.
For examples, see the jsoup
StreamParser cookbook.
- @since 1.18.1
- */
+
+ Selectors that depend on knowing all siblings (e.g. {@code :last-child}, {@code :last-of-type}, {@code :nth-last-child},
+ {@code :only-child} and their negations) cannot be correctly evaluated while streaming, because the parser does not know
+ if a later sibling will appear. For those cases, run {@link #complete()} first to finish the parse (which is effectively
+ the same as using {@code Jsoup.parse(...)} unless you have already removed nodes during streaming).
+
+ @since 1.18.1 */
public class StreamParser implements Closeable {
final private Parser parser;
final private TreeBuilder treeBuilder;
diff --git a/src/main/java/org/jsoup/parser/Tag.java b/src/main/java/org/jsoup/parser/Tag.java
index 060fadf859..89f2777496 100644
--- a/src/main/java/org/jsoup/parser/Tag.java
+++ b/src/main/java/org/jsoup/parser/Tag.java
@@ -200,7 +200,7 @@ public Tag clear(int option) {
* @return The tag, either defined or new generic.
*/
public static Tag valueOf(String tagName, String namespace, ParseSettings settings) {
- return TagSet.Html().valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings.preserveTagCase());
+ return TagSet.Html().valueOf(tagName, null, namespace, settings.preserveTagCase());
}
/**
@@ -245,7 +245,7 @@ public boolean isBlock() {
Get if this is an InlineContainer tag.
@return true if an InlineContainer (which formats children as inline).
- @deprecated setting is only used within the Printer. Will be removed in a future release.
+ @deprecated internal pretty-printing flag; use {@link #isInline()} or {@link #isBlock()} to check layout intent. Will be removed in jsoup 1.24.1.
*/
@Deprecated public boolean formatAsBlock() {
return (options & InlineContainer) != 0;
diff --git a/src/main/java/org/jsoup/parser/TagSet.java b/src/main/java/org/jsoup/parser/TagSet.java
index c61a3359bc..efa9ca41fb 100644
--- a/src/main/java/org/jsoup/parser/TagSet.java
+++ b/src/main/java/org/jsoup/parser/TagSet.java
@@ -25,22 +25,48 @@ public class TagSet {
static final TagSet HtmlTagSet = initHtmlDefault();
private final Map> tags = new HashMap<>(); // namespace -> tag name -> Tag
- private final @Nullable TagSet source; // source to pull tags from on demand
+ private final @Nullable TagSet source; // internal fallback for lazy tag copies
private @Nullable ArrayList> customizers; // optional onNewTag tag customizer
/**
Returns a mutable copy of the default HTML tag set.
*/
public static TagSet Html() {
- return new TagSet(HtmlTagSet);
+ return new TagSet(HtmlTagSet, null);
+ }
+
+ private TagSet(@Nullable TagSet source, @Nullable ArrayList> customizers) {
+ this.source = source;
+ this.customizers = customizers;
}
public TagSet() {
- source = null;
+ this(null, null);
+ }
+
+ /**
+ Creates a new TagSet by copying the current tags and customizers from the provided source TagSet. Changes made to
+ one TagSet will not affect the other.
+ @param template the TagSet to copy
+ */
+ public TagSet(TagSet template) {
+ this(template.source, copyCustomizers(template));
+ // copy tags eagerly; any lazy pull-through should come only from the root source (which would be the HTML defaults), not the template itself.
+ // that way the template tagset is not mutated when we do read through
+ if (template.tags.isEmpty()) return;
+
+ for (Map.Entry> namespaceEntry : template.tags.entrySet()) {
+ Map nsTags = new HashMap<>(namespaceEntry.getValue().size());
+ for (Map.Entry tagEntry : namespaceEntry.getValue().entrySet()) {
+ nsTags.put(tagEntry.getKey(), tagEntry.getValue().clone());
+ }
+ tags.put(namespaceEntry.getKey(), nsTags);
+ }
}
- public TagSet(TagSet original) {
- this.source = original;
+ private static @Nullable ArrayList> copyCustomizers(TagSet base) {
+ if (base.customizers == null) return null;
+ return new ArrayList<>(base.customizers);
}
/**
@@ -103,8 +129,11 @@ private void doAdd(Tag tag) {
return null;
}
- /** Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes. */
- Tag valueOf(String tagName, String normalName, String namespace, boolean preserveTagCase) {
+ /**
+ Tag.valueOf with the normalName via the token.normalName, to save redundant lower-casing passes.
+ Provide a null normalName unless we already have one; will be normalized if required from tagName.
+ */
+ Tag valueOf(String tagName, @Nullable String normalName, String namespace, boolean preserveTagCase) {
Validate.notNull(tagName);
Validate.notNull(namespace);
tagName = tagName.trim();
@@ -113,6 +142,7 @@ Tag valueOf(String tagName, String normalName, String namespace, boolean preserv
if (tag != null) return tag;
// not found by tagName, try by normal
+ if (normalName == null) normalName = ParseSettings.normalName(tagName);
tagName = preserveTagCase ? tagName : normalName;
tag = get(normalName, namespace);
if (tag != null) {
@@ -141,7 +171,7 @@ Get a Tag by name from this TagSet. If not previously defined (unknown), returns
@return The tag, either defined or new generic.
*/
public Tag valueOf(String tagName, String namespace, ParseSettings settings) {
- return valueOf(tagName, ParseSettings.normalName(tagName), namespace, settings.preserveTagCase());
+ return valueOf(tagName, null, namespace, settings.preserveTagCase());
}
/**
@@ -206,7 +236,7 @@ static TagSet initHtmlDefault() {
String[] blockTags = {
"html", "head", "body", "frameset", "script", "noscript", "style", "meta", "link", "title", "frame",
"noframes", "section", "nav", "aside", "hgroup", "header", "footer", "p", "h1", "h2", "h3", "h4", "h5",
- "h6", "br", "button",
+ "h6", "button",
"ul", "ol", "pre", "div", "blockquote", "hr", "address", "figure", "figcaption", "form", "fieldset", "ins",
"del", "dl", "dt", "dd", "li", "table", "caption", "thead", "tfoot", "tbody", "colgroup", "col", "tr", "th",
"td", "video", "audio", "canvas", "details", "menu", "plaintext", "template", "article", "main",
diff --git a/src/main/java/org/jsoup/parser/Token.java b/src/main/java/org/jsoup/parser/Token.java
index 25ff30eb76..6d6882ea93 100644
--- a/src/main/java/org/jsoup/parser/Token.java
+++ b/src/main/java/org/jsoup/parser/Token.java
@@ -415,6 +415,20 @@ public String toString() {
return getData();
}
+ /**
+ Normalize null chars in the data. If replace is true, replaces with the replacement char; if false, removes.
+ */
+ public void normalizeNulls(boolean replace) {
+ String data = this.data.value();
+ if (data.indexOf(TokeniserState.nullChar) == -1) return;
+
+ data = (replace ?
+ data.replace(TokeniserState.nullChar, Tokeniser.replacementChar) :
+ data.replace(nullString, ""));
+ this.data.set(data);
+ }
+
+ private static final String nullString = String.valueOf(TokeniserState.nullChar);
}
final static class CData extends Character {
diff --git a/src/main/java/org/jsoup/parser/TreeBuilder.java b/src/main/java/org/jsoup/parser/TreeBuilder.java
index 69110277f4..6b1e617c3f 100644
--- a/src/main/java/org/jsoup/parser/TreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/TreeBuilder.java
@@ -11,11 +11,8 @@
import org.jspecify.annotations.Nullable;
import java.io.Reader;
-import java.io.StringReader;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
import static org.jsoup.parser.Parser.NamespaceHtml;
@@ -174,6 +171,33 @@ final void push(Element element) {
onNodeInserted(element);
}
+ /**
+ Ensures the stack respects {@link Parser#getMaxDepth()} by closing the deepest open elements until there is room for
+ a new insertion.
+ */
+ final void enforceStackDepthLimit() {
+ final int maxDepth = parser.getMaxDepth();
+ if (maxDepth == Integer.MAX_VALUE) return;
+ while (stack.size() >= maxDepth) {
+ Element trimmed = pop();
+ onStackPrunedForDepth(trimmed);
+ }
+ }
+
+ /**
+ Hook for the HTML Tree Builder that needs to clean up when an element is removed due to the depth limit
+ */
+ void onStackPrunedForDepth(Element element) {
+ // default no-op
+ }
+
+ /**
+ Default maximum depth for parsers using this tree builder.
+ */
+ int defaultMaxDepth() {
+ return 512;
+ }
+
/**
Get the current element (last on the stack). If all items have been removed, returns the document instead
(which might not actually be on the stack; use stack.size() == 0 to test if required.
diff --git a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
index befe929f63..796f16952c 100644
--- a/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
+++ b/src/main/java/org/jsoup/parser/XmlTreeBuilder.java
@@ -106,6 +106,11 @@ TagSet defaultTagSet() {
return new TagSet(); // an empty tagset
}
+ @Override
+ int defaultMaxDepth() {
+ return Integer.MAX_VALUE;
+ }
+
@Override
protected boolean process(Token token) {
currentToken = token;
@@ -145,15 +150,18 @@ void insertElementFor(Token.StartTag startTag) {
Attributes attributes = startTag.attributes;
if (attributes != null) {
+ settings.normalizeAttributes(attributes);
attributes.deduplicate(settings);
processNamespaces(attributes, namespaces);
applyNamespacesToAttributes(attributes, namespaces);
}
+ enforceStackDepthLimit();
+
String tagName = startTag.tagName.value();
String ns = resolveNamespace(tagName, namespaces);
Tag tag = tagFor(tagName, startTag.normalName, ns, settings);
- Element el = new Element(tag, null, settings.normalizeAttributes(attributes));
+ Element el = new Element(tag, null, attributes);
currentElement().appendChild(el);
push(el);
diff --git a/src/main/java/org/jsoup/safety/Safelist.java b/src/main/java/org/jsoup/safety/Safelist.java
index 486a23f67f..cf76c08d3b 100644
--- a/src/main/java/org/jsoup/safety/Safelist.java
+++ b/src/main/java/org/jsoup/safety/Safelist.java
@@ -6,7 +6,6 @@ Thank you to Ryan Grove (wonko.com) for the Ruby HTML cleaner http://github.com/
*/
import org.jsoup.helper.Validate;
-import org.jsoup.internal.Functions;
import org.jsoup.internal.Normalizer;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Attributes;
@@ -306,7 +305,7 @@ public Safelist addAttributes(String tag, String... attributes) {
Validate.notEmpty(key);
attributeSet.add(AttributeKey.valueOf(key));
}
- Set currentSet = this.attributes.computeIfAbsent(tagName, Functions.setFunction());
+ Set currentSet = this.attributes.computeIfAbsent(tagName, k -> new HashSet<>());
currentSet.addAll(attributeSet);
return this;
}
@@ -380,7 +379,7 @@ public Safelist addEnforcedAttribute(String tag, String attribute, String value)
AttributeKey attrKey = AttributeKey.valueOf(attribute);
AttributeValue attrVal = AttributeValue.valueOf(value);
- Map attrMap = enforcedAttributes.computeIfAbsent(tagName, Functions.mapFunction());
+ Map attrMap = enforcedAttributes.computeIfAbsent(tagName, k -> new HashMap<>());
attrMap.put(attrKey, attrVal);
return this;
}
@@ -453,8 +452,8 @@ public Safelist addProtocols(String tag, String attribute, String... protocols)
TagName tagName = TagName.valueOf(tag);
AttributeKey attrKey = AttributeKey.valueOf(attribute);
- Map> attrMap = this.protocols.computeIfAbsent(tagName, Functions.mapFunction());
- Set protSet = attrMap.computeIfAbsent(attrKey, Functions.setFunction());
+ Map> attrMap = this.protocols.computeIfAbsent(tagName, k -> new HashMap<>());
+ Set protSet = attrMap.computeIfAbsent(attrKey, k -> new HashSet<>());
for (String protocol : protocols) {
Validate.notEmpty(protocol);
diff --git a/src/main/java/org/jsoup/select/Collector.java b/src/main/java/org/jsoup/select/Collector.java
index db1f672051..74ce0f83d0 100644
--- a/src/main/java/org/jsoup/select/Collector.java
+++ b/src/main/java/org/jsoup/select/Collector.java
@@ -31,8 +31,9 @@ public static Elements collect(Evaluator eval, Element root) {
Stream stream = eval.wantsNodes() ?
streamNodes(eval, root, Element.class) :
stream(eval, root);
-
- return stream.collect(toCollection(Elements::new));
+ Elements els = stream.collect(toCollection(Elements::new));
+ eval.reset(); // drops any held memos
+ return els;
}
/**
@@ -72,7 +73,9 @@ public static Stream streamNodes(Evaluator evaluator, Elemen
@return the first match; {@code null} if none
*/
public static @Nullable Element findFirst(Evaluator eval, Element root) {
- return stream(eval, root).findFirst().orElse(null);
+ Element el = stream(eval, root).findFirst().orElse(null);
+ eval.reset();
+ return el;
}
/**
@@ -86,7 +89,9 @@ public static Stream streamNodes(Evaluator evaluator, Elemen
@since 1.21.1
*/
public static @Nullable T findFirstNode(Evaluator eval, Element root, Class type) {
- return streamNodes(eval, root, type).findFirst().orElse(null);
+ T node = streamNodes(eval, root, type).findFirst().orElse(null);
+ eval.reset();
+ return node;
}
/**
diff --git a/src/main/java/org/jsoup/select/Evaluator.java b/src/main/java/org/jsoup/select/Evaluator.java
index 8c93fbaacf..26ea8a5f2a 100644
--- a/src/main/java/org/jsoup/select/Evaluator.java
+++ b/src/main/java/org/jsoup/select/Evaluator.java
@@ -11,10 +11,10 @@
import org.jsoup.nodes.TextNode;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.ParseSettings;
+import org.jsoup.helper.Regex;
import java.util.List;
import java.util.function.Predicate;
-import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static org.jsoup.internal.Normalizer.lowerCase;
@@ -272,7 +272,7 @@ public AttributeWithValue(String key, String value) {
@Override
public boolean matches(Element root, Element element) {
- return element.hasAttr(key) && value.equalsIgnoreCase(element.attr(key).trim());
+ return element.hasAttr(key) && value.equalsIgnoreCase(element.attr(key));
}
@Override protected int cost() {
@@ -315,7 +315,7 @@ public String toString() {
*/
public static final class AttributeWithValueStarting extends AttributeKeyPair {
public AttributeWithValueStarting(String key, String value) {
- super(key, value, false);
+ super(key, value);
}
@Override
@@ -338,7 +338,7 @@ public String toString() {
*/
public static final class AttributeWithValueEnding extends AttributeKeyPair {
public AttributeWithValueEnding(String key, String value) {
- super(key, value, false);
+ super(key, value);
}
@Override
@@ -385,13 +385,17 @@ public String toString() {
*/
public static final class AttributeWithValueMatching extends Evaluator {
final String key;
- final Pattern pattern;
+ final Regex pattern;
- public AttributeWithValueMatching(String key, Pattern pattern) {
+ public AttributeWithValueMatching(String key, Regex pattern) {
this.key = normalize(key);
this.pattern = pattern;
}
+ public AttributeWithValueMatching(String key, Pattern pattern) {
+ this(key, Regex.fromPattern(pattern)); // api compat
+ }
+
@Override
public boolean matches(Element root, Element element) {
return element.hasAttr(key) && pattern.matcher(element.attr(key)).find();
@@ -416,26 +420,30 @@ public abstract static class AttributeKeyPair extends Evaluator {
final String value;
public AttributeKeyPair(String key, String value) {
- this(key, value, true);
- }
-
- public AttributeKeyPair(String key, String value, boolean trimQuoted) {
Validate.notEmpty(key);
- Validate.notEmpty(value);
+ Validate.notNull(value);
this.key = normalize(key);
boolean quoted = value.startsWith("'") && value.endsWith("'")
|| value.startsWith("\"") && value.endsWith("\"");
- if (quoted)
+ if (quoted) {
+ Validate.isTrue(value.length() > 1, "Quoted value must have content");
value = value.substring(1, value.length() - 1);
+ }
+
+ this.value = lowerCase(value); // case-insensitive match
+ }
- // normalize value based on whether it was quoted and trimQuoted flag
- // keeps whitespace for attribute val starting or ending, when quoted
- if (trimQuoted || !quoted)
- this.value = normalize(value); // lowercase and trims
- else
- this.value = lowerCase(value); // only lowercase
+ /**
+ @deprecated since 1.22.1, use {@link #AttributeKeyPair(String, String)}; the previous trimQuoted parameter is no longer used.
+ This constructor will be removed in jsoup 1.24.1.
+ */
+ @Deprecated
+ public AttributeKeyPair(String key, String value, boolean ignored) {
+ this(key, value);
}
+
+
}
/**
@@ -922,16 +930,19 @@ public String toString() {
* Evaluator for matching Element (and its descendants) text with regex
*/
public static final class Matches extends Evaluator {
- private final Pattern pattern;
+ private final Regex pattern;
- public Matches(Pattern pattern) {
+ public Matches(Regex pattern) {
this.pattern = pattern;
}
+ public Matches(Pattern pattern) {
+ this(Regex.fromPattern(pattern));
+ }
+
@Override
public boolean matches(Element root, Element element) {
- Matcher m = pattern.matcher(element.text());
- return m.find();
+ return pattern.matcher(element.text()).find();
}
@Override protected int cost() {
@@ -948,16 +959,19 @@ public String toString() {
* Evaluator for matching Element's own text with regex
*/
public static final class MatchesOwn extends Evaluator {
- private final Pattern pattern;
+ private final Regex pattern;
- public MatchesOwn(Pattern pattern) {
+ public MatchesOwn(Regex pattern) {
this.pattern = pattern;
}
+ public MatchesOwn(Pattern pattern) {
+ this(Regex.fromPattern(pattern));
+ }
+
@Override
public boolean matches(Element root, Element element) {
- Matcher m = pattern.matcher(element.ownText());
- return m.find();
+ return pattern.matcher(element.ownText()).find();
}
@Override protected int cost() {
@@ -975,16 +989,19 @@ public String toString() {
* @since 1.15.1.
*/
public static final class MatchesWholeText extends Evaluator {
- private final Pattern pattern;
+ private final Regex pattern;
- public MatchesWholeText(Pattern pattern) {
+ public MatchesWholeText(Regex pattern) {
this.pattern = pattern;
}
+ public MatchesWholeText(Pattern pattern) {
+ this.pattern = Regex.fromPattern(pattern);
+ }
+
@Override
public boolean matches(Element root, Element element) {
- Matcher m = pattern.matcher(element.wholeText());
- return m.find();
+ return pattern.matcher(element.wholeText()).find();
}
@Override protected int cost() {
@@ -1002,15 +1019,19 @@ public String toString() {
* @since 1.15.1.
*/
public static final class MatchesWholeOwnText extends Evaluator {
- private final Pattern pattern;
+ private final Regex pattern;
- public MatchesWholeOwnText(Pattern pattern) {
+ public MatchesWholeOwnText(Regex pattern) {
this.pattern = pattern;
}
+ public MatchesWholeOwnText(Pattern pattern) {
+ this(Regex.fromPattern(pattern));
+ }
+
@Override
public boolean matches(Element root, Element element) {
- Matcher m = pattern.matcher(element.wholeOwnText());
+ Regex.Matcher m = pattern.matcher(element.wholeOwnText());
return m.find();
}
@@ -1025,7 +1046,7 @@ public String toString() {
}
/**
- @deprecated This selector is deprecated and will be removed in a future version. Migrate to ::textnode using the Element#selectNodes() method instead.
+ @deprecated This selector is deprecated and will be removed in jsoup 1.24.1. Migrate to ::textnode using the Element#selectNodes() method instead.
*/
@Deprecated
public static final class MatchText extends Evaluator {
@@ -1035,7 +1056,7 @@ public MatchText() {
// log a deprecated error on first use; users typically won't directly construct this Evaluator and so won't otherwise get deprecation warnings
if (!loggedError) {
loggedError = true;
- System.err.println("WARNING: :matchText selector is deprecated and will be removed in a future version. Use Element#selectNodes(String, Class) with selector ::textnode and class TextNode instead.");
+ System.err.println("WARNING: :matchText selector is deprecated and will be removed in jsoup 1.24.1. Use Element#selectNodes(String, Class) with selector ::textnode and class TextNode instead.");
}
}
diff --git a/src/main/java/org/jsoup/select/NodeEvaluator.java b/src/main/java/org/jsoup/select/NodeEvaluator.java
index cfaee79cdf..6ceff07c75 100644
--- a/src/main/java/org/jsoup/select/NodeEvaluator.java
+++ b/src/main/java/org/jsoup/select/NodeEvaluator.java
@@ -4,8 +4,7 @@
import org.jsoup.nodes.Element;
import org.jsoup.nodes.LeafNode;
import org.jsoup.nodes.Node;
-
-import java.util.regex.Pattern;
+import org.jsoup.helper.Regex;
import static org.jsoup.internal.Normalizer.lowerCase;
import static org.jsoup.internal.StringUtil.normaliseWhitespace;
@@ -98,9 +97,9 @@ public String toString() {
}
static class MatchesValue extends NodeEvaluator {
- private final Pattern pattern;
+ private final Regex pattern;
- protected MatchesValue(Pattern pattern) {
+ protected MatchesValue(Regex pattern) {
this.pattern = pattern;
}
diff --git a/src/main/java/org/jsoup/select/QueryParser.java b/src/main/java/org/jsoup/select/QueryParser.java
index c29a021647..205b50a452 100644
--- a/src/main/java/org/jsoup/select/QueryParser.java
+++ b/src/main/java/org/jsoup/select/QueryParser.java
@@ -1,5 +1,6 @@
package org.jsoup.select;
+import org.jsoup.helper.Regex;
import org.jsoup.internal.StringUtil;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.CDataNode;
@@ -344,7 +345,9 @@ private Evaluator byAttribute() {
private Evaluator evaluatorForAttribute(TokenQueue cq) {
String key = cq.consumeToAny(AttributeEvals); // eq, not, start, end, contain, match, (no val)
+ key = normalize(key);
Validate.notEmpty(key);
+ Validate.isFalse(key.equals("abs:"), "Absolute attribute key must have a name");
cq.consumeWhitespace();
final Evaluator eval;
@@ -367,7 +370,7 @@ else if (cq.matchChomp("$="))
else if (cq.matchChomp("*="))
eval = new Evaluator.AttributeWithValueContaining(key, cq.remainder());
else if (cq.matchChomp("~="))
- eval = new Evaluator.AttributeWithValueMatching(key, Pattern.compile(cq.remainder()));
+ eval = new Evaluator.AttributeWithValueMatching(key, Regex.compile(cq.remainder()));
else
throw new Selector.SelectorParseException(
"Could not parse attribute query '%s': unexpected token at '%s'", query, cq.remainder());
@@ -472,7 +475,7 @@ private Evaluator matches(boolean own) {
String query = own ? ":matchesOwn" : ":matches";
String regex = consumeParens(); // don't unescape, as regex bits will be escaped
Validate.notEmpty(regex, query + "(regex) query must not be empty");
- Pattern pattern = Pattern.compile(regex);
+ Regex pattern = Regex.compile(regex);
if (inNodeContext)
return new NodeEvaluator.MatchesValue(pattern);
@@ -488,9 +491,10 @@ private Evaluator matchesWholeText(boolean own) {
String regex = consumeParens(); // don't unescape, as regex bits will be escaped
Validate.notEmpty(regex, query + "(regex) query must not be empty");
+ Regex pattern = Regex.compile(regex);
return own
- ? new Evaluator.MatchesWholeOwnText(Pattern.compile(regex))
- : new Evaluator.MatchesWholeText(Pattern.compile(regex));
+ ? new Evaluator.MatchesWholeOwnText(pattern)
+ : new Evaluator.MatchesWholeText(pattern);
}
// :not(selector)
diff --git a/src/main/java/org/jsoup/select/StructuralEvaluator.java b/src/main/java/org/jsoup/select/StructuralEvaluator.java
index ed01acf859..6998bf82aa 100644
--- a/src/main/java/org/jsoup/select/StructuralEvaluator.java
+++ b/src/main/java/org/jsoup/select/StructuralEvaluator.java
@@ -1,6 +1,5 @@
package org.jsoup.select;
-import org.jsoup.internal.Functions;
import org.jsoup.internal.SoftPool;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Element;
@@ -10,8 +9,8 @@
import org.jsoup.nodes.TextNode;
import java.util.ArrayList;
-import java.util.IdentityHashMap;
import java.util.Map;
+import java.util.WeakHashMap;
/**
* Base structural evaluator.
@@ -32,17 +31,16 @@ boolean wantsNodes() {
// Memoize inner matches, to save repeated re-evaluations of parent, sibling etc.
// root + element: Boolean matches. ThreadLocal in case the Evaluator is compiled then reused across multi threads
- final ThreadLocal>>
- threadMemo = ThreadLocal.withInitial(IdentityHashMap::new);
+ final ThreadLocal>> threadMemo = ThreadLocal.withInitial(WeakHashMap::new);
boolean memoMatches(final Element root, final Node node) {
- Map> rootMemo = threadMemo.get();
- Map memo = rootMemo.computeIfAbsent(root, Functions.identityMapFunction());
- return memo.computeIfAbsent(node, key -> evaluator.matches(root, key));
+ Map> rootMemo = threadMemo.get();
+ Map memo = rootMemo.computeIfAbsent(root, r -> new WeakHashMap<>());
+ return memo.computeIfAbsent(node, test -> evaluator.matches(root, test));
}
@Override protected void reset() {
- threadMemo.get().clear();
+ threadMemo.remove();
evaluator.reset();
super.reset();
}
diff --git a/src/main/java11/org/jsoup/helper/HttpClientExecutor.java b/src/main/java11/org/jsoup/helper/HttpClientExecutor.java
index f36b325bb4..b67eeb00ae 100644
--- a/src/main/java11/org/jsoup/helper/HttpClientExecutor.java
+++ b/src/main/java11/org/jsoup/helper/HttpClientExecutor.java
@@ -31,7 +31,7 @@ class HttpClientExecutor extends RequestExecutor {
// HttpClient expects proxy settings per client; we do per request, so held as a thread local. Can't do same for
// auth because that callback is on a worker thread, so can only do auth per Connection. So we create a new client
// if the authenticator is different between requests
- static ThreadLocal perRequestProxy = new ThreadLocal<>();
+ static ThreadLocal<@Nullable Proxy> perRequestProxy = new ThreadLocal<>();
@Nullable
HttpResponse hRes;
@@ -45,17 +45,25 @@ public HttpClientExecutor(HttpConnection.Request request, HttpConnection.@Nullab
same Connection (session).
*/
HttpClient client() {
- // we try to reuse the same Client across requests in a given Connection; but if the request auth has changed, we need to create a new client
- RequestAuthenticator prevAuth = req.connection.lastAuth;
- req.connection.lastAuth = req.authenticator;
- if (req.connection.client != null && prevAuth == req.authenticator) { // might both be null
- return (HttpClient) req.connection.client;
+ // we try to reuse the same Client across requests in a given Connection; but if the request's auth or ssl context have changed, we need to create a new client
+ if (req.connection.client != null) {
+ HttpClient client = (HttpClient) req.connection.client;
+ boolean reuse = true;
+
+ RequestAuthenticator prevAuth = req.connection.lastAuth;
+ req.connection.lastAuth = req.authenticator;
+ if (prevAuth != req.authenticator) // might both be null
+ reuse = false;
+ if (req.sslContext != null && !(client.sslContext() == req.sslContext)) // client returns default context if not otherwise set
+ reuse = false;
+ if (reuse) return client;
}
HttpClient.Builder builder = HttpClient.newBuilder();
builder.followRedirects(HttpClient.Redirect.NEVER); // customized redirects
builder.proxy(new ProxyWrap()); // thread local impl for per request; called on executing thread
if (req.authenticator != null) builder.authenticator(new AuthenticationHandler(req.authenticator));
+ if (req.sslContext != null) builder.sslContext(req.sslContext);
HttpClient client = builder.build();
req.connection.client = client;
@@ -155,12 +163,25 @@ static class ProxyWrap extends ProxySelector {
@Override
public List select(URI uri) {
Proxy proxy = perRequestProxy.get();
- return proxy != null ? Collections.singletonList(proxy) : NoProxy;
+ if (proxy != null) {
+ return Collections.singletonList(proxy);
+ }
+ ProxySelector defaultSelector = ProxySelector.getDefault();
+ if (defaultSelector != null && defaultSelector != this) { // avoid recursion if we were set as default
+ return defaultSelector.select(uri);
+ }
+ return NoProxy;
}
@Override
public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {
- // no-op
+ if (perRequestProxy.get() != null) {
+ return; // no-op
+ }
+ ProxySelector defaultSelector = ProxySelector.getDefault();
+ if (defaultSelector != null && defaultSelector != this) {
+ defaultSelector.connectFailed(uri, sa, ioe);
+ }
}
}
}
diff --git a/src/main/resources/META-INF/proguard/org.jsoup_jsoup.pro b/src/main/resources/META-INF/proguard/org.jsoup_jsoup.pro
new file mode 100644
index 0000000000..4ac0fee6b7
--- /dev/null
+++ b/src/main/resources/META-INF/proguard/org.jsoup_jsoup.pro
@@ -0,0 +1 @@
+-dontwarn com.google.re2j.**
diff --git a/src/test/java/org/jsoup/helper/DataUtilTest.java b/src/test/java/org/jsoup/helper/DataUtilTest.java
index 2eab9771fa..f557d2895c 100644
--- a/src/test/java/org/jsoup/helper/DataUtilTest.java
+++ b/src/test/java/org/jsoup/helper/DataUtilTest.java
@@ -3,9 +3,7 @@
import org.jsoup.Jsoup;
import org.jsoup.integration.ParseTest;
import org.jsoup.internal.ControllableInputStream;
-import org.jsoup.internal.SimpleStreamReader;
import org.jsoup.nodes.Document;
-import org.jsoup.parser.CharacterReader;
import org.jsoup.parser.Parser;
import org.jsoup.parser.StreamParser;
import org.junit.jupiter.api.Test;
@@ -361,4 +359,63 @@ void handlesUnlimitedRead() throws IOException {
Document doc = Jsoup.parse(ParseTest.getPath("/fuzztests/2353.html.gz"));
assertTrue(doc.html().contains("Read-Fully!"));
}
+
+ @Test
+ void charsetSniffingCanReuseTruncatedPreParse() throws IOException {
+ // #2448: when available() reports buffered bytes after the first read, the sniffed pre-parse may be reused while capped, leading to truncation
+
+ StringBuilder sb = new StringBuilder();
+ sb.append("t ");
+ while (sb.length() < 6200) {
+ sb.append("0123456789 abcdefghijklmnopqrstuvwxyz\n");
+ }
+ sb.append(" list ");
+ String html = sb.toString();
+
+
+ byte[] bytes = html.getBytes(StandardCharsets.UTF_8);
+ ControllableInputStream in = ControllableInputStream.wrap(new BufferedOnceAvailableStream(bytes), 0);
+
+ DataUtil.CharsetDoc charsetDoc = DataUtil.detectCharset(in, null, "http://example.com/", Parser.htmlParser());
+ Document doc = DataUtil.parseInputStream(charsetDoc, "http://example.com/", Parser.htmlParser());
+
+ assertNotNull(doc.selectFirst("hr"), "hr should survive the sniff + full parse");
+ }
+
+ // delivers all bytes in the first read, then signals available()>0 once to trigger a second read and baseReadFully=true
+ static final class BufferedOnceAvailableStream extends InputStream {
+ private final byte[] data;
+ private int pos = 0;
+ private boolean extraSignal = true;
+
+ BufferedOnceAvailableStream(byte[] data) {
+ this.data = data;
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) {
+ if (pos >= data.length) return -1;
+ int take = Math.min(len, data.length - pos);
+ System.arraycopy(data, pos, b, off, take);
+ pos += take;
+ return take;
+ }
+
+ @Override
+ public int read() {
+ return pos < data.length ? (data[pos++] & 0xff) : -1;
+ }
+
+ @Override
+ public int available() {
+ if (pos < data.length)
+ return data.length - pos;
+ if (extraSignal) {
+ extraSignal = false;
+ return 1; // nudge SimpleBufferedInput.fill() to try another read
+ }
+ return 0;
+ }
+ }
+
}
diff --git a/src/test/java/org/jsoup/helper/HttpConnectionTest.java b/src/test/java/org/jsoup/helper/HttpConnectionTest.java
index 2ef4f1ab18..a26d18cad2 100644
--- a/src/test/java/org/jsoup/helper/HttpConnectionTest.java
+++ b/src/test/java/org/jsoup/helper/HttpConnectionTest.java
@@ -8,6 +8,7 @@
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
+import javax.net.ssl.HostnameVerifier;
import java.io.IOException;
import java.net.Authenticator;
import java.net.MalformedURLException;
@@ -258,6 +259,13 @@ public void caseInsensitiveHeaders(Locale locale) {
assertEquals("foo", con.request().requestBody());
}
+ @Test public void hostnameVerifier() {
+ Connection con = HttpConnection.connect("http://example.com/");
+ HostnameVerifier hostnameVerifier = (hostname, session) -> false;
+ con.hostnameVerifier(hostnameVerifier);
+ assertEquals(hostnameVerifier, con.request().hostnameVerifier());
+ }
+
@Test public void encodeUrl() throws MalformedURLException {
URL url1 = new URL("https://test.com/foo%20bar/%5BOne%5D?q=white+space#frag");
URL url2 = new UrlBuilder(url1).build();
diff --git a/src/test/java/org/jsoup/helper/RegexTest.java b/src/test/java/org/jsoup/helper/RegexTest.java
new file mode 100644
index 0000000000..14398df01d
--- /dev/null
+++ b/src/test/java/org/jsoup/helper/RegexTest.java
@@ -0,0 +1,75 @@
+package org.jsoup.helper;
+
+import org.jsoup.select.QueryParser;
+import org.jsoup.select.Selector;
+import org.junit.jupiter.api.AfterEach;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+public class RegexTest {
+
+ private boolean originalUseRe2j; // track original setting
+
+ @BeforeEach
+ void setUp() {
+ originalUseRe2j = Regex.wantsRe2j();
+ }
+
+ @AfterEach
+ void tearDown() {
+ Regex.wantsRe2j(originalUseRe2j); // restore original setting
+ }
+
+ @ParameterizedTest
+ @ValueSource(booleans = {false, true})
+ void testRegexDelegates(boolean useRe2j) {
+ Regex.wantsRe2j(useRe2j);
+ assertEquals(Regex.usingRe2j(), useRe2j);
+ String pattern = "(\\d+)";
+ String input = "12345";
+
+ Regex regex = Regex.compile(pattern);
+ Regex.Matcher matcher = regex.matcher(input);
+ assertTrue(matcher.find());
+ }
+
+ @Test
+ void jdkSupportsBackreferenceMatches() {
+ Regex.wantsRe2j(false);
+ String pattern = "(\\w+)\\s+\\1"; // backreference to group 1
+ String input = "hello hello";
+
+ Regex regex = Regex.compile(pattern);
+ Regex.Matcher matcher = regex.matcher(input);
+ assertTrue(matcher.find());
+ }
+
+ @Test
+ void re2jRejectsBackreferenceThrows() {
+ Regex.wantsRe2j(true);
+ String pattern = "(\\w+)\\s+\\1"; // backreference unsupported by RE2J
+
+ assertThrows(ValidationException.class, () -> Regex.compile(pattern));
+ // and not the rej2 PatternSyntaxException
+ }
+
+ @ParameterizedTest
+ @ValueSource(booleans = {false, true})
+ void queryParserThrowsSelectorExceptionOnMalformedRegex(boolean useRe2j) {
+ Regex.wantsRe2j(useRe2j);
+ String query = "[attr~=(unclosed]";
+
+ boolean threw = false;
+ try {
+ QueryParser.parse(query);
+ } catch (Selector.SelectorParseException e) {
+ threw = true;
+ assertTrue(e.getMessage().contains("Pattern syntax error"));
+ }
+ assertTrue(threw);
+ }
+}
diff --git a/src/test/java/org/jsoup/integration/FuzzFixesTest.java b/src/test/java/org/jsoup/integration/FuzzFixesTest.java
index 0fd668f256..ebb2c2b55b 100644
--- a/src/test/java/org/jsoup/integration/FuzzFixesTest.java
+++ b/src/test/java/org/jsoup/integration/FuzzFixesTest.java
@@ -2,6 +2,7 @@
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
+import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
@@ -47,11 +48,25 @@ public void bookmark() {
assertNotNull(xmlDoc);
}
+ @Test void fragment() {
+ Parser.htmlParser().parseFragmentInput(">l\u0000<\u0000\u0000< \\", new Element("colgroup"), "");
+ }
+
@ParameterizedTest
@MethodSource("testFiles")
void testHtmlParse(File file) throws IOException {
Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/");
assertNotNull(doc);
+ doc = Jsoup.parse(file, "UTF-8", ""); // no base href attr; so same as a parse(string), which can have subtly different semantics
+ assertNotNull(doc);
+ }
+
+ @ParameterizedTest
+ @MethodSource("testFiles")
+ void testHtmlFragmentParse(File file) throws IOException {
+ String html = ParseTest.getFileAsString(file);
+ Document doc = Jsoup.parseBodyFragment(html);
+ assertNotNull(doc);
}
@ParameterizedTest
@@ -59,5 +74,6 @@ void testHtmlParse(File file) throws IOException {
void testXmlParse(File file) throws IOException {
Document doc = Jsoup.parse(file, "UTF-8", "https://example.com/", Parser.xmlParser());
assertNotNull(doc);
+ doc = Jsoup.parse(file, "UTF-8", "", Parser.xmlParser()); // no base href attr
}
}
diff --git a/src/test/java/org/jsoup/integration/ProxyTest.java b/src/test/java/org/jsoup/integration/ProxyTest.java
index 18c1a60b49..9eb1e15d16 100644
--- a/src/test/java/org/jsoup/integration/ProxyTest.java
+++ b/src/test/java/org/jsoup/integration/ProxyTest.java
@@ -100,7 +100,9 @@ void canAuthenticateToProxy(String url) throws IOException {
// the proxy wants auth, but not the server. HTTP and HTTPS, so tests direct proxy and CONNECT
Connection session = Jsoup.newSession()
- .proxy(proxy.hostname, proxy.authedPort).ignoreHttpErrors(true);
+ .proxy(proxy.hostname, proxy.authedPort)
+ .ignoreHttpErrors(true)
+ .ignoreContentType(true); // ignore content type, as error served may not have a content type
String password = AuthFilter.newProxyPassword();
// fail first
@@ -110,8 +112,7 @@ void canAuthenticateToProxy(String url) throws IOException {
int code = execute.statusCode(); // no auth sent
assertEquals(HttpServletResponse.SC_PROXY_AUTHENTICATION_REQUIRED, code);
} catch (IOException e) {
- // in CONNECT (for the HTTPS url), URLConnection will throw the proxy connect as a Stringly typed IO exception - "Unable to tunnel through proxy. Proxy returns "HTTP/1.1 407 Proxy Authentication Required"". (Not a response code)
- assertTrue(e.getMessage().contains("407"));
+ assertAuthRequiredException(e);
}
try {
@@ -125,7 +126,7 @@ void canAuthenticateToProxy(String url) throws IOException {
assertEquals(MaxAttempts, count.get());
assertEquals(HttpServletResponse.SC_PROXY_AUTHENTICATION_REQUIRED, res.statusCode());
} catch (IOException e) {
- assertTrue(e.getMessage().contains("407"));
+ assertAuthRequiredException(e);
}
AtomicInteger successCount = new AtomicInteger(0);
@@ -139,6 +140,18 @@ void canAuthenticateToProxy(String url) throws IOException {
assertEquals(HttpServletResponse.SC_OK, successRes.statusCode());
}
+ static void assertAuthRequiredException(IOException e) {
+ // in CONNECT (for the HTTPS url), URLConnection will throw the proxy connect as a Stringly typed IO exception - "Unable to tunnel through proxy. Proxy returns "HTTP/1.1 407 Proxy Authentication Required"". (Not a response code)
+ // Alternatively, some platforms (?) will report: "No credentials provided"
+ String err = e.getMessage();
+ if (!(err.contains("407") || err.contains("No credentials provided") || err.contains("exch.exchImpl"))) {
+ // https://github.com/jhy/jsoup/pull/2403 - Ubuntu Azul 25 throws `Cannot invoke "jdk.internal.net.http.ExchangeImpl.cancel(java.io.IOException)" because "exch.exchImpl" is null` here but is just from cancelling the 407 req
+ System.err.println("Not a 407 exception? " + e.getClass());
+ e.printStackTrace(System.err);
+ fail("Expected 407 Proxy Authentication Required, got: " + err);
+ }
+ }
+
@ParameterizedTest @MethodSource("echoUrls")
void canAuthToProxyAndServer(String url) throws IOException {
String serverPassword = AuthFilter.newServerPassword();
diff --git a/src/test/java/org/jsoup/integration/SessionTest.java b/src/test/java/org/jsoup/integration/SessionTest.java
index 4ae5c5e623..a95da59bae 100644
--- a/src/test/java/org/jsoup/integration/SessionTest.java
+++ b/src/test/java/org/jsoup/integration/SessionTest.java
@@ -7,11 +7,14 @@
import org.jsoup.integration.servlets.FileServlet;
import org.jsoup.nodes.Document;
import org.jsoup.parser.Parser;
+import org.jsoup.parser.Tag;
+import org.jsoup.parser.TagSet;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Test;
import java.io.IOException;
+import java.lang.reflect.Field;
import java.util.Map;
import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -134,4 +137,50 @@ public void testCanChangeParsers() throws IOException {
Document doc3 = session.newRequest().url(xmlUrl).get();
assertEquals(xmlVal, doc3.html()); // did not blow away xml default
}
+
+ @Test
+ public void sessionTagSetDoesNotMutateRoot() {
+ Connection session = Jsoup.newSession();
+ TagSet rootTags = session.request().parser().tagSet();
+
+ int rootNamespacesBefore = tagSetNamespaceCount(rootTags);
+
+ Connection request = session.newRequest();
+ Parser parser = request.request().parser();
+ parser.parseInput("One Two ", "http://example.com/");
+
+ int rootNamespacesAfter = tagSetNamespaceCount(rootTags);
+ assertEquals(rootNamespacesBefore, rootNamespacesAfter);
+ }
+
+ @Test
+ public void sessionTagSetCustomizerDoesNotMutateRoot() {
+ Connection session = Jsoup.newSession();
+ TagSet rootTags = session.request().parser().tagSet();
+ rootTags.onNewTag(tag -> {
+ if (!tag.isKnownTag())
+ tag.set(Tag.RcData);
+ });
+
+ int rootNamespacesBefore = tagSetNamespaceCount(rootTags);
+
+ Connection request = session.newRequest();
+ Parser parser = request.request().parser();
+ Document doc = parser.parseInput("One Two ", "https://example.com/");
+ assertEquals(0, doc.select("custom b").size());
+
+ int rootNamespacesAfter = tagSetNamespaceCount(rootTags);
+ assertEquals(rootNamespacesBefore, rootNamespacesAfter);
+ }
+
+ private static int tagSetNamespaceCount(TagSet tagSet) {
+ try {
+ Field tagsField = TagSet.class.getDeclaredField("tags");
+ tagsField.setAccessible(true);
+ Map, ?> tags = (Map, ?>) tagsField.get(tagSet);
+ return tags.size();
+ } catch (ReflectiveOperationException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
diff --git a/src/test/java/org/jsoup/integration/TestServer.java b/src/test/java/org/jsoup/integration/TestServer.java
index 3224a7895c..4a4ddb562e 100644
--- a/src/test/java/org/jsoup/integration/TestServer.java
+++ b/src/test/java/org/jsoup/integration/TestServer.java
@@ -177,6 +177,7 @@ private static void addHttpsConnector(File keystoreFile, Server server) {
server,
new SslConnectionFactory(sslContextFactory, HttpVersion.HTTP_1_1.asString()),
new HttpConnectionFactory(httpsConfig));
+ sslConnector.setHost(Localhost);
server.addConnector(sslConnector);
}
diff --git a/src/test/java/org/jsoup/internal/ControllableInputStreamTest.java b/src/test/java/org/jsoup/internal/ControllableInputStreamTest.java
new file mode 100644
index 0000000000..fc90fe8510
--- /dev/null
+++ b/src/test/java/org/jsoup/internal/ControllableInputStreamTest.java
@@ -0,0 +1,85 @@
+package org.jsoup.internal;
+
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.FilterInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class ControllableInputStreamTest {
+
+ @Test
+ void respectsMaxCapDuringFill() throws IOException {
+ byte[] data = "0123456789".getBytes(); // 10 bytes
+ CountingInputStream counting = new CountingInputStream(new ByteArrayInputStream(data));
+
+ ControllableInputStream in = ControllableInputStream.wrap(counting, 5); // cap at 5 bytes
+ byte[] buf = new byte[10];
+
+ int read = in.read(buf);
+ assertEquals(5, read, "should only read up to cap");
+ assertEquals(5, counting.count, "underlying stream should not be pulled past cap");
+ assertFalse(in.baseReadFully(), "cap hit is not EOF");
+
+ int second = in.read(buf);
+ assertEquals(-1, second, "further reads return -1 once cap is exhausted");
+ assertFalse(in.baseReadFully(), "still not true EOF");
+ in.close();
+ }
+
+ @Test
+ void compactsBufferWithActiveMark() throws IOException {
+ int size = SharedConstants.DefaultBufferSize * 2;
+ byte[] data = new byte[size];
+ for (int i = 0; i < size; i++) data[i] = (byte) (i % 256);
+
+ ControllableInputStream in = ControllableInputStream.wrap(new ByteArrayInputStream(data), 0);
+
+ byte[] first = new byte[500];
+ assertEquals(500, in.read(first));
+
+ in.mark(SharedConstants.DefaultBufferSize); // mark at logical pos 500
+
+ byte[] consume = new byte[SharedConstants.DefaultBufferSize];
+ int firstRead = in.read(consume); // serves remainder of current buffer (BufferSize - 500)
+ assertEquals(SharedConstants.DefaultBufferSize - 500, firstRead);
+
+ byte[] more = new byte[1000];
+ int secondRead = in.read(more); // triggers fill() with active mark, then consumes from freshly filled buffer
+ assertEquals(SharedConstants.DefaultBufferSize - firstRead, secondRead);
+
+ in.reset(); // should rewind to mark despite prior compaction
+
+ byte[] reread = new byte[1000];
+ assertEquals(1000, in.read(reread));
+ for (int i = 0; i < reread.length; i++) {
+ assertEquals(data[500 + i], reread[i], "byte mismatch at " + i);
+ }
+ in.close();
+ }
+
+ private static final class CountingInputStream extends FilterInputStream {
+ int count = 0;
+
+ CountingInputStream(InputStream in) {
+ super(in);
+ }
+
+ @Override
+ public int read(byte[] b, int off, int len) throws IOException {
+ int r = super.read(b, off, len);
+ if (r > 0) count += r;
+ return r;
+ }
+
+ @Override
+ public int read() throws IOException {
+ int r = super.read();
+ if (r != -1) count++;
+ return r;
+ }
+ }
+}
diff --git a/src/test/java/org/jsoup/internal/ReaderTest.java b/src/test/java/org/jsoup/internal/ReaderTest.java
new file mode 100644
index 0000000000..293ecb3774
--- /dev/null
+++ b/src/test/java/org/jsoup/internal/ReaderTest.java
@@ -0,0 +1,52 @@
+package org.jsoup.internal;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.parser.CharacterReader;
+import org.jsoup.parser.Parser;
+import org.junit.jupiter.api.Test;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+
+import static org.jsoup.integration.ParseTest.getPath;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+public class ReaderTest {
+ @Test void readerOfStringAndFile() throws IOException {
+ // make sure that reading from a String and from a File produce the same bytes
+ Path path = getPath("/fuzztests/garble.html");
+ byte[] bytes = Files.readAllBytes(path);
+ String fromBytes = new String(bytes, StandardCharsets.UTF_8);
+
+ SimpleStreamReader streamReader = getReader(path);
+ String fromStream = getString(streamReader);
+ assertEquals(fromBytes, fromStream);
+
+ SimpleStreamReader reader2 = getReader(path);
+ CharacterReader cr = new CharacterReader(reader2);
+ String fullRead = cr.consumeTo('X'); // does not exist in input
+ assertEquals(fromBytes, fullRead);
+ }
+
+ private static String getString(SimpleStreamReader streamReader) throws IOException {
+ // read streamreader to a string:
+ StringBuilder builder = new StringBuilder();
+ char[] cbuffer = new char[1024];
+ int read;
+ while ((read = streamReader.read(cbuffer)) != -1) {
+ builder.append(cbuffer, 0, read);
+ }
+ return builder.toString();
+ }
+
+ private static SimpleStreamReader getReader(Path path) throws IOException {
+ // set up a chain as in when we parse: simplebufferedinput -> controllableinputstream -> simplestreamreader -> characterreader
+ SimpleBufferedInput input = new SimpleBufferedInput(Files.newInputStream(path));
+ ControllableInputStream stream = ControllableInputStream.wrap(input, 0);
+ return new SimpleStreamReader(stream, StandardCharsets.UTF_8);
+ }
+}
diff --git a/src/test/java/org/jsoup/nodes/ElementIT.java b/src/test/java/org/jsoup/nodes/ElementIT.java
index 849441c3a0..5878b97885 100644
--- a/src/test/java/org/jsoup/nodes/ElementIT.java
+++ b/src/test/java/org/jsoup/nodes/ElementIT.java
@@ -1,6 +1,7 @@
package org.jsoup.nodes;
import org.jsoup.Jsoup;
+import org.jsoup.parser.Parser;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
@@ -125,6 +126,7 @@ public void testFastReparentExistingContent() {
@Test void wrapNoOverflow() {
// deepChild was recursive, so could overflow if presented with a fairly insane wrap
Document doc = new Document("https://example.com/");
+ doc.parser().setMaxDepth(Integer.MAX_VALUE); // don't limit to 512
Element el = doc.body().appendElement("p");
int num = 50000;
StringBuilder sb = new StringBuilder();
@@ -134,7 +136,7 @@ public void testFastReparentExistingContent() {
el.wrap(sb.toString());
String html = doc.body().html();
assertTrue(html.startsWith(""));
- assertEquals(num + 3, el.parents().size());
+ assertEquals(num + 3, el.parents().size()); // + 3 is for body, html, document
}
@Test
diff --git a/src/test/java/org/jsoup/nodes/ElementTest.java b/src/test/java/org/jsoup/nodes/ElementTest.java
index bf70d0aeb4..2395a1eb98 100644
--- a/src/test/java/org/jsoup/nodes/ElementTest.java
+++ b/src/test/java/org/jsoup/nodes/ElementTest.java
@@ -26,6 +26,7 @@
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
+import java.util.stream.Collectors;
import java.util.stream.Stream;
import static org.jsoup.nodes.NodeIteratorTest.assertIterates;
@@ -2844,7 +2845,7 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
Document doc = Jsoup.parse(reference);
Throwable ex = assertThrows(IllegalArgumentException.class,
() -> doc.getElementsByAttributeValueMatching("key", "\\x"));
- assertEquals("Pattern syntax error: \\x", ex.getMessage());
+ assertTrue(ex.getMessage().contains("Pattern syntax error"));
}
@Test void getElementsByIndexEquals() {
@@ -2874,7 +2875,7 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
Document doc = Jsoup.parse(reference);
Throwable ex = assertThrows(IllegalArgumentException.class,
() -> doc.getElementsMatchingText("\\x"));
- assertEquals("Pattern syntax error: \\x", ex.getMessage());
+ assertTrue(ex.getMessage().contains("Pattern syntax error:"));
}
@Test void getElementsMatchingText() {
@@ -2896,7 +2897,7 @@ void prettySerializationRoundTrips(Document.OutputSettings settings) {
Document doc = Jsoup.parse(reference);
Throwable ex = assertThrows(IllegalArgumentException.class,
() -> doc.getElementsMatchingOwnText("\\x"));
- assertEquals("Pattern syntax error: \\x", ex.getMessage());
+ assertTrue(ex.getMessage().contains("Pattern syntax error:"));
}
@Test void hasText() {
@@ -3247,4 +3248,79 @@ public void deselectAll() {
}
assertTrue(threw);
}
+
+ @Test void childByIndex() {
+ // uncached, cached paths
+ Element el = Jsoup.parse("
One
Two
Three
Four
Five
Six
").expectFirst("div");
+
+ // uncached
+ Element p0 = el.child(0);
+ Element p1 = el.child(1);
+ Element p2 = el.child(2);
+ assertNull(el.cachedChildren());
+
+ assertEquals("Two", p0.text());
+ assertEquals("Four", p1.text());
+ assertEquals("Six", p2.text());
+
+ // cached
+ Elements children = el.children();
+ assertNotNull(el.cachedChildren());
+ assertSame(p0, el.child(0));
+ assertSame(p1, el.child(1));
+ assertSame(p2, el.child(2));
+ }
+
+ @Test public void testChildThrowsIndexOutOfBoundsWhenCachedChildrenIsNull() {
+ Element el = Jsoup.parse("
").expectFirst("div");
+ assertNull(el.cachedChildren());
+ Exception exception = assertThrows(IndexOutOfBoundsException.class, () -> {
+ el.child(5);
+ });
+ assertTrue(exception.getMessage().contains("No child at index: 5"));
+ }
+
+ @Test public void testChildrenSizeUncachedAndCached() {
+ Element el = Jsoup.parse("
One
Two
Three
Four
Five
Six
").expectFirst("div");
+
+ // uncached
+ assertNull(el.cachedChildren());
+ assertEquals(3, el.childrenSize());
+ // gets cached. As we have to iter elements anyway, might as well make and cache the list, so later child(i) is fast. supports for(i=0;i
One Two
Three Four
Five Six
").expectFirst("div"); // resest
+ assertNull(el.cachedChildren());
+ el.children();
+ assertNotNull(el.cachedChildren());
+ assertEquals(4, el.childrenSize());
+
+ Element empty = el.expectFirst("b");
+ assertEquals(0, empty.childrenSize());
+ assertNull(empty.cachedChildren()); // 0 node fast path, does not create list
+ }
+
+ @Test public void testReplaceInvalidates() {
+ // https://github.com/jhy/jsoup/issues/2391
+ String html = "test
";
+ Document doc = Jsoup.parseBodyFragment(html);
+ Element div = doc.expectFirst("div");
+
+ // Cached
+ Elements divChildren = div.children(); // 0 child elements, 1 node
+ int origCount = divChildren.size();
+
+ // Modify child
+ TextNode text = (TextNode) div.childNode(0);
+ Element p = doc.createElement("p");
+ text.replaceWith(p);
+ p.appendChild(text);
+
+ int reported = div.childrenSize(); // invalidated ^^
+ long actualSize = div.childNodes().stream().filter(node -> node instanceof Element).count();
+
+ assertEquals(0, origCount);
+ assertEquals(1, actualSize);
+ assertEquals(1, reported); // was 0 via cache
+ }
}
diff --git a/src/test/java/org/jsoup/nodes/LeafNodeTest.java b/src/test/java/org/jsoup/nodes/LeafNodeTest.java
index 0398ebcf73..a0d735d838 100644
--- a/src/test/java/org/jsoup/nodes/LeafNodeTest.java
+++ b/src/test/java/org/jsoup/nodes/LeafNodeTest.java
@@ -13,9 +13,11 @@ public class LeafNodeTest {
public void doesNotGetAttributesTooEasily() {
// test to make sure we're not setting attributes on all nodes right away
String body = "One Three
";
- Document doc = Jsoup.parse(body);
+ Document doc = Jsoup.parse(body, "https://example.com/");
assertTrue(hasAnyAttributes(doc)); // should have one - the base uri on the doc
+ assertFalse(hasAnyAttributes(Jsoup.parse("None
"))); // no base uri
+
Element html = doc.child(0);
assertFalse(hasAnyAttributes(html));
diff --git a/src/test/java/org/jsoup/parser/HtmlParserTest.java b/src/test/java/org/jsoup/parser/HtmlParserTest.java
index 43c7339426..a73366767e 100644
--- a/src/test/java/org/jsoup/parser/HtmlParserTest.java
+++ b/src/test/java/org/jsoup/parser/HtmlParserTest.java
@@ -7,6 +7,7 @@
import org.jsoup.nodes.*;
import org.jsoup.safety.Safelist;
import org.jsoup.select.Elements;
+import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.Arguments;
@@ -471,6 +472,22 @@ private static Stream dupeAttributeData() {
assertEquals(" \n", doc.body().html());
}
+ @Test public void siblingIndexFromFragment() {
+ Document doc = Jsoup.parseBodyFragment("");
+ Element input = doc.expectFirst("input");
+ Element table = doc.expectFirst("table");
+ assertEquals(0, input.siblingIndex());
+ assertEquals(1, table.siblingIndex());
+ }
+
+ @Test public void siblingIndexFromParse() {
+ Document doc = Jsoup.parse("");
+ Element input = doc.expectFirst("input");
+ Element table = doc.expectFirst("table");
+ assertEquals(0, input.siblingIndex());
+ assertEquals(1, table.siblingIndex());
+ }
+
@Test public void handlesUnknownNamespaceTags() {
String h = "FooHello
There ";
Parser parser = Parser.htmlParser();
@@ -840,7 +857,7 @@ private static Stream dupeAttributeData() {
@Test public void handlesNullInData() {
Document doc = Jsoup.parse("Blah \u0000
");
- assertEquals("Blah
", doc.body().html()); // replaced in attr, NOT replaced in data (but is escaped as control char <0x20)
+ assertEquals("Blah
", doc.body().html()); // replaced in attr, discarded in data
}
@Test public void handlesNullInComments() {
@@ -1840,6 +1857,18 @@ private boolean didAddElements(String input) {
"", serialized);
}
+ @Test void svgForeignObjectInParagraph() {
+ String html = "
";
+ Document doc = Jsoup.parse(html);
+
+ Element foreignObject = doc.expectFirst("foreignObject");
+ assertSvgNamespace(foreignObject);
+ Element div = foreignObject.selectFirst("div");
+ assertNotNull(div, "div should stay within foreignObject");
+ assertHtmlNamespace(div);
+ assertEquals("One", div.expectFirst("p").text());
+ }
+
@Test void mathParseText() {
String html = "";
Document doc = Jsoup.parse(html);
@@ -2090,6 +2119,45 @@ static void assertErrorsDoNotContain(String msg, ParseErrorList errors) {
assertEquals("
Foo ", TextUtil.stripNewlines(doc.body().html()));
}
+ @Test void customVoidTagsBehaveLikeHtmlVoids() {
+ Parser parser = Parser.htmlParser().setTrackErrors(10).tagSet(TagSet.Html());
+ TagSet tags = parser.tagSet();
+ tags.valueOf("voidtag", Parser.NamespaceHtml).set(Tag.Void);
+
+ String html = "Hello World
";
+ Document doc = Jsoup.parse(html, parser);
+ assertEquals(0, parser.getErrors().size());
+
+ doc.outputSettings().syntax(Document.OutputSettings.Syntax.html);
+ String emittedHtml = TextUtil.stripNewlines(doc.body().html());
+ assertEquals("Hello World
", emittedHtml);
+ assertEquals("Hello World", doc.body().text());
+
+ doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
+ assertEquals(" Hello World
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
+ @Test void customSelfClosingVoidTagsRoundTrip() {
+ Parser parser = Parser.htmlParser().setTrackErrors(10).tagSet(TagSet.Html());
+ TagSet tags = parser.tagSet();
+ tags.valueOf("selfclosingvoidtag", Parser.NamespaceHtml).set(Tag.Void).set(Tag.SelfClose);
+
+ String html = " Hello World
";
+ Document doc = Jsoup.parse(html, parser);
+ assertEquals(0, parser.getErrors().size());
+
+ doc.outputSettings().syntax(Document.OutputSettings.Syntax.html);
+ String emittedHtml = TextUtil.stripNewlines(doc.body().html());
+ assertEquals("Hello World
", emittedHtml);
+
+ Document reparsed = Jsoup.parse(emittedHtml, parser);
+ reparsed.outputSettings().syntax(Document.OutputSettings.Syntax.html);
+ assertEquals(emittedHtml, TextUtil.stripNewlines(reparsed.body().html()));
+
+ doc.outputSettings().syntax(Document.OutputSettings.Syntax.xml);
+ assertEquals(" Hello World
", TextUtil.stripNewlines(doc.body().html()));
+ }
+
@Test void svgScriptParsedAsScriptData() {
// https://github.com/jhy/jsoup/issues/2320
String html = " ";
@@ -2114,4 +2182,151 @@ static void assertErrorsDoNotContain(String msg, ParseErrorList errors) {
assertEquals("a < b", data.data());
assertEquals("a < b ", data.outerHtml());
}
+
+ @Test void dropsNullsFromBody() {
+ // https://github.com/jhy/jsoup/issues/2395
+ String html = "\u0000
\u0000\u0000
Hi\u0000
";
+
+ Parser parser = Parser.htmlParser();
+ parser.setTrackErrors(10);
+
+ Document doc = Jsoup.parse(html, parser);
+ assertEquals("
\n
\nHi
", doc.body().html());
+ assertEquals("Hi", doc.body().text());
+
+ ParseErrorList errors = parser.getErrors();
+ assertEquals(4, errors.size());
+ assertEquals("<1:4>: Unexpected character '\u0000' in input state [Data]", errors.get(0).toString());
+ assertEquals("<1:12>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString());
+ assertEquals("<1:13>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString());
+ assertEquals("<1:23>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString());
+ // todo should we replace that null, for convenience?
+ }
+
+ @Test void replacesNullsInForeign() {
+ String html = "\u0000 \u0000\u0000 Hi\u0000 ";
+ Parser parser = Parser.htmlParser();
+ parser.setTrackErrors(10);
+
+ Document doc = Jsoup.parse(html, parser);
+ assertEquals("\n � �� Hi� \n ", doc.body().html());
+ assertEquals("���Hi�", doc.body().text());
+
+ ParseErrorList errors = parser.getErrors();
+ assertEquals(4, errors.size());
+ assertEquals("<1:12>: Unexpected character '\u0000' in input state [Data]", errors.get(0).toString());
+ assertEquals("<1:26>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString());
+ assertEquals("<1:27>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString());
+ assertEquals("<1:43>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString());
+ }
+
+ @Nested class DeepHtmlTrees {
+ private int depth(Element el) {
+ int depth = 0;
+ while ((el = el.parent()) != null) {
+ depth++;
+ }
+ return depth;
+ }
+
+ /**
+ * Parse the HTML code in `contents`, wrapped in enough divs to ensure that the root elements
+ * of contents are at depth `startingDepth`.
+ */
+ private Element parseDeepHtml(int startingDepth, String contents) {
+ StringBuilder html = new StringBuilder();
+ html.append("");
+ for (int i = 0; i < startingDepth - 4; i++) {
+ html.append("");
+ }
+ html.append("
");
+ html.append(contents);
+
+ Parser parser = Parser.htmlParser();
+ Document doc = Jsoup.parse(html.toString(), parser);
+ Element container = doc.getElementById("container");
+ assertNotNull(container);
+ assertEquals(startingDepth - 1, depth(container));
+
+ return container;
+ }
+
+ @Test void nestedDivs() {
+ Element container = parseDeepHtml(511, "
");
+
+ assertEquals("
", container.html());
+ }
+
+ @Test void closingTagOfTagClosedByDepthLimit() {
+ // The
tag would be nested too deep, so it first closes the innermost
.
+ // This means that the first will close the outer
, as it's the only
+ // one that is currently open. The last is then just ignored, as there is no
+ // open
left to close.
+ Element container = parseDeepHtml(511, " ");
+
+ assertEquals(" ", container.html());
+ }
+
+ @Test void tableAtDepthLimitWithDirectTd() {
+ Element container = parseDeepHtml(512, "");
+
+ assertEquals("\n \n \n ", container.html());
+ }
+
+ @Test void tableRightBeforeDepthLimitWithDirectTd() {
+ Element container = parseDeepHtml(511, "");
+
+ assertEquals("", container.html());
+ }
+
+ @Test void customDepthLimit() {
+ Parser parser = Parser.htmlParser().setMaxDepth(5);
+ String input = "";
+
+ Document doc = Jsoup.parse(input, parser);
+ String expected = new StringBuilder()
+ .append("\n")
+ .append(" \n")
+ .append(" \n")
+ .append("
\n")
+ .append("
\n")
+ .append("
\n")
+ .append("
\n")
+ .append("
\n")
+ .append("
\n")
+ .append("
\n")
+ .append("
\n")
+ .append(" \n")
+ .append("")
+ .toString();
+
+ assertEquals(expected, doc.html());
+ }
+
+ @Test void formControlsDetachWhenFormTrimmed() {
+ Parser parser = Parser.htmlParser().setMaxDepth(3);
+ String input = "
";
+
+ Document doc = Jsoup.parse(input, "", parser);
+ Element formEl = doc.getElementById("f");
+ assertNotNull(formEl);
+ assertTrue(formEl instanceof FormElement);
+ FormElement form = (FormElement) formEl;
+ assertEquals("", form.html());
+ assertEquals(0, form.elements().size());
+ }
+
+ @Test void templateModesClearedWhenTrimmed() {
+ Parser parser = Parser.htmlParser().setMaxDepth(3);
+ String input = "
One
Two
";
+
+ Document doc = Jsoup.parse(input, "", parser);
+ Element template = doc.getElementById("tmpl");
+ assertNotNull(template);
+ assertEquals("", template.html());
+ Element paragraph = doc.selectFirst("p");
+ assertNotNull(paragraph);
+ assertEquals("Two", paragraph.text());
+ }
+ }
}
diff --git a/src/test/java/org/jsoup/parser/ParserIT.java b/src/test/java/org/jsoup/parser/ParserIT.java
index e1904ddc20..368c772ef3 100644
--- a/src/test/java/org/jsoup/parser/ParserIT.java
+++ b/src/test/java/org/jsoup/parser/ParserIT.java
@@ -49,8 +49,16 @@ public void handlesDeepStack() {
long start = System.currentTimeMillis();
Document doc = Parser.parseBodyFragment(longBody.toString(), "");
+ int depth = 1;
+ Element el = doc.body();
+ while (el.childrenSize() > 0) {
+ el = el.child(0);
+ depth++;
+ }
+
// Assert
- assertEquals(2, doc.body().childNodeSize());
+ assertEquals(1, doc.body().childrenSize());
+ assertEquals(512, depth);
assertEquals(25000, doc.select("dd").size());
assertTrue(System.currentTimeMillis() - start < 20000); // I get ~ 1.5 seconds, but others have reported slower
// was originally much longer, or stack overflow.
diff --git a/src/test/java/org/jsoup/parser/ParserSettingsTest.java b/src/test/java/org/jsoup/parser/ParserSettingsTest.java
index 7856287cb1..c21bf8341a 100644
--- a/src/test/java/org/jsoup/parser/ParserSettingsTest.java
+++ b/src/test/java/org/jsoup/parser/ParserSettingsTest.java
@@ -48,8 +48,7 @@ public void attributesCaseNormalization(Locale locale) {
Attributes attributes = new Attributes();
attributes.put("ITEM", "1");
- Attributes normalizedAttributes = parseSettings.normalizeAttributes(attributes);
-
- assertEquals("item", normalizedAttributes.asList().get(0).getKey());
+ parseSettings.normalizeAttributes(attributes);
+ assertEquals("item", attributes.asList().get(0).getKey());
}
}
diff --git a/src/test/java/org/jsoup/parser/ParserTest.java b/src/test/java/org/jsoup/parser/ParserTest.java
index 1418de0ee9..bfebf6beba 100644
--- a/src/test/java/org/jsoup/parser/ParserTest.java
+++ b/src/test/java/org/jsoup/parser/ParserTest.java
@@ -9,7 +9,10 @@
import java.nio.charset.StandardCharsets;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNotSame;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
public class ParserTest {
@@ -30,6 +33,24 @@ public void unescapeEntitiesHandlesLargeInput() {
assertEquals(body, Parser.unescapeEntities(body, false));
}
+ @Test public void unescapeTracksErrors() {
+ Parser parser = Parser.htmlParser();
+ parser.setTrackErrors(10);
+
+ String s = parser.unescape("One &bogus; & > Two", false);
+ assertEquals("One &bogus; & > Two", s);
+ ParseErrorList errors = parser.getErrors();
+ assertEquals(2, errors.size());
+ assertEquals("<1:6>: Invalid character reference: invalid named reference [bogus]", errors.get(0).toString());
+ assertEquals("<1:22>: Invalid character reference: missing semicolon on [>]", errors.get(1).toString());
+
+ // can reuse parser; errors will be reset
+ s = parser.unescape("One & &bogus; Two", false);
+ assertEquals("One & &bogus; Two", s);
+ assertEquals(1, parser.getErrors().size());
+ assertEquals("<1:12>: Invalid character reference: invalid named reference [bogus]", parser.getErrors().get(0).toString());
+ }
+
@Test
public void testUtf8() throws IOException {
// testcase for https://github.com/jhy/jsoup/issues/1557. no repro.
@@ -58,4 +79,24 @@ public void testClone() {
assertEquals(xmlParser.settings().preserveTagCase(), xmlClone.settings().preserveTagCase());
assertEquals(xmlParser.settings().preserveAttributeCase(), xmlClone.settings().preserveAttributeCase());
}
+
+ @Test
+ public void testCloneCopyTagSet() {
+ Parser parser = Parser.htmlParser();
+ parser.tagSet().add(new Tag("foo"));
+ parser.tagSet().onNewTag(tag -> tag.set(Tag.SelfClose));
+ Parser clone = parser.clone();
+
+ // Ensure the tagsets are different instances
+ assertNotSame(clone.tagSet(), parser.tagSet());
+ // Check that cloned tagset contains same tag
+ assertNotNull(clone.tagSet().get("foo", Parser.NamespaceHtml));
+ // Ensure onNewTag customizers are retained
+ Tag custom = clone.tagSet().valueOf("qux", Parser.NamespaceHtml);
+ assertTrue(custom.isSelfClosing());
+ // Check that cloned tagset does not observe modifications made to the original
+ assertNull(clone.tagSet().get("bar", Parser.NamespaceHtml));
+ parser.tagSet().add(new Tag("bar"));
+ assertNull(clone.tagSet().get("bar", Parser.NamespaceHtml));
+ }
}
diff --git a/src/test/java/org/jsoup/parser/TagSetTest.java b/src/test/java/org/jsoup/parser/TagSetTest.java
index 8c8faf15c7..6ba19b8445 100644
--- a/src/test/java/org/jsoup/parser/TagSetTest.java
+++ b/src/test/java/org/jsoup/parser/TagSetTest.java
@@ -4,6 +4,10 @@
import org.jsoup.nodes.Element;
import org.junit.jupiter.api.Test;
+import java.lang.reflect.Field;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
import static org.jsoup.parser.Parser.NamespaceHtml;
import static org.junit.jupiter.api.Assertions.*;
@@ -182,4 +186,36 @@ public class TagSetTest {
assertTrue(copy.valueOf("custom-tag", NamespaceHtml).is(Tag.Void));
assertFalse(source.valueOf("custom-tag", NamespaceHtml).is(Tag.Void));
}
+
+ @Test void copyPullThroughDoesNotMutateSource() {
+ TagSet source = TagSet.Html();
+ TagSet copy = new TagSet(source);
+
+ int sourceNamespacesBefore = tagSetNamespaceCount(source);
+ assertNotNull(copy.get("div", NamespaceHtml));
+ int sourceNamespacesAfter = tagSetNamespaceCount(source);
+ assertEquals(sourceNamespacesBefore, sourceNamespacesAfter);
+ }
+
+ @Test void copyPullWithCustomizerThroughDoesNotMutateSource() {
+ TagSet source = TagSet.Html();
+ TagSet copy = new TagSet(source);
+
+ AtomicInteger sourceAdds = new AtomicInteger();
+ source.onNewTag(tag -> sourceAdds.incrementAndGet());
+
+ assertNotNull(copy.get("div", NamespaceHtml));
+ assertEquals(0, sourceAdds.get());
+ }
+
+ private static int tagSetNamespaceCount(TagSet tagSet) {
+ try {
+ Field tagsField = TagSet.class.getDeclaredField("tags");
+ tagsField.setAccessible(true);
+ Map, ?> tags = (Map, ?>) tagsField.get(tagSet);
+ return tags.size();
+ } catch (ReflectiveOperationException e) {
+ throw new RuntimeException(e);
+ }
+ }
}
diff --git a/src/test/java/org/jsoup/parser/TagTest.java b/src/test/java/org/jsoup/parser/TagTest.java
index d9f7138980..4ed0e17977 100644
--- a/src/test/java/org/jsoup/parser/TagTest.java
+++ b/src/test/java/org/jsoup/parser/TagTest.java
@@ -81,6 +81,12 @@ public void canBeInsensitive(Locale locale) {
assertFalse(p.isInline());
}
+ @Test public void brSemantics() {
+ Tag br = Tag.valueOf("br");
+ assertTrue(br.isInline());
+ assertFalse(br.isBlock());
+ }
+
@Test public void imgSemantics() {
Tag img = Tag.valueOf("img");
assertTrue(img.isInline());
diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
index dcba7986a1..ff70991896 100644
--- a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
+++ b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java
@@ -647,4 +647,41 @@ private static void assertXmlNamespace(Element el) {
assertEquals("
Foo
", TextUtil.stripNewlines(doc.outerHtml()));
// we infer that empty els can be represented with self-closing if seen in parse
}
+
+ @Test public void xmlParserHasUnlimitedDepthByDefault() {
+ Parser parser = Parser.xmlParser();
+ Document doc = Jsoup.parse(deepXml(600), "", parser);
+ Element target = doc.selectFirst("target");
+ assertNotNull(target);
+ assertTrue(depth(target) > 512);
+ }
+
+ @Test public void xmlParserRespectsConfiguredMaxDepth() {
+ Parser parser = Parser.xmlParser().setMaxDepth(5);
+ Document doc = Jsoup.parse(deepXml(100), "", parser);
+ Element target = doc.selectFirst("target");
+ assertNotNull(target);
+ assertEquals(parser.getMaxDepth(), depth(target));
+ }
+
+ private static String deepXml(int depth) {
+ StringBuilder xml = new StringBuilder("
");
+ for (int i = 0; i < depth; i++) {
+ xml.append("");
+ }
+ xml.append(" ");
+ for (int i = 0; i < depth; i++) {
+ xml.append(" ");
+ }
+ xml.append(" ");
+ return xml.toString();
+ }
+
+ private static int depth(Element el) {
+ int d = 0;
+ while ((el = el.parent()) != null) {
+ d++;
+ }
+ return d;
+ }
}
diff --git a/src/test/java/org/jsoup/select/EvaluatorTest.java b/src/test/java/org/jsoup/select/EvaluatorTest.java
index ff456d757d..31c1cc75f5 100644
--- a/src/test/java/org/jsoup/select/EvaluatorTest.java
+++ b/src/test/java/org/jsoup/select/EvaluatorTest.java
@@ -1,6 +1,7 @@
package org.jsoup.select;
import org.jsoup.Jsoup;
+import org.jsoup.helper.Regex;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.junit.jupiter.api.Test;
@@ -257,6 +258,34 @@ public void testMatchesWholeOwnTextToString() {
assertEquals(":matchesWholeOwnText(example)", evaluator.toString());
}
+ @Test
+ public void testMatchesToStringRegex() {
+ Regex pattern = Regex.compile("example");
+ Evaluator.Matches evaluator = new Evaluator.Matches(pattern);
+ assertEquals(":matches(example)", evaluator.toString());
+ }
+
+ @Test
+ public void testMatchesOwnToStringRegex() {
+ Regex pattern = Regex.compile("example");
+ Evaluator.MatchesOwn evaluator = new Evaluator.MatchesOwn(pattern);
+ assertEquals(":matchesOwn(example)", evaluator.toString());
+ }
+
+ @Test
+ public void testMatchesWholeTextToStringRegex() {
+ Regex pattern = Regex.compile("example");
+ Evaluator.MatchesWholeText evaluator = new Evaluator.MatchesWholeText(pattern);
+ assertEquals(":matchesWholeText(example)", evaluator.toString());
+ }
+
+ @Test
+ public void testMatchesWholeOwnTextToStringRegex() {
+ Regex pattern = Regex.compile("example");
+ Evaluator.MatchesWholeOwnText evaluator = new Evaluator.MatchesWholeOwnText(pattern);
+ assertEquals(":matchesWholeOwnText(example)", evaluator.toString());
+ }
+
@Test
public void testMatchTextToString() {
Evaluator.MatchText evaluator = new Evaluator.MatchText();
diff --git a/src/test/java/org/jsoup/select/SelectorIT.java b/src/test/java/org/jsoup/select/SelectorIT.java
index 71f0d6f0b1..1b8f59269c 100644
--- a/src/test/java/org/jsoup/select/SelectorIT.java
+++ b/src/test/java/org/jsoup/select/SelectorIT.java
@@ -57,40 +57,4 @@ public void uncaughtException(Thread t, Throwable e) {
exceptionCount.incrementAndGet();
}
}
-
- @Test public void streamParserSelect() throws Exception {
- // https://github.com/jhy/jsoup/issues/2277
- // The memo in the StructuralEvaluator was not getting reset correctly, and so would run out of memory
- // Test tracks memory consumption. Will be interesting to see how it behaves on the CI workers.
-
- String xml = "
1";
- Evaluator query = QueryParser.parse("A B C");
- Runtime runtime = Runtime.getRuntime();
-
- System.gc();
- Thread.sleep(100);
- long initialUsed = runtime.totalMemory() - runtime.freeMemory();
-
- for (int i = 0; i < 50_000; i++) { // Before fix, would exceed 10MB in ~ 9000 iters
- try (StreamParser parser = new StreamParser(Parser.xmlParser())) {
- parser.parse(xml, "");
- parser.selectFirst(query);
- parser.stop();
- }
-
- if (i % 1000 == 0) {
- System.gc();
- Thread.sleep(100);
- long currentUsed = runtime.totalMemory() - runtime.freeMemory();
- long delta = currentUsed - initialUsed;
-
- // Fail if we grow + 10MB
- if (delta > 10_000_000) {
- fail(String.format("Memo leak detected. Memory increased by %,d bytes after %,d iterations",
- delta, i));
- }
- }
- }
- }
-
}
diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java
index 6207a6cc57..8fe7d00eaa 100644
--- a/src/test/java/org/jsoup/select/SelectorTest.java
+++ b/src/test/java/org/jsoup/select/SelectorTest.java
@@ -10,10 +10,12 @@
import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Parser;
import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
-import java.util.IdentityHashMap;
import java.util.List;
import java.util.Locale;
+import java.util.Map;
import java.util.stream.Collectors;
import static org.jsoup.select.EvaluatorDebug.sexpr;
@@ -896,14 +898,12 @@ public void selectClassWithSpace() {
Document doc = Jsoup.parse(html);
Elements found = doc.select("div[class=value ]");
- assertEquals(2, found.size());
- assertEquals("class without space", found.get(0).text());
- assertEquals("class with space", found.get(1).text());
+ assertEquals(1, found.size());
+ assertEquals("class with space", found.get(0).text());
found = doc.select("div[class=\"value \"]");
- assertEquals(2, found.size());
- assertEquals("class without space", found.get(0).text());
- assertEquals("class with space", found.get(1).text());
+ assertEquals(1, found.size());
+ assertEquals("class with space", found.get(0).text());
found = doc.select("div[class=\"value\\ \"]");
assertEquals(0, found.size());
@@ -1194,7 +1194,7 @@ public void wildcardNamespaceMatchesNoNamespace() {
Evaluator eval = QueryParser.parse("p ~ p");
CombiningEvaluator.And andEval = (CombiningEvaluator.And) eval;
StructuralEvaluator.PreviousSibling prevEval = (StructuralEvaluator.PreviousSibling) andEval.evaluators.get(0);
- IdentityHashMap> map = prevEval.threadMemo.get();
+ Map> map = prevEval.threadMemo.get();
assertEquals(0, map.size()); // no memo yet
Document doc1 = Jsoup.parse("One
Two
Three");
@@ -1207,7 +1207,7 @@ public void wildcardNamespaceMatchesNoNamespace() {
assertEquals(2, s2.size());
assertEquals("Two2", s2.first().text());
- assertEquals(1, map.size()); // root of doc 2
+ assertEquals(0, map.size()); // reset after collect
}
@Test public void blankTextNodesAreConsideredEmpty() {
@@ -1729,4 +1729,68 @@ public void testAncestorChain() {
);
}
+ @Test void attributeSelectorQuotedWhitespace() {
+ // https://github.com/jhy/jsoup/issues/2380
+ Document doc = Jsoup.parse(
+ "
" +
+ "
" +
+ "
"
+ );
+
+ // match: literal compare (no trimming)
+ assertSelectedIds(doc.select("div[data=\"foobar\"]"), "1");
+ assertSelectedIds(doc.select("div[data=\" foobar \"]"), "2");
+
+ // prefix
+ assertSelectedIds(doc.select("div[data^=\"foo\"]"), "1");
+ assertSelectedIds(doc.select("div[data^=\" foo\"]"), "2");
+
+ // suffix
+ assertSelectedIds(doc.select("div[data$=\"bar\"]"), "1");
+ assertSelectedIds(doc.select("div[data$=\"bar \"]"), "2");
+
+ // contains
+ assertSelectedIds(doc.select("div[data*=\"foobar\"]"), "1", "2", "3");
+ assertSelectedIds(doc.select("div[data*=\" foobar \"]"), "2");
+ }
+
+ @Test void canSelectBlankAttribute() {
+ Document doc = Jsoup.parse(
+ "
" +
+ "
" +
+ "
"
+ );
+
+ assertSelectedIds(doc.select("div[data]"), "1", "2", "3");
+ assertSelectedIds(doc.select("div[data='']"), "1", "2");
+ assertSelectedIds(doc.select("div[data=]"), "1", "2");
+
+ assertSelectedIds(doc.select("div[data^='']"), "1", "2", "3");
+ assertSelectedIds(doc.select("div[data$='']"), "1", "2", "3");
+ assertSelectedIds(doc.select("div[data*='']"), "1", "2", "3");
+ }
+
+ @ParameterizedTest
+ @ValueSource(strings = {"[abs:!=]", "[ abs:^=]"})
+ void parseExceptionOnEmptyAbsKey(String query) {
+ Selector.SelectorParseException ex = assertThrows(
+ Selector.SelectorParseException.class,
+ () -> Selector.evaluatorOf(query)
+ );
+ assertEquals("Absolute attribute key must have a name", ex.getMessage());
+ }
+
+ @Test void parseExceptionOnEmptyKeyVal() {
+ // was previously firing at match time, not eval time
+ String q = "[\"=\"]";
+ boolean threw = false;
+ try {
+ Evaluator e = Selector.evaluatorOf(q);
+ } catch (Selector.SelectorParseException ex) {
+ threw = true;
+ assertEquals("Quoted value must have content", ex.getMessage());
+ }
+ assertTrue(threw);
+ }
+
}
diff --git a/src/test/java/org/jsoup/select/StructuralEvaluatorTest.java b/src/test/java/org/jsoup/select/StructuralEvaluatorTest.java
new file mode 100644
index 0000000000..87c50540cb
--- /dev/null
+++ b/src/test/java/org/jsoup/select/StructuralEvaluatorTest.java
@@ -0,0 +1,104 @@
+package org.jsoup.select;
+
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class StructuralEvaluatorTest {
+ private static final String Html =
+ "" +
+ "
" +
+ "
One
" +
+ "
Two
" +
+ "
" +
+ "
" +
+ "
Span1 " +
+ "
Link " +
+ "
" +
+ "
" +
+ "
";
+
+ @ParameterizedTest
+ @MethodSource("selectorMemoData")
+ void selectorMemoIsClearedOnReset(String selector, boolean expectMemos) {
+ // test that the structural evaluator memos are used, and are reset
+
+ Document doc = Jsoup.parse(Html);
+ Evaluator evaluator = Selector.evaluatorOf(selector);
+
+ // collect all StructuralEvaluator instances from the parsed evaluator tree
+ List structuralEvals = new ArrayList<>();
+ collectEvals(evaluator, structuralEvals);
+
+ // use Collector.stream vs Selector.select(), as the later is able to reset after executing
+ Collector.stream(evaluator, doc).count(); // consume stream to populate memos
+ assertFalse(structuralEvals.isEmpty());
+
+ boolean hadMemos = false;
+ for (StructuralEvaluator se : structuralEvals) {
+ if (!se.threadMemo.get().isEmpty()) {
+ hadMemos = true;
+ break;
+ }
+ }
+
+ evaluator.reset();
+
+ // verify all structural evaluator thread-local maps are cleared
+ for (StructuralEvaluator se : structuralEvals) {
+ assertTrue(se.threadMemo.get().isEmpty());
+ }
+
+ assertEquals(expectMemos, hadMemos);
+ }
+
+ private static Stream selectorMemoData() {
+ return Stream.of(
+ Arguments.of("div:not(.b)", true), // Not (uses memoMatches)
+ Arguments.of("div p", true), // Ancestor (ancestor chain checks)
+ Arguments.of("span ~ a", true), // PreviousSibling
+ Arguments.of("span + a", true), // ImmediatePreviousSibling
+ Arguments.of("div > span > a", false), // ImmediateParentRun does not use memoMatches
+ Arguments.of("div:has(p)", false) // Has (coverage; does not use memo for these inputs)
+ );
+ }
+
+ private static void collectEvals(Evaluator evaluator, List out) {
+ // recursive traversal of evaluator trees to find StructuralEvaluator instances
+ if (evaluator instanceof CombiningEvaluator) {
+ CombiningEvaluator ce = (CombiningEvaluator) evaluator;
+ for (Evaluator inner : ce.evaluators) {
+ collectEvals(inner, out);
+ }
+ return;
+ }
+
+ if (evaluator instanceof StructuralEvaluator.ImmediateParentRun) {
+ StructuralEvaluator.ImmediateParentRun run = (StructuralEvaluator.ImmediateParentRun) evaluator;
+ out.add(run);
+ for (Evaluator inner : run.evaluators) {
+ collectEvals(inner, out);
+ }
+ return;
+ }
+
+ if (evaluator instanceof StructuralEvaluator) {
+ StructuralEvaluator se = (StructuralEvaluator) evaluator;
+ out.add(se);
+ collectEvals(se.evaluator, out);
+ }
+
+ }
+}
diff --git a/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java b/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java
index 475c67888d..07a853b280 100644
--- a/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java
+++ b/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java
@@ -2,17 +2,19 @@
import org.jsoup.internal.SharedConstants;
import org.junit.jupiter.api.Test;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertInstanceOf;
+import java.io.IOException;
+import java.net.*;
+import java.util.Collections;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.*;
public class HttpClientExecutorTest {
@Test void getsHttpClient() {
try {
enableHttpClient();
- RequestExecutor executor = RequestDispatch.get(null, null);
- //assertInstanceOf(HttpClientExecutor.class, executor);
- assertEquals("org.jsoup.helper.HttpClientExecutor", executor.getClass().getName());
- // Haven't figured out how to get Maven to allow this mjar code to be on the classpath for the surefire tests, hence not instanceof
+ RequestExecutor executor = RequestDispatch.get(new HttpConnection.Request(), null);
+ assertInstanceOf(HttpClientExecutor.class, executor);
} finally {
disableHttpClient(); // reset to previous default for JDK8 compat tests
}
@@ -20,8 +22,8 @@ public class HttpClientExecutorTest {
@Test void getsHttpUrlConnectionByDefault() {
System.clearProperty(SharedConstants.UseHttpClient);
- RequestExecutor executor = RequestDispatch.get(null, null);
- assertEquals("org.jsoup.helper.HttpClientExecutor", executor.getClass().getName());
+ RequestExecutor executor = RequestDispatch.get(new HttpConnection.Request(), null);
+ assertInstanceOf(HttpClientExecutor.class, executor);
}
public static void enableHttpClient() {
@@ -31,4 +33,86 @@ public static void enableHttpClient() {
public static void disableHttpClient() {
System.setProperty(SharedConstants.UseHttpClient, "false");
}
+
+ @Test void proxyWrapUsesSystemDefaultProxySelector() {
+ ProxySelector originalSelector = ProxySelector.getDefault();
+ InetSocketAddress defaultProxy = new InetSocketAddress("system.proxy", 8080);
+
+ try {
+ ProxySelector.setDefault(new ProxySelector() {
+ @Override
+ public List select(URI uri) {
+ return Collections.singletonList(
+ new Proxy(Proxy.Type.HTTP, defaultProxy)
+ );
+ }
+
+ @Override
+ public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {}
+ });
+
+ HttpClientExecutor.ProxyWrap wrap = new HttpClientExecutor.ProxyWrap();
+ List proxies = wrap.select(URI.create("http://example.com"));
+
+ assertEquals(1, proxies.size());
+ assertSame(defaultProxy, proxies.get(0).address());
+ } finally {
+ ProxySelector.setDefault(originalSelector);
+ }
+ }
+
+ @Test void proxyWrapConnectFailedOnlyForSystemProxy() {
+ HttpClientExecutor.ProxyWrap wrap = new HttpClientExecutor.ProxyWrap();
+ HttpClientExecutor.perRequestProxy.set(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("custom", 9090)));
+ wrap.connectFailed(URI.create("http://example.com"),
+ new InetSocketAddress("custom", 9090),
+ new IOException("test"));
+ HttpClientExecutor.perRequestProxy.remove();
+ }
+
+ @Test
+ void perRequestProxyOverridesSystemDefault() {
+ ProxySelector original = ProxySelector.getDefault();
+ InetSocketAddress sysProxy = new InetSocketAddress("system.proxy", 8080);
+ InetSocketAddress perReqProxy = new InetSocketAddress("per.request", 9999);
+ try {
+ ProxySelector.setDefault(new ProxySelector() {
+ @Override
+ public List select(URI uri) {
+ return Collections.singletonList(
+ new Proxy(Proxy.Type.HTTP, sysProxy));
+ }
+ @Override
+ public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {}
+ });
+
+ HttpClientExecutor.perRequestProxy.set(
+ new Proxy(Proxy.Type.HTTP, perReqProxy));
+
+ HttpClientExecutor.ProxyWrap wrap = new HttpClientExecutor.ProxyWrap();
+ List proxies = wrap.select(URI.create("http://example.com"));
+ assertSame(perReqProxy, proxies.get(0).address());
+ } finally {
+ HttpClientExecutor.perRequestProxy.remove();
+ ProxySelector.setDefault(original);
+ }
+ }
+
+ @Test void connectFailedDelegatesToSystemDefault() {
+ ProxySelector original = ProxySelector.getDefault();
+ final boolean[] called = {false};
+ try {
+ ProxySelector.setDefault(new ProxySelector() {
+ @Override
+ public List select(URI uri) { return Collections.singletonList(Proxy.NO_PROXY); }
+ @Override
+ public void connectFailed(URI uri, SocketAddress sa, IOException ioe) { called[0] = true; }
+ });
+ new HttpClientExecutor.ProxyWrap()
+ .connectFailed(URI.create("http://example.com"), new InetSocketAddress("x", 80), new IOException("x"));
+ assertTrue(called[0]);
+ } finally {
+ ProxySelector.setDefault(original);
+ }
+ }
}
diff --git a/src/test/resources/fuzztests/2374.html.gz b/src/test/resources/fuzztests/2374.html.gz
new file mode 100644
index 0000000000..1541e0ef07
Binary files /dev/null and b/src/test/resources/fuzztests/2374.html.gz differ
diff --git a/src/test/resources/fuzztests/2393.html.gz b/src/test/resources/fuzztests/2393.html.gz
new file mode 100644
index 0000000000..02213d2950
Binary files /dev/null and b/src/test/resources/fuzztests/2393.html.gz differ
diff --git a/src/test/resources/fuzztests/2397.html.gz b/src/test/resources/fuzztests/2397.html.gz
new file mode 100644
index 0000000000..81900aa368
Binary files /dev/null and b/src/test/resources/fuzztests/2397.html.gz differ
diff --git a/src/test/resources/fuzztests/48116.html.gz b/src/test/resources/fuzztests/48116.html.gz
index 37367dc8cc..748c5efddd 100644
Binary files a/src/test/resources/fuzztests/48116.html.gz and b/src/test/resources/fuzztests/48116.html.gz differ
diff --git a/src/test/resources/fuzztests/9056.html.gz b/src/test/resources/fuzztests/9056.html.gz
new file mode 100644
index 0000000000..21c10af80d
Binary files /dev/null and b/src/test/resources/fuzztests/9056.html.gz differ
diff --git a/src/test/resources/fuzztests/as-replace.html.gz b/src/test/resources/fuzztests/as-replace.html.gz
new file mode 100644
index 0000000000..5770cd6834
Binary files /dev/null and b/src/test/resources/fuzztests/as-replace.html.gz differ
diff --git a/src/test/resources/fuzztests/ex-inselect16.html b/src/test/resources/fuzztests/ex-inselect16.html
new file mode 100644
index 0000000000..75cefea18d
--- /dev/null
+++ b/src/test/resources/fuzztests/ex-inselect16.html
@@ -0,0 +1 @@
+<b>