diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 80073e53fb..986770a26c 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -5,14 +5,21 @@ updates: directory: / schedule: interval: weekly + cooldown: + default-days: 12 ignore: # Jetty 9.x needed for JDK8 compatibility; it still receives security updates. Only used in tests. - dependency-name: "org.eclipse.jetty:jetty-server" update-types: ["version-update:semver-major"] - dependency-name: "org.eclipse.jetty:jetty-servlet" update-types: ["version-update:semver-major"] + # Et tu, junit? Keep us on 5, as 6 has min JDK17 - https://docs.junit.org/6.0.0-RC3/release-notes/#release-notes-6.0.0-M1 + - dependency-name: "org.junit.jupiter:junit-jupiter" + update-types: ["version-update:semver-major"] - package-ecosystem: github-actions directory: / schedule: interval: weekly + cooldown: + default-days: 12 diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0737dae6e1..f91448634f 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,15 +10,15 @@ jobs: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] # choosing to run a reduced set of LTS, current, and next, to balance coverage and execution time - java: [8, 17, 21] + java: [8, 17, 25] fail-fast: false name: Test JDK ${{ matrix.java }}, ${{ matrix.os }} steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: ${{ matrix.java }} distribution: 'zulu' diff --git a/.github/workflows/cifuzz.yml b/.github/workflows/cifuzz.yml index 4228bdad80..27c5142db4 100644 --- a/.github/workflows/cifuzz.yml +++ b/.github/workflows/cifuzz.yml @@ -19,7 +19,7 @@ jobs: dry-run: false language: jvm - name: Upload Crash - uses: actions/upload-artifact@v4 + uses: actions/upload-artifact@v6 if: failure() && steps.build.outcome == 'success' with: name: artifacts diff 
--git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 9ad4905964..b98fc33133 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -12,19 +12,19 @@ jobs: name: "CodeQL" steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up JDK - uses: actions/setup-java@v4 + uses: actions/setup-java@v5 with: java-version: 17 distribution: 'temurin' cache: 'maven' - name: CodeQL Initialization - uses: github/codeql-action/init@v3 + uses: github/codeql-action/init@v4 with: languages: java queries: +security-and-quality - name: Autobuild - uses: github/codeql-action/autobuild@v3 + uses: github/codeql-action/autobuild@v4 - name: CodeQL Analysis - uses: github/codeql-action/analyze@v3 + uses: github/codeql-action/analyze@v4 diff --git a/CHANGES.md b/CHANGES.md index b12d9d7246..a34c124c2b 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,20 +1,69 @@ # jsoup Changelog -## 1.21.2 (PENDING) +## 1.22.2 (PENDING) + +### Bug Fixes +* Android (R8/ProGuard): added a rule to ignore the optional `re2j` dependency when not present. [#2459](https://github.com/jhy/jsoup/issues/2459) + +## 1.22.1 (2026-Jan-01) + +### Improvements +* Added support for using the `re2j` regular expression engine for regex-based CSS selectors (e.g. `[attr~=regex]`, `:matches(regex)`), which ensures linear-time performance for regex evaluation. This allows safer handling of arbitrary user-supplied query regexes. To enable, add the `com.google.re2j` dependency to your classpath, e.g.: +```xml + + com.google.re2j + re2j + 1.8 + + ``` + (If you already have that dependency in your classpath, but you want to keep using the Java regex engine, you can disable re2j via `System.setProperty("jsoup.useRe2j", "false")`.) You can confirm that the re2j engine has been enabled correctly by calling `org.jsoup.helper.Regex.usingRe2j()`. 
[#2407](https://github.com/jhy/jsoup/pull/2407) + +* Added an instance method `Parser#unescape(String, boolean)` that unescapes HTML entities using the parser's configuration (e.g. to support error tracking), complementing the existing static utility `Parser.unescapeEntities(String, boolean)`. [#2396](https://github.com/jhy/jsoup/pull/2396) +* Added a configurable maximum parser depth (to limit the number of open elements on stack) to both HTML and XML parsers. The HTML parser now defaults to a depth of 512 to match browser behavior, and protect against unbounded stack growth, while the XML parser keeps unlimited depth by default, but can opt into a limit via `org.jsoup.parser.Parser#setMaxDepth`. [#2421](https://github.com/jhy/jsoup/issues/2421) +* Build: added CI coverage for JDK 25 [#2403](https://github.com/jhy/jsoup/pull/2403) +* Build: added a CI fuzzer for contextual fragment parsing (in addition to existing full body HTML and XML fuzzers). [oss-fuzz #14041](https://github.com/google/oss-fuzz/pull/14041) + +### Changes +* Set a removal schedule of jsoup 1.24.1 for previously deprecated APIs. + +### Bug Fixes +* Previously cached child `Elements` of an `Element` were not correctly invalidated in `Node#replaceWith(Node)`, which could lead to incorrect results when subsequently calling `Element#children()`. [#2391](https://github.com/jhy/jsoup/issues/2391) +* Attribute selector values are now compared literally without trimming. Previously, jsoup trimmed whitespace from selector values and from element attribute values, which could cause mismatches with browser behavior (e.g. `[attr=" foo "]`). Now matches align with the CSS specification and browser engines. [#2380](https://github.com/jhy/jsoup/issues/2380) +* When using the JDK HttpClient, any system default proxy (`ProxySelector.getDefault()`) was ignored. Now, the system proxy is used if a per-request proxy is not set. 
[#2388](https://github.com/jhy/jsoup/issues/2388), [#2390](https://github.com/jhy/jsoup/pull/2390) +* A `ValidationException` could be thrown in the adoption agency algorithm with particularly broken input. Now logged as a parse error. [#2393](https://github.com/jhy/jsoup/issues/2393) +* Null characters in the HTML body were not consistently removed; and in foreign content were not correctly replaced. [#2395](https://github.com/jhy/jsoup/issues/2395) +* An `IndexOutOfBoundsException` could be thrown when parsing a body fragment with crafted input. Now logged as a parse error. [#2397](https://github.com/jhy/jsoup/issues/2397), [#2406](https://github.com/jhy/jsoup/issues/2406) +* When using StructuralEvaluators (e.g., a `parent child` selector) across many retained threads, their memoized results could also be retained, increasing memory use. These results are now cleared immediately after use, reducing overall memory consumption. [#2411](https://github.com/jhy/jsoup/issues/2411) +* Cloning a `Parser` now preserves any custom `TagSet` applied to the parser. [#2422](https://github.com/jhy/jsoup/issues/2422), [#2423](https://github.com/jhy/jsoup/pull/2423) +* Custom tags marked as `Tag.Void` now parse and serialize like the built-in void elements: they no longer consume following content, and the XML serializer emits the expected self-closing form. [#2425](https://github.com/jhy/jsoup/issues/2425) +* The `
<br>` element is once again classified as an inline tag (`Tag.isBlock() == false`), matching common developer expectations and its role as phrasing content in HTML, while pretty-printing and text extraction continue to treat it as a line break in the rendered output. [#2387](https://github.com/jhy/jsoup/issues/2387), [#2439](https://github.com/jhy/jsoup/issues/2439) +* Fixed an intermittent truncation issue when fetching and parsing remote documents via `Jsoup.connect(url).get()`. On responses without a charset header, the initial charset sniff could sometimes (depending on buffering / `available()` behavior) be mistaken for end-of-stream and a partial parse reused, dropping trailing content. [#2448](https://github.com/jhy/jsoup/issues/2448) +* `TagSet` copies no longer mutate their template during lazy lookups, preventing cross-thread `ConcurrentModificationException` when parsing with shared sessions. [#2453](https://github.com/jhy/jsoup/pull/2453) +* Fixed parsing of `<svg>` `foreignObject` content nested within a `<p>`, which could incorrectly move the HTML subtree outside the SVG. [#2452](https://github.com/jhy/jsoup/issues/2452) + +### Internal Changes +* Deprecated internal helper `org.jsoup.internal.Functions` (for removal in v1.23.1). This was previously used to support older Android API levels without full `java.util.function` coverage; jsoup now requires core library desugaring so this indirection is no longer necessary. [#2412](https://github.com/jhy/jsoup/pull/2412) + +## 1.21.2 (2025-Aug-25) ### Changes * Deprecated internal (yet visible) methods `Normalizer#normalize(String, bool)` and `Attribute#shouldCollapseAttribute(Document.OutputSettings)`. These will be removed in a future version. +* Deprecated `Connection#sslSocketFactory(SSLSocketFactory)` in favor of the new `Connection#sslContext(SSLContext)`. Using `sslSocketFactory` will force the use of the legacy `HttpURLConnection` implementation, which does not support HTTP/2. [#2370](https://github.com/jhy/jsoup/pull/2370) ### Improvements * When pretty-printing, if there are consecutive text nodes (via DOM manipulation), the non-significant whitespace between them will be collapsed. [#2349](https://github.com/jhy/jsoup/pull/2349). * Updated `Connection.Response#statusMessage()` to return a simple loggable string message (e.g. "OK") when using the `HttpClient` implementation, which doesn't otherwise return any server-set status message. [#2356](https://github.com/jhy/jsoup/issues/2346) * `Attributes#size()` and `Attributes#isEmpty()` now exclude any internal attributes (such as user data) from their count. This aligns with the attributes' serialized output and iterator. [#2369](https://github.com/jhy/jsoup/pull/2369) +* Added `Connection#sslContext(SSLContext)` to provide a custom SSL (TLS) context to requests, supporting both the `HttpClient` and the legacy `HttpURLConnection` implementations.
[#2370](https://github.com/jhy/jsoup/pull/2370) +* Performance optimizations for DOM manipulation methods including when repeatedly removing an element's first child (`element.child(0).remove()`), and when using `Parser#parseBodyFragment()` to parse a large number of direct children. [#2373](https://github.com/jhy/jsoup/pull/2373). ### Bug Fixes * When parsing from an InputStream and a multibyte character happened to straddle a buffer boundary, the stream would not be completely read. [#2353](https://github.com/jhy/jsoup/issues/2353). * In `NodeTraversor`, if a last child element was removed during the `head()` call, the parent would be visited twice. [#2355](https://github.com/jhy/jsoup/issues/2355). * Cloning an Element that has an Attributes object would add an empty internal user-data attribute to that clone, which would cause unexpected results for `Attributes#size()` and `Attributes#isEmpty()`. [#2356](https://github.com/jhy/jsoup/issues/2356) * In a multithreaded application where multiple threads are calling `Element#children()` on the same element concurrently, a race condition could happen when the method was generating the internal child element cache (a filtered view of its child nodes). Since concurrent reads of DOM objects should be threadsafe without external synchronization, this method has been updated to execute atomically. [#2366](https://github.com/jhy/jsoup/issues/2366) +* When parsing HTML with svg:script elements in SVG elements, don't enter the Text insertion mode, but continue to parse as foreign content. Otherwise, misnested HTML could then cause an IndexOutOfBoundsException. [#2374](https://github.com/jhy/jsoup/issues/2374) +* Malformed HTML could throw an IndexOutOfBoundsException during the adoption agency. [#2377](https://github.com/jhy/jsoup/pull/2377).
## 1.21.1 (2025-Jun-23) diff --git a/jitpack.yml b/jitpack.yml new file mode 100644 index 0000000000..7800d9ee8d --- /dev/null +++ b/jitpack.yml @@ -0,0 +1,6 @@ +before_install: + - sdk install java 21.0.2-open + - sdk use java 21.0.2-open + - sdk install maven +install: + - mvn clean install -Djapicmp.skip=true -DskipTests diff --git a/pom.xml b/pom.xml index 7b92058312..707cfd3d00 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ org.jsoup jsoup - 1.21.2-SNAPSHOT + 1.22.2-SNAPSHOT https://jsoup.org/ jsoup is a Java library that simplifies working with real-world HTML and XML. It offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM API methods, CSS, and xpath selectors. jsoup implements the WHATWG HTML5 specification, and parses HTML to the same DOM as modern browsers. 2009 @@ -33,7 +33,7 @@ UTF-8 - 9.4.57.v20241219 + 9.4.58.v20250814 @@ -41,7 +41,7 @@ org.apache.maven.plugins maven-compiler-plugin - 3.14.0 + 3.15.0 UTF-8 false @@ -66,7 +66,7 @@ org.codehaus.mojo animal-sniffer-maven-plugin - 1.24 + 1.27 api-java8 @@ -133,7 +133,7 @@ org.apache.maven.plugins maven-javadoc-plugin - 3.11.2 + 3.12.0 none 8 @@ -151,7 +151,7 @@ org.apache.maven.plugins maven-source-plugin - 3.3.1 + 3.4.0 org/jsoup/examples/** @@ -169,7 +169,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.4.2 + 3.5.0 @@ -240,16 +240,16 @@ org.apache.maven.plugins maven-resources-plugin - 3.3.1 + 3.4.0 maven-release-plugin - 3.1.1 + 3.3.1 org.apache.maven.plugins maven-surefire-plugin - 3.5.3 + 3.5.4 -Xss640k @@ -257,7 +257,7 @@ maven-failsafe-plugin - 3.5.3 + 3.5.4 @@ -275,14 +275,14 @@ com.github.siom79.japicmp japicmp-maven-plugin - 0.23.1 + 0.25.4 org.jsoup jsoup - 1.21.1 + 1.21.2 jar @@ -291,7 +291,7 @@ false true true - + @@ -316,8 +316,21 @@ + + org.sonatype.central + central-publishing-maven-plugin + 0.10.0 + true + + central + + + + src/main/resources + false + ./ META-INF/jsoup/ @@ -329,19 +342,6 @@ - - - sonatype-nexus-snapshots - Sonatype Nexus 
Snapshots - https://oss.sonatype.org/content/repositories/snapshots - - - sonatype-nexus-staging - Nexus Release Repository - https://oss.sonatype.org/service/local/staging/deploy/maven2/ - - - @@ -381,6 +381,27 @@ + + + org.codehaus.mojo + build-helper-maven-plugin + 3.6.1 + + + add-java11-test-source + generate-test-sources + + add-test-source + + + + ${project.basedir}/src/test/java11 + + + + + + org.apache.maven.plugins maven-compiler-plugin @@ -412,23 +433,6 @@ true - - - - testcompile-java-11 - test-compile - - testCompile - - - 11 - - - ${project.basedir}/src/test/java11 - - - - @@ -480,7 +484,7 @@ maven-failsafe-plugin - 3.5.3 + 3.5.4 @@ -501,7 +505,7 @@ org.junit.jupiter junit-jupiter - 5.13.3 + 5.14.2 test @@ -509,7 +513,7 @@ com.google.code.gson gson - 2.13.1 + 2.13.2 test @@ -544,6 +548,15 @@ 1.0.0 provided + + + + com.google.re2j + re2j + 1.8 + true + compile + diff --git a/src/main/java/org/jsoup/Connection.java b/src/main/java/org/jsoup/Connection.java index ed095f44d6..5d5c2a0140 100644 --- a/src/main/java/org/jsoup/Connection.java +++ b/src/main/java/org/jsoup/Connection.java @@ -6,6 +6,8 @@ import org.jsoup.parser.StreamParser; import org.jspecify.annotations.Nullable; +import javax.net.ssl.HostnameVerifier; +import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocketFactory; import java.io.BufferedInputStream; import java.io.IOException; @@ -39,7 +41,7 @@ the lifetime of the Connection object. A socket connection is only made at the p #execute()}, {@link #get()}, or {@link #post()}), and the server's response consumed.

For multi-threaded implementations, it is important to use a {@link #newRequest()} for each request. The session may be shared across concurrent threads, but not a specific request.

-

HTTP/2 support: On JDK/JRE 11 and above, requests use {@link java.net.http.HttpClient}, which supports +

HTTP/2 support: On JVM 11 and above, requests use {@link java.net.http.HttpClient}, which supports HTTP/2. To use the legacy {@link java.net.HttpURLConnection} instead, set System.setProperty("jsoup.useHttpClient", "false").

*/ @@ -150,7 +152,7 @@ default Connection newRequest(URL url) {

The default timeout is 30 seconds (30,000 millis). A timeout of zero is treated as an infinite timeout.

This timeout specifies the combined maximum duration of the connection time and the time to read the full response.

-

Implementation note: when this Connection is backed by HttpURLConnection (rather than HttpClient, as used in JRE/JDK 11+), this timeout is implemented by setting both the socket connect and read timeouts to half of the specified value.

+

Implementation note: when this Connection is backed by HttpURLConnection (rather than HttpClient, as used in JVM 11+), this timeout is implemented by setting both the socket connect and read timeouts to half of the specified value.

@param millis number of milliseconds (thousandths of a second) before timing out connects or reads. @return this Connection, for chaining @@ -210,12 +212,38 @@ default Connection newRequest(URL url) { Connection ignoreContentType(boolean ignoreContentType); /** - * Set custom SSL socket factory - * @param sslSocketFactory custom SSL socket factory - * @return this Connection, for chaining + Set a custom SSL socket factory for HTTPS connections. +

Note: if set, the legacy HttpURLConnection will be used instead of the JVM's + HttpClient.

+ + @param sslSocketFactory SSL socket factory + @return this Connection, for chaining + @see #sslContext(SSLContext) + @deprecated use {@link #sslContext(SSLContext)} instead; will be removed in jsoup 1.24.1. */ + @Deprecated Connection sslSocketFactory(SSLSocketFactory sslSocketFactory); + /** + Set a custom SSL context for HTTPS connections. +

Note: when using the legacy HttpURLConnection, only the SSLSocketFactory from the + context will be used.

+ + @param sslContext SSL context + @return this Connection, for chaining + @since 1.21.2 + */ + default Connection sslContext(SSLContext sslContext) { + throw new UnsupportedOperationException(); + } + + /** + * Set a custom hostname verifier to verify the hostname during handshake + * @param hostnameVerifier hostname verifier + * @return this Connection, for chaining + */ + Connection hostnameVerifier(HostnameVerifier hostnameVerifier); + /** * Add a request data parameter. Request parameters are sent in the request query string for GETs, and in the * request body for POSTs. A request may have multiple values of the same name. @@ -769,11 +797,53 @@ interface Request extends Base { @Nullable SSLSocketFactory sslSocketFactory(); /** - * Set a custom SSL socket factory. - * @param sslSocketFactory SSL socket factory + Set a custom SSL socket factory for HTTPS connections. +

Note: if set, the legacy HttpURLConnection will be used instead of the JVM's + HttpClient.

+ + @param sslSocketFactory SSL socket factory + @see #sslContext(SSLContext) + @deprecated use {@link #sslContext(SSLContext)} instead; will be removed in jsoup 1.24.1. */ + @Deprecated void sslSocketFactory(SSLSocketFactory sslSocketFactory); + /** + Get the current custom SSL context, if any. + + @return custom SSL context if set, null otherwise + @since 1.21.2 + */ + @Nullable + default SSLContext sslContext() { + throw new UnsupportedOperationException(); + } + + /** + Set a custom SSL context for HTTPS connections. +

Note: when using the legacy HttpURLConnection, only the SSLSocketFactory from the + context will be used.

+ + @param sslContext SSL context + @return this Request, for chaining + @since 1.21.2 + */ + default Request sslContext(SSLContext sslContext) { + throw new UnsupportedOperationException(); + } + + /** + * Get the current hostname verifier, if any. + * @return hostname verifier if set, null otherwise + */ + @Nullable HostnameVerifier hostnameVerifier(); + + /** + * Set a custom hostname verifier to verify the hostname during handshake + * @param hostnameVerifier hostname verifier + */ + void hostnameVerifier(HostnameVerifier hostnameVerifier); + /** * Add a data parameter to the request * @param keyval data to add. @@ -983,7 +1053,7 @@ default Response readFully() throws IOException { *

Calling {@link #body() } or {@link #bodyAsBytes()} has the same effect.

* @return this response, for chaining * @throws UncheckedIOException if an IO exception occurs during buffering. - * @deprecated use {@link #readFully()} instead (for the checked exception). Will be removed in a future version. + * @deprecated use {@link #readFully()} instead (for the checked exception). Will be removed in jsoup 1.24.1. */ @Deprecated Response bufferUp(); diff --git a/src/main/java/org/jsoup/helper/DataUtil.java b/src/main/java/org/jsoup/helper/DataUtil.java index 87c76a3ca2..4124fe4fd0 100644 --- a/src/main/java/org/jsoup/helper/DataUtil.java +++ b/src/main/java/org/jsoup/helper/DataUtil.java @@ -248,6 +248,7 @@ static CharsetDoc detectCharset(ControllableInputStream input, @Nullable String if (charsetName == null) { // read ahead and determine from meta. safe first parse as UTF-8 int origMax = input.max(); input.max(firstReadBufferSize); + input.resetFullyRead(); // clear any pre-read (e.g., BOM) state before capped sniff input.mark(firstReadBufferSize); input.allowClose(false); // ignores closes during parse, in case we need to rewind try (Reader reader = new SimpleStreamReader(input, UTF_8)) { // input is currently capped to firstReadBufferSize diff --git a/src/main/java/org/jsoup/helper/HttpConnection.java b/src/main/java/org/jsoup/helper/HttpConnection.java index c0a6fdf8fc..9b5860d298 100644 --- a/src/main/java/org/jsoup/helper/HttpConnection.java +++ b/src/main/java/org/jsoup/helper/HttpConnection.java @@ -11,6 +11,8 @@ import org.jsoup.parser.StreamParser; import org.jspecify.annotations.Nullable; +import javax.net.ssl.HostnameVerifier; +import javax.net.ssl.SSLContext; import javax.net.ssl.SSLSocketFactory; import java.io.BufferedInputStream; import java.io.BufferedReader; @@ -223,6 +225,18 @@ public Connection sslSocketFactory(SSLSocketFactory sslSocketFactory) { return this; } + @Override + public Connection sslContext(SSLContext sslContext) { + req.sslContext(sslContext); + return this; + } + + @Override + public Connection 
hostnameVerifier(HostnameVerifier hostnameVerifier) { + req.hostnameVerifier(hostnameVerifier); + return this; + } + @Override public Connection data(String key, String filename, InputStream inputStream) { req.data(KeyVal.create(key, filename, inputStream)); @@ -618,11 +632,13 @@ public static class Request extends HttpConnection.Base impl private boolean parserDefined = false; // called parser(...) vs initialized in ctor private String postDataCharset = DataUtil.defaultCharsetName; private @Nullable SSLSocketFactory sslSocketFactory; + @Nullable SSLContext sslContext; private CookieManager cookieManager; @Nullable RequestAuthenticator authenticator; private @Nullable Progress responseProgress; private final ReentrantLock executing = new ReentrantLock(); // detects and warns if same request used concurrently + private @Nullable HostnameVerifier hostnameVerifier; Request() { super(); @@ -652,6 +668,7 @@ public static class Request extends HttpConnection.Base impl parser = copy.parser.newInstance(); // parsers and their tree-builders maintain state, so need a fresh copy parserDefined = copy.parserDefined; sslSocketFactory = copy.sslSocketFactory; // these are all synchronized so safe to share + sslContext = copy.sslContext; cookieManager = copy.cookieManager; authenticator = copy.authenticator; responseProgress = copy.responseProgress; @@ -724,6 +741,25 @@ public void sslSocketFactory(SSLSocketFactory sslSocketFactory) { this.sslSocketFactory = sslSocketFactory; } + @Override @Nullable + public SSLContext sslContext() { + return sslContext; + } + + @Override + public Connection.Request sslContext(SSLContext sslContext) { + this.sslContext = sslContext; + return this; + } + + public HostnameVerifier hostnameVerifier() { + return hostnameVerifier; + } + + public void hostnameVerifier(HostnameVerifier hostnameVerifier) { + this.hostnameVerifier = hostnameVerifier; + } + @Override public Connection.Request ignoreHttpErrors(boolean ignoreHttpErrors) { 
this.ignoreHttpErrors = ignoreHttpErrors; diff --git a/src/main/java/org/jsoup/helper/Re2jRegex.java b/src/main/java/org/jsoup/helper/Re2jRegex.java new file mode 100644 index 0000000000..b9e65bed60 --- /dev/null +++ b/src/main/java/org/jsoup/helper/Re2jRegex.java @@ -0,0 +1,48 @@ +package org.jsoup.helper; + +/** + re2j-backed Regex implementation; must only be touched when re2j is on the classpath. + */ +final class Re2jRegex extends Regex { + private static final java.util.regex.Pattern unused = java.util.regex.Pattern.compile(""); + + private final com.google.re2j.Pattern re2jPattern; + + private Re2jRegex(com.google.re2j.Pattern re2jPattern) { + super(unused); + this.re2jPattern = re2jPattern; + } + + public static Regex compile(String regex) { + try { + return new Re2jRegex(com.google.re2j.Pattern.compile(regex)); + } catch (RuntimeException e) { + throw new ValidationException("Pattern syntax error: " + e.getMessage()); + } catch (OutOfMemoryError | StackOverflowError e) { // defensive check on regex to normalize exception + throw new ValidationException("Pattern complexity error: " + e.getMessage()); + } + } + + @Override + public Matcher matcher(CharSequence input) { + return new Re2jMatcher(re2jPattern.matcher(input)); + } + + @Override + public String toString() { + return re2jPattern.toString(); + } + + private static final class Re2jMatcher implements Matcher { + private final com.google.re2j.Matcher delegate; + + Re2jMatcher(com.google.re2j.Matcher delegate) { + this.delegate = delegate; + } + + @Override + public boolean find() { + return delegate.find(); + } + } +} diff --git a/src/main/java/org/jsoup/helper/Regex.java b/src/main/java/org/jsoup/helper/Regex.java new file mode 100644 index 0000000000..7b157ddc90 --- /dev/null +++ b/src/main/java/org/jsoup/helper/Regex.java @@ -0,0 +1,119 @@ +package org.jsoup.helper; + +import org.jsoup.internal.SharedConstants; + +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; 
+import java.util.regex.Pattern; +import java.util.regex.PatternSyntaxException; + +/** + A regular expression abstraction. Allows jsoup to optionally use the re2j regular expression engine (linear time) + instead of the JDK's backtracking regex implementation. + +

If the {@code com.google.re2j} library is found on the classpath, by default it will be used. You can override this + by setting {@code -Djsoup.useRe2j=false} to explicitly disable, and use the JDK regex engine.

+ +

(Currently this is a simplified implementation for jsoup's specific use; can extend as required.)

+ */ +public class Regex { + private static final boolean hasRe2j = hasRe2j(); + + private final Pattern jdkPattern; + + Regex(Pattern jdkPattern) { + this.jdkPattern = jdkPattern; + } + + /** + Compile a regex, using re2j if enabled and available; otherwise JDK regex. + + @param regex the regex to compile + @return the compiled regex + @throws ValidationException if the regex is invalid + */ + public static Regex compile(String regex) { + if (usingRe2j()) { + return Re2jRegex.compile(regex); + } + + try { + return new Regex(Pattern.compile(regex)); + } catch (PatternSyntaxException e) { + throw new ValidationException("Pattern syntax error: " + e.getMessage()); + } + } + + /** Wraps an existing JDK Pattern (for API compat); doesn't switch */ + public static Regex fromPattern(Pattern pattern) { + return new Regex(pattern); + } + + /** + Checks if re2j is available (on classpath) and enabled (via system property). + @return true if re2j is available and enabled + */ + public static boolean usingRe2j() { + return hasRe2j && wantsRe2j(); + } + + static boolean wantsRe2j() { + return Boolean.parseBoolean(System.getProperty(SharedConstants.UseRe2j, "true")); + } + + static void wantsRe2j(boolean use) { + System.setProperty(SharedConstants.UseRe2j, Boolean.toString(use)); + } + + static boolean hasRe2j() { + try { + Class re2 = Class.forName("com.google.re2j.Pattern", false, Regex.class.getClassLoader()); // check if re2j is in classpath + try { + // if it is, and we are on JVM9+, we need to dork around with modules, because re2j doesn't publish a module name. + // done via reflection so we can still run on JVM 8. 
+ // todo remove if re2j publishes as a module + Class moduleCls = Class.forName("java.lang.Module"); + Method getModule = Class.class.getMethod("getModule"); + Object jsoupMod = getModule.invoke(Regex.class); + Object re2Mod = getModule.invoke(re2); + boolean reads = (boolean) moduleCls.getMethod("canRead", moduleCls).invoke(jsoupMod, re2Mod); + if (!reads) moduleCls.getMethod("addReads", moduleCls).invoke(jsoupMod, re2Mod); + } catch (ClassNotFoundException ignore) { + // jvm8 - no Module class; so we can use as-is + } + return true; + } catch (ClassNotFoundException e) { + return false; // no re2j + } catch (ReflectiveOperationException e) { + // unexpectedly couldn’t wire modules on 9+; return false to avoid IllegalAccessError later + System.err.println("Warning: (bug? please report) couldn't access re2j from jsoup due to modules: " + e); + return false; + } + } + + public Matcher matcher(CharSequence input) { + return new JdkMatcher(jdkPattern.matcher(input)); + } + + @Override + public String toString() { + return jdkPattern.toString(); + } + + public interface Matcher { + boolean find(); + } + + private static final class JdkMatcher implements Matcher { + private final java.util.regex.Matcher delegate; + + JdkMatcher(java.util.regex.Matcher delegate) { + this.delegate = delegate; + } + + @Override + public boolean find() { + return delegate.find(); + } + } +} diff --git a/src/main/java/org/jsoup/helper/RequestDispatch.java b/src/main/java/org/jsoup/helper/RequestDispatch.java index fc73e7c99f..e79f3c41a3 100644 --- a/src/main/java/org/jsoup/helper/RequestDispatch.java +++ b/src/main/java/org/jsoup/helper/RequestDispatch.java @@ -9,7 +9,7 @@ import java.lang.reflect.Constructor; /** - Handles requests using either HttpClient (available in JDK 11+) or HttpURLConnection. During initialization, the + Handles requests using either HttpClient (available in JVM 11+) or HttpURLConnection. 
During initialization, the HttpClientExecutor class is used if it can be instantiated, unless the system property {@link SharedConstants#UseHttpClient} is explicitly set to {@code false}. */ @@ -32,6 +32,10 @@ class RequestDispatch { static RequestExecutor get(Request request, @Nullable Response previousResponse) { boolean useHttpClient = Boolean.parseBoolean(System.getProperty(SharedConstants.UseHttpClient, "true")); + + if (request.sslSocketFactory() != null) // downgrade if a socket factory is set, as it can't be supplied to the HttpClient + useHttpClient = false; + if (useHttpClient && clientConstructor != null) { try { return clientConstructor.newInstance(request, previousResponse); diff --git a/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java b/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java index 9164c69d41..c2d6cabfc1 100644 --- a/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java +++ b/src/main/java/org/jsoup/helper/UrlConnectionExecutor.java @@ -1,7 +1,6 @@ package org.jsoup.helper; import org.jsoup.Connection; -import org.jsoup.internal.Functions; import org.jspecify.annotations.Nullable; import javax.net.ssl.HttpsURLConnection; @@ -90,8 +89,15 @@ private static HttpURLConnection createConnection(HttpConnection.Request req) th conn.setConnectTimeout(req.timeout()); conn.setReadTimeout(req.timeout() / 2); // gets reduced after connection is made and status is read - if (req.sslSocketFactory() != null && conn instanceof HttpsURLConnection) - ((HttpsURLConnection) conn).setSSLSocketFactory(req.sslSocketFactory()); + if (conn instanceof HttpsURLConnection) { + HttpsURLConnection scon = (HttpsURLConnection) conn; + if (req.sslContext != null) + scon.setSSLSocketFactory(req.sslContext.getSocketFactory()); + else if (req.sslSocketFactory() != null) + scon.setSSLSocketFactory(req.sslSocketFactory()); + if (req.hostnameVerifier() != null) + scon.setHostnameVerifier(req.hostnameVerifier()); + } if (req.authenticator != null) 
AuthenticationHandler.handler.enable(req.authenticator, conn); // removed in finally if (req.method().hasBody()) @@ -118,7 +124,7 @@ private static LinkedHashMap> createHeaderMap(HttpURLConnec if (key == null || val == null) continue; // skip http1.1 line - final List vals = headers.computeIfAbsent(key, Functions.listFunction()); + final List vals = headers.computeIfAbsent(key, k -> new java.util.ArrayList<>()); vals.add(val); } return headers; diff --git a/src/main/java/org/jsoup/helper/Validate.java b/src/main/java/org/jsoup/helper/Validate.java index cc8dcaf342..d8e29d6e44 100644 --- a/src/main/java/org/jsoup/helper/Validate.java +++ b/src/main/java/org/jsoup/helper/Validate.java @@ -48,7 +48,7 @@ public static void notNull(@Nullable Object obj, String msg) { * @param obj nullable object to cast to not-null * @return the object, or throws an exception if it is null * @throws ValidationException if the object is null - * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead + * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead; will be removed in jsoup 1.24.1 */ @Deprecated public static Object ensureNotNull(@Nullable Object obj) { @@ -65,7 +65,7 @@ public static Object ensureNotNull(@Nullable Object obj) { * @param args the arguments to the msg * @return the object, or throws an exception if it is null * @throws ValidationException if the object is null - * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead + * @deprecated prefer to use {@link #expectNotNull(Object, String, Object...)} instead; will be removed in jsoup 1.24.1 */ @Deprecated public static Object ensureNotNull(@Nullable Object obj, String msg, Object... 
args) { diff --git a/src/main/java/org/jsoup/internal/ControllableInputStream.java b/src/main/java/org/jsoup/internal/ControllableInputStream.java index cf8361805c..c3238a3fab 100644 --- a/src/main/java/org/jsoup/internal/ControllableInputStream.java +++ b/src/main/java/org/jsoup/internal/ControllableInputStream.java @@ -20,19 +20,19 @@ // reimplemented from ConstrainableInputStream for JDK21 - extending BufferedInputStream will pin threads during read public class ControllableInputStream extends FilterInputStream { private final SimpleBufferedInput buff; // super.in, but typed as SimpleBufferedInput - private int maxSize; - private long startTime; - private long timeout = 0; // optional max time of request - private int remaining; - private int markPos; - private boolean interrupted; - private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser + private int maxSize; // logical cap exposed to callers (0 == unlimited) + private long startTime; // start time for timeout checks, nanos + private long timeout = 0; // optional max time of request + private int remaining; // how many bytes may still be returned to caller under the current cap + private int markPos; // logical readPos snapshot for InputStream.mark/reset (not a buffer cursor) + private boolean interrupted; // true if Thread.interrupted() was detected, used to latch interrupted state + private boolean allowClose = true; // for cases where we want to re-read the input, can ignore .close() from the parser // if we are tracking progress, will have the expected content length, progress callback, connection private @Nullable Progress progress; private @Nullable Object progressContext; - private int contentLength = -1; - private int readPos = 0; // amount read; can be reset() + private int contentLength = -1; // expected content length for progress; -1 == unknown + private int readPos = 0; // amount read; can be reset() private 
ControllableInputStream(SimpleBufferedInput in, int maxSize) { super(in); @@ -85,6 +85,7 @@ public int read(byte[] b, int off, int len) throws IOException { if (capped && len > remaining) len = remaining; // don't read more than desired, even if available + buff.capRemaining(capped ? remaining : Integer.MAX_VALUE); while (true) { // loop trying to read until we get some data or hit the overall timeout, if we have one if (expired()) @@ -95,7 +96,9 @@ public int read(byte[] b, int off, int len) throws IOException { if (read == -1) { // completed contentLength = readPos; } else { - remaining -= read; + if (capped && read > 0) { + remaining -= read; // track bytes returned to the caller + } readPos += read; } emitProgress(); @@ -107,6 +110,11 @@ public int read(byte[] b, int off, int len) throws IOException { } } + @Override + public boolean markSupported() { + return true; + } + /** * Reads this inputstream to a ByteBuffer. The supplied max may be less than the inputstream's max, to support * reading just the first bytes. 
@@ -145,15 +153,24 @@ public static ByteBuffer readToByteBuffer(InputStream in, int max) throws IOExce @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs @Override public void reset() throws IOException { - super.reset(); - remaining = maxSize - markPos; + if (markPos < 0) throw new IOException("Resetting to invalid mark"); + buff.rewindToMark(); + buff.clearMark(); + if (maxSize != 0) { + remaining = maxSize - markPos; + buff.capRemaining(remaining); + } else { + remaining = 0; + buff.capRemaining(Integer.MAX_VALUE); + } readPos = markPos; // readPos is used for progress emits + markPos = -1; } @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // not synchronized in later JDKs @Override public void mark(int readlimit) { - super.mark(readlimit); - markPos = maxSize - remaining; + markPos = readPos; + buff.setMark(); } /** @@ -165,6 +182,10 @@ public boolean baseReadFully() { return buff.baseReadFully(); } + public void resetFullyRead() { + buff.resetFullyRead(); + } + /** Get the max size of this stream (how far at most will be read from the underlying stream) * @return the max size @@ -175,7 +196,9 @@ public int max() { public void max(int newMax) { remaining += newMax - maxSize; // update remaining to reflect the difference in the new maxsize + if (remaining < 0) remaining = 0; maxSize = newMax; + buff.capRemaining(newMax == 0 ? Integer.MAX_VALUE : remaining); } public void allowClose(boolean allowClose) { diff --git a/src/main/java/org/jsoup/internal/Functions.java b/src/main/java/org/jsoup/internal/Functions.java index 40227d8417..3d5d636416 100644 --- a/src/main/java/org/jsoup/internal/Functions.java +++ b/src/main/java/org/jsoup/internal/Functions.java @@ -11,8 +11,10 @@ /** * An internal class containing functions for use with {@link Map#computeIfAbsent(Object, Function)}. + * @deprecated for removal in jsoup 1.23.1. Replace usages with direct constructor references / lambdas. 
*/ @SuppressWarnings({"rawtypes", "unchecked"}) +@Deprecated public final class Functions { private static final Function ListFunction = key -> new ArrayList<>(); private static final Function SetFunction = key -> new HashSet<>(); diff --git a/src/main/java/org/jsoup/internal/Normalizer.java b/src/main/java/org/jsoup/internal/Normalizer.java index 3659e14956..9fe85df336 100644 --- a/src/main/java/org/jsoup/internal/Normalizer.java +++ b/src/main/java/org/jsoup/internal/Normalizer.java @@ -23,7 +23,8 @@ public static String normalize(final String input) { /** If a string literal, just lower case the string; otherwise lower-case and trim. - @deprecated internal function; will be removed in a future version. + @deprecated internal helper; replace with {@link #lowerCase(String)} for no-trim, or {@link #normalize(String)} for trim + lowercase. + Will be removed in jsoup 1.24.1. */ @Deprecated public static String normalize(final String input, boolean isStringLiteral) { diff --git a/src/main/java/org/jsoup/internal/SharedConstants.java b/src/main/java/org/jsoup/internal/SharedConstants.java index baff2a36b6..8be99cb73b 100644 --- a/src/main/java/org/jsoup/internal/SharedConstants.java +++ b/src/main/java/org/jsoup/internal/SharedConstants.java @@ -21,5 +21,7 @@ public final class SharedConstants { public static final String UseHttpClient = "jsoup.useHttpClient"; + public static final String UseRe2j = "jsoup.useRe2j"; // enables use of the re2j regular expression engine when true and it's on the classpath + private SharedConstants() {} } diff --git a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java index c76dedc7d4..dafbd3af4a 100644 --- a/src/main/java/org/jsoup/internal/SimpleBufferedInput.java +++ b/src/main/java/org/jsoup/internal/SimpleBufferedInput.java @@ -17,11 +17,12 @@ class SimpleBufferedInput extends FilterInputStream { static final int BufferSize = DefaultBufferSize; static final SoftPool 
BufferPool = new SoftPool<>(() -> new byte[BufferSize]); + private int capRemaining = Integer.MAX_VALUE; // how many bytes we are allowed to pull from the underlying stream private byte @Nullable [] byteBuf; // the byte buffer; recycled via SoftPool. Created in fill if required private int bufPos; private int bufLength; - private int bufMark = -1; + private int bufMark = -1; // mark set by ControllableInputStream; -1 when unset private boolean inReadFully = false; // true when the underlying inputstream has been read fully SimpleBufferedInput(@Nullable InputStream in) { @@ -50,12 +51,6 @@ public int read(byte[] dest, int offset, int desiredLen) throws IOException { int bufAvail = bufLength - bufPos; if (bufAvail <= 0) { // can't serve from the buffer - if (!inReadFully && bufMark < 0) { - // skip creating / copying into a local buffer; just pass through - int read = in.read(dest, offset, desiredLen); - closeIfDone(read); - return read; - } fill(); bufAvail = bufLength - bufPos; } @@ -76,38 +71,25 @@ private void fill() throws IOException { byteBuf = BufferPool.borrow(); } - if (bufMark < 0) { // no mark, can lose buffer (assumes we've read to bufLen) - bufPos = 0; - } else if (bufPos >= BufferSize) { // no room left in buffer - if (bufMark > 0) { // can throw away early part of the buffer - int size = bufPos - bufMark; - System.arraycopy(byteBuf, bufMark, byteBuf, 0, size); - bufPos = size; - bufMark = 0; - } else { // invalidate mark - bufMark = -1; - bufPos = 0; - } - } + compact(); bufLength = bufPos; - int read = in.read(byteBuf, bufPos, byteBuf.length - bufPos); + int toRead = Math.min(byteBuf.length - bufPos, capRemaining); + if (toRead <= 0) return; + int read = in.read(byteBuf, bufPos, toRead); if (read > 0) { bufLength = read + bufPos; - while (byteBuf.length - bufLength > 0) { // read in more if we have space, without blocking + capRemaining -= read; + while (byteBuf.length - bufLength > 0 && capRemaining > 0) { // read in more if we have space, without 
blocking if (in.available() < 1) break; - read = in.read(byteBuf, bufLength, byteBuf.length - bufLength); + toRead = Math.min(byteBuf.length - bufLength, capRemaining); + if (toRead <= 0) break; + read = in.read(byteBuf, bufLength, toRead); if (read <= 0) break; bufLength += read; + capRemaining -= read; } } - closeIfDone(read); - } - - private void closeIfDone(int read) throws IOException { - if (read == -1) { - inReadFully = true; - super.close(); // close underlying stream immediately; frees resources a little earlier - } + if (read == -1) inReadFully = true; } byte[] getBuf() { @@ -123,30 +105,55 @@ boolean baseReadFully() { return inReadFully; } - @Override - public int available() throws IOException { - if (byteBuf != null && bufLength - bufPos > 0) - return bufLength - bufPos; // doesn't include those in.available(), but mostly used as a block test - return inReadFully ? 0 : in.available(); + void resetFullyRead() { + if (in != null) // for null-wrapped streams, leave as fully read to avoid fill() on a null input + inReadFully = false; } - @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced @Override - public void mark(int readlimit) { - if (readlimit > BufferSize) { - throw new IllegalArgumentException("Read-ahead limit is greater than buffer size"); + public int available() throws IOException { + int buffered = (byteBuf != null) ? (bufLength - bufPos) : 0; + if (buffered > 0) { + return buffered; // doesn't include those in.available(), but mostly used as a block test } + int avail = inReadFully ? 
0 : in.available(); + return avail; + } + + void capRemaining(int newRemaining) { + capRemaining = Math.max(0, newRemaining); + } + + void setMark() { bufMark = bufPos; } - @SuppressWarnings("NonSynchronizedMethodOverridesSynchronizedMethod") // explicitly not synced - @Override - public void reset() throws IOException { + void rewindToMark() throws IOException { if (bufMark < 0) throw new IOException("Resetting to invalid mark"); bufPos = bufMark; } + void clearMark() { + bufMark = -1; + } + + private void compact() { + if (byteBuf == null || bufPos == 0) return; + int keepFrom = bufMark >= 0 ? bufMark : bufPos; + if (keepFrom <= 0) return; + + int remaining = bufLength - keepFrom; + if (remaining > 0) { + System.arraycopy(byteBuf, keepFrom, byteBuf, 0, remaining); + } + bufLength = remaining; + bufPos -= keepFrom; + if (bufMark >= 0) { + bufMark -= keepFrom; + } + } + @Override public void close() throws IOException { if (in != null) super.close(); diff --git a/src/main/java/org/jsoup/internal/StringUtil.java b/src/main/java/org/jsoup/internal/StringUtil.java index a953a86ffa..50c650e4c4 100644 --- a/src/main/java/org/jsoup/internal/StringUtil.java +++ b/src/main/java/org/jsoup/internal/StringUtil.java @@ -148,8 +148,8 @@ public static String padding(int width, int maxPaddingWidth) { * @param string string to test * @return if string is blank */ - public static boolean isBlank(final String string) { - if (string == null || string.length() == 0) + public static boolean isBlank(@Nullable String string) { + if (string == null || string.isEmpty()) return true; int l = string.length(); diff --git a/src/main/java/org/jsoup/nodes/Attribute.java b/src/main/java/org/jsoup/nodes/Attribute.java index d90ed7fa2b..a99f1dd153 100644 --- a/src/main/java/org/jsoup/nodes/Attribute.java +++ b/src/main/java/org/jsoup/nodes/Attribute.java @@ -202,13 +202,13 @@ static void html(String key, @Nullable String val, QuietAppendable accum, Docume htmlNoValidate(key, val, accum, out); } - 
/** @deprecated internal method and will be removed in a future version */ + /** @deprecated internal method; use {@link #html(String, String, QuietAppendable, Document.OutputSettings)} with {@link org.jsoup.internal.QuietAppendable#wrap(Appendable)} instead. Will be removed in jsoup 1.24.1. */ @Deprecated protected void html(Appendable accum, Document.OutputSettings out) throws IOException { html(key, val, accum, out); } - /** @deprecated internal method and will be removed in a future version */ + /** @deprecated internal method; use {@link #html(String, String, QuietAppendable, Document.OutputSettings)} with {@link org.jsoup.internal.QuietAppendable#wrap(Appendable)} instead. Will be removed in jsoup 1.24.1. */ @Deprecated protected static void html(String key, @Nullable String val, Appendable accum, Document.OutputSettings out) throws IOException { html(key, val, QuietAppendable.wrap(accum), out); @@ -306,7 +306,7 @@ protected static boolean isDataAttribute(String key) { * * @param out output settings * @return Returns whether collapsible or not - * @deprecated internal method and will be removed in a future version + * @deprecated internal method; use {@link #shouldCollapseAttribute(String, String, Document.OutputSettings)} instead. Will be removed in jsoup 1.24.1. */ @Deprecated protected final boolean shouldCollapseAttribute(Document.OutputSettings out) { diff --git a/src/main/java/org/jsoup/nodes/Attributes.java b/src/main/java/org/jsoup/nodes/Attributes.java index 9fcf033ec5..eb400729a5 100644 --- a/src/main/java/org/jsoup/nodes/Attributes.java +++ b/src/main/java/org/jsoup/nodes/Attributes.java @@ -159,7 +159,7 @@ public Attributes put(String key, @Nullable String value) { if (i != NotFound) vals[i] = value; else - add(key, value); + addObject(key, value); return this; } @@ -183,6 +183,13 @@ Map userData() { return userData; } + /** + Check if these attributes have any user data associated with them. 
+ */ + boolean hasUserData() { + return hasKey(SharedConstants.UserDataKey); + } + /** Get an arbitrary user-data object by key. * @param key case-sensitive key to the object. @@ -193,7 +200,7 @@ Map userData() { @Nullable public Object userData(String key) { Validate.notNull(key); - if (!hasKey(SharedConstants.UserDataKey)) return null; // no user data exists + if (!hasUserData()) return null; // no user data exists Map userData = userData(); return userData.get(key); } @@ -225,7 +232,7 @@ void putIgnoreCase(String key, @Nullable String value) { keys[i] = key; } else - add(key, value); + addObject(key, value); } /** @@ -365,7 +372,7 @@ public void addAll(Attributes incoming) { if (needsPut) put(attr); else - add(attr.getKey(), attr.getValue()); + addObject(attr.getKey(), attr.getValue()); } } diff --git a/src/main/java/org/jsoup/nodes/Document.java b/src/main/java/org/jsoup/nodes/Document.java index 49051bcc45..db25382f95 100644 --- a/src/main/java/org/jsoup/nodes/Document.java +++ b/src/main/java/org/jsoup/nodes/Document.java @@ -220,7 +220,10 @@ public void title(String title) { @return new element */ public Element createElement(String tagName) { - return new Element(parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), this.baseUri()); + return new Element( + parser.tagSet().valueOf(tagName, parser.defaultNamespace(), ParseSettings.preserveCase), + searchUpForAttribute(this, BaseUriKey) + ); } @Override diff --git a/src/main/java/org/jsoup/nodes/Element.java b/src/main/java/org/jsoup/nodes/Element.java index 36119e74dd..3e8388027f 100644 --- a/src/main/java/org/jsoup/nodes/Element.java +++ b/src/main/java/org/jsoup/nodes/Element.java @@ -3,6 +3,7 @@ import org.jsoup.helper.Validate; import org.jsoup.internal.Normalizer; import org.jsoup.internal.QuietAppendable; +import org.jsoup.helper.Regex; import org.jsoup.internal.StringUtil; import org.jsoup.parser.ParseSettings; import org.jsoup.parser.Parser; @@ -50,7 +51,7 @@ public 
class Element extends Node implements Iterable { private static final List EmptyChildren = Collections.emptyList(); private static final NodeList EmptyNodeList = new NodeList(0); private static final Pattern ClassSplit = Pattern.compile("\\s+"); - private static final String BaseUriKey = Attributes.internalKey("baseUri"); + static final String BaseUriKey = Attributes.internalKey("baseUri"); Tag tag; NodeList childNodes; @Nullable Attributes attributes; // field is nullable but all methods for attributes are non-null @@ -87,8 +88,7 @@ public Element(Tag tag, @Nullable String baseUri, @Nullable Attributes attribute childNodes = EmptyNodeList; this.attributes = attributes; this.tag = tag; - if (baseUri != null) - this.setBaseUri(baseUri); + if (!StringUtil.isBlank(baseUri)) this.setBaseUri(baseUri); } /** @@ -130,17 +130,19 @@ public Attributes attributes() { @Override public String baseUri() { - return searchUpForAttribute(this, BaseUriKey); + String baseUri = searchUpForAttribute(this, BaseUriKey); + return baseUri != null ? 
baseUri : ""; } - private static String searchUpForAttribute(final Element start, final String key) { + @Nullable + static String searchUpForAttribute(final Element start, final String key) { Element el = start; while (el != null) { if (el.attributes != null && el.attributes.hasKey(key)) return el.attributes.get(key); el = el.parent(); } - return ""; + return null; } @Override @@ -355,7 +357,18 @@ public Elements parents() { * @see #childNode(int) */ public Element child(int index) { - return childElementsList().get(index); + Validate.isTrue(index >= 0, "Index must be >= 0"); + List cached = cachedChildren(); + if (cached != null) return cached.get(index); + // otherwise, iter on elementChild; saves creating list + int size = childNodes.size(); + for (int i = 0, e = 0; i < size; i++) { // direct iter is faster than chasing firstElSib, nextElSibd + Node node = childNodes.get(i); + if (node instanceof Element) { + if (e++ == index) return (Element) node; + } + } + throw new IndexOutOfBoundsException("No child at index: " + index); } /** @@ -370,7 +383,8 @@ public Element child(int index) { * @see #child(int) */ public int childrenSize() { - return childElementsList().size(); + if (childNodeSize() == 0) return 0; + return childElementsList().size(); // gets children into cache; faster subsequent child(i) if unmodified } /** @@ -406,8 +420,9 @@ List childElementsList() { private static final String childElsMod = "jsoup.childElsMod"; /** returns the cached child els, if they exist, and the modcount of our childnodes matches the stashed modcount */ - private @Nullable List cachedChildren() { - Map userData = attributes().userData(); + @Nullable List cachedChildren() { + if (attributes == null || !attributes.hasUserData()) return null; // don't create empty userdata + Map userData = attributes.userData(); //noinspection unchecked WeakReference> ref = (WeakReference>) userData.get(childElsKey); if (ref != null) { @@ -872,10 +887,7 @@ public Element insertChildren(int 
index, Collection children) { int currentSize = childNodeSize(); if (index < 0) index += currentSize +1; // roll around Validate.isTrue(index >= 0 && index <= currentSize, "Insert position out of bounds."); - - ArrayList nodes = new ArrayList<>(children); - Node[] nodeArray = nodes.toArray(new Node[0]); - addChildren(index, nodeArray); + addChildren(index, children.toArray(new Node[0])); return this; } @@ -1054,9 +1066,9 @@ public Element after(Node node) { @Override public Element empty() { // Detach each of the children -> parent links: - for (Node child : childNodes) { - child.parentNode = null; - } + int size = childNodes.size(); + for (int i = 0; i < size; i++) + childNodes.get(i).parentNode = null; childNodes.clear(); return this; } @@ -1236,10 +1248,10 @@ private static int indexInList(Element search, List eleme @since 1.15.2 */ public @Nullable Element firstElementChild() { - Node child = firstChild(); - while (child != null) { - if (child instanceof Element) return (Element) child; - child = child.nextSibling(); + int size = childNodes.size(); + for (int i = 0; i < size; i++) { + Node node = childNodes.get(i); + if (node instanceof Element) return (Element) node; } return null; } @@ -1252,10 +1264,9 @@ private static int indexInList(Element search, List eleme @since 1.15.2 */ public @Nullable Element lastElementChild() { - Node child = lastChild(); - while (child != null) { - if (child instanceof Element) return (Element) child; - child = child.previousSibling(); + for (int i = childNodes.size() - 1; i >= 0; i--) { + Node node = childNodes.get(i); + if (node instanceof Element) return (Element) node; } return null; } @@ -1394,7 +1405,6 @@ public Elements getElementsByAttributeValueContaining(String key, String match) */ public Elements getElementsByAttributeValueMatching(String key, Pattern pattern) { return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this); - } /** @@ -1404,13 +1414,13 @@ public Elements 
getElementsByAttributeValueMatching(String key, Pattern pattern) * @return elements that have attributes matching this regular expression */ public Elements getElementsByAttributeValueMatching(String key, String regex) { - Pattern pattern; + Regex pattern; try { - pattern = Pattern.compile(regex); + pattern = Regex.compile(regex); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("Pattern syntax error: " + regex, e); } - return getElementsByAttributeValueMatching(key, pattern); + return Collector.collect(new Evaluator.AttributeWithValueMatching(key, pattern), this); } /** @@ -1479,13 +1489,13 @@ public Elements getElementsMatchingText(Pattern pattern) { * @see Element#text() */ public Elements getElementsMatchingText(String regex) { - Pattern pattern; + Regex pattern; try { - pattern = Pattern.compile(regex); + pattern = Regex.compile(regex); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("Pattern syntax error: " + regex, e); } - return getElementsMatchingText(pattern); + return Collector.collect(new Evaluator.Matches(pattern), this); } /** @@ -1505,13 +1515,13 @@ public Elements getElementsMatchingOwnText(Pattern pattern) { * @see Element#ownText() */ public Elements getElementsMatchingOwnText(String regex) { - Pattern pattern; + Regex pattern; try { - pattern = Pattern.compile(regex); + pattern = Regex.compile(regex); } catch (PatternSyntaxException e) { throw new IllegalArgumentException("Pattern syntax error: " + regex, e); } - return getElementsMatchingOwnText(pattern); + return Collector.collect(new Evaluator.MatchesOwn(pattern), this); } /** @@ -2066,12 +2076,36 @@ public Element filter(NodeFilter nodeFilter) { } static final class NodeList extends ArrayList { + /** Tracks if the children have valid sibling indices. We only need to reindex on siblingIndex() demand. */ + boolean validChildren = true; + public NodeList(int size) { super(size); } + /** The modCount is used to invalidate the cached element children. 
*/ int modCount() { return this.modCount; } + + void incrementMod() { + this.modCount++; + } + } + + void reindexChildren() { + final int size = childNodes.size(); + for (int i = 0; i < size; i++) { + childNodes.get(i).setSiblingIndex(i); + } + childNodes.validChildren = true; + } + + void invalidateChildren() { + childNodes.validChildren = false; + } + + boolean hasValidChildren() { + return childNodes.validChildren; } } diff --git a/src/main/java/org/jsoup/nodes/Node.java b/src/main/java/org/jsoup/nodes/Node.java index e0a4bfe83b..9b39564351 100644 --- a/src/main/java/org/jsoup/nodes/Node.java +++ b/src/main/java/org/jsoup/nodes/Node.java @@ -366,8 +366,12 @@ public Node root() { * @return the Document associated with this Node, or null if there is no such Document. */ public @Nullable Document ownerDocument() { - Node root = root(); - return (root instanceof Document) ? (Document) root : null; + Node node = this; + while (node != null) { + if (node instanceof Document) return (Document) node; + node = node.parentNode; + } + return null; } /** @@ -386,7 +390,7 @@ public void remove() { * @see #after(String) */ public Node before(String html) { - addSiblingHtml(siblingIndex, html); + addSiblingHtml(siblingIndex(), html); return this; } @@ -403,7 +407,7 @@ public Node before(Node node) { // if the incoming node is a sibling of this, remove it first so siblingIndex is correct on add if (node.parentNode == parentNode) node.remove(); - parentNode.addChildren(siblingIndex, node); + parentNode.addChildren(siblingIndex(), node); return this; } @@ -414,7 +418,7 @@ public Node before(Node node) { * @see #before(String) */ public Node after(String html) { - addSiblingHtml(siblingIndex + 1, html); + addSiblingHtml(siblingIndex() + 1, html); return this; } @@ -431,7 +435,7 @@ public Node after(Node node) { // if the incoming node is a sibling of this, remove it first so siblingIndex is correct on add if (node.parentNode == parentNode) node.remove(); - 
parentNode.addChildren(siblingIndex + 1, node); + parentNode.addChildren(siblingIndex() + 1, node); return this; } @@ -505,7 +509,7 @@ public Node wrap(String html) { public @Nullable Node unwrap() { Validate.notNull(parentNode); Node firstChild = firstChild(); - parentNode.addChildren(siblingIndex, this.childNodesAsArray()); + parentNode.addChildren(siblingIndex(), this.childNodesAsArray()); this.remove(); return firstChild; @@ -547,19 +551,24 @@ protected void replaceChild(Node out, Node in) { if (in.parentNode != null) in.parentNode.removeChild(in); - final int index = out.siblingIndex; + final int index = out.siblingIndex(); ensureChildNodes().set(index, in); - assert this instanceof Element; in.parentNode = (Element) this; in.setSiblingIndex(index); out.parentNode = null; + + ((Element) this).childNodes.incrementMod(); // as mod count not changed in set(), requires explicit update, to invalidate the child element cache } protected void removeChild(Node out) { Validate.isTrue(out.parentNode == this); - final int index = out.siblingIndex; - ensureChildNodes().remove(index); - reindexChildren(index); + Element el = (Element) this; + if (el.hasValidChildren()) // can remove by index + ensureChildNodes().remove(out.siblingIndex); + else + ensureChildNodes().remove(out); // iterates, but potentially not every one + + el.invalidateChildren(); out.parentNode = null; } @@ -575,10 +584,9 @@ protected void addChildren(Node... children) { } protected void addChildren(int index, Node... children) { + // todo clean up all these and use the list, not the var array. just need to be careful when iterating the incoming (as we are removing as we go) Validate.notNull(children); - if (children.length == 0) { - return; - } + if (children.length == 0) return; final List nodes = ensureChildNodes(); // fast path - if used as a wrap (index=0, children = child[0].parent.children - do inplace @@ -595,7 +603,6 @@ protected void addChildren(int index, Node... 
children) { } } if (sameList) { // moving, so OK to empty firstParent and short-circuit - boolean wasEmpty = childNodeSize() == 0; firstParent.empty(); nodes.addAll(index, Arrays.asList(children)); i = children.length; @@ -603,8 +610,7 @@ protected void addChildren(int index, Node... children) { while (i-- > 0) { children[i].parentNode = (Element) this; } - if (!(wasEmpty && children[0].siblingIndex == 0)) // skip reindexing if we just moved - reindexChildren(index); + ((Element) this).invalidateChildren(); return; } } @@ -614,22 +620,13 @@ protected void addChildren(int index, Node... children) { reparentChild(child); } nodes.addAll(index, Arrays.asList(children)); - reindexChildren(index); + ((Element) this).invalidateChildren(); } protected void reparentChild(Node child) { child.setParentNode(this); } - private void reindexChildren(int start) { - final int size = childNodeSize(); - if (size == 0) return; - final List childNodes = ensureChildNodes(); - for (int i = start; i < size; i++) { - childNodes.get(i).setSiblingIndex(i); - } - } - /** Retrieves this node's sibling nodes. Similar to {@link #childNodes() node.parent.childNodes()}, but does not include this node (a node is not a sibling of itself). 
@@ -656,10 +653,12 @@ public List siblingNodes() { return null; // root final List siblings = parentNode.ensureChildNodes(); - final int index = siblingIndex+1; - if (siblings.size() > index) - return siblings.get(index); - else + final int index = siblingIndex() + 1; + if (siblings.size() > index) { + Node node = siblings.get(index); + assert (node.siblingIndex == index); // sanity test that invalidations haven't missed + return node; + } else return null; } @@ -671,7 +670,7 @@ public List siblingNodes() { if (parentNode == null) return null; // root - if (siblingIndex > 0) + if (siblingIndex() > 0) return parentNode.ensureChildNodes().get(siblingIndex-1); else return null; @@ -684,6 +683,9 @@ public List siblingNodes() { * @see org.jsoup.nodes.Element#elementSiblingIndex() */ public int siblingIndex() { + if (parentNode != null && !parentNode.childNodes.validChildren) + parentNode.reindexChildren(); + return siblingIndex; } @@ -900,7 +902,7 @@ public String toString() { return outerHtml(); } - /** @deprecated internal method moved into Printer; will be removed in a future version */ + /** @deprecated internal method moved into Printer; will be removed in jsoup 1.24.1. */ @Deprecated protected void indent(Appendable accum, int depth, Document.OutputSettings out) throws IOException { accum.append('\n').append(StringUtil.padding(depth * out.indentAmount(), out.maxPaddingWidth())); @@ -1006,7 +1008,7 @@ protected Node doClone(@Nullable Node parent) { } clone.parentNode = (Element) parent; // can be null, to create an orphan split - clone.siblingIndex = parent == null ? 0 : siblingIndex; + clone.siblingIndex = parent == null ? 
0 : siblingIndex(); // if not keeping the parent, shallowClone the ownerDocument to preserve its settings if (parent == null && !(this instanceof Document)) { Document doc = ownerDocument(); diff --git a/src/main/java/org/jsoup/nodes/Printer.java b/src/main/java/org/jsoup/nodes/Printer.java index 20b3266df1..6b83fdbfe2 100644 --- a/src/main/java/org/jsoup/nodes/Printer.java +++ b/src/main/java/org/jsoup/nodes/Printer.java @@ -159,6 +159,7 @@ boolean isBlockEl(@Nullable Node node) { if (node == null) return false; if (node instanceof Element) { Element el = (Element) node; + if (el.nameIs("br")) return true; // give
a newline; actually an inline tag return el.isBlock() || (!el.tag.isKnownTag() && (el.parentNode instanceof Document || hasChildBlocks(el))); } diff --git a/src/main/java/org/jsoup/nodes/PseudoTextElement.java b/src/main/java/org/jsoup/nodes/PseudoTextElement.java index d6f0f9b4ce..9ceb41507c 100644 --- a/src/main/java/org/jsoup/nodes/PseudoTextElement.java +++ b/src/main/java/org/jsoup/nodes/PseudoTextElement.java @@ -6,7 +6,8 @@ /** * Represents a {@link TextNode} as an {@link Element}, to enable text nodes to be selected with * the {@link org.jsoup.select.Selector} {@code :matchText} syntax. - * @deprecated use {@link Element#selectNodes(String, Class)} instead, with selector of ::textnode and class TextNode. + * @deprecated use {@link Element#selectNodes(String, Class)} instead, with selector of ::textnode and class TextNode; + * will be removed in jsoup 1.24.1. */ @Deprecated public class PseudoTextElement extends Element { diff --git a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java index f539de93eb..fc873eb298 100644 --- a/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java +++ b/src/main/java/org/jsoup/parser/HtmlTreeBuilder.java @@ -36,7 +36,7 @@ public class HtmlTreeBuilder extends TreeBuilder { "annotation-xml", "mi", "mn", "mo", "ms", "mtext" }; static final String[]TagSearchInScopeSvg = new String[] { - "desc", "foreignObject", "title" + "desc", "foreignobject", "title" // note normalized to lowercase to match other scope searches; will preserve input case as appropriate }; static final String[] TagSearchList = new String[]{"ol", "ul"}; @@ -60,7 +60,9 @@ public class HtmlTreeBuilder extends TreeBuilder { "button", "fieldset", "input", "keygen", "object", "output", "select", "textarea" }; - public static final int MaxScopeSearchDepth = 100; // prevents the parser bogging down in exceptionally broken pages + /** @deprecated Not used anymore; configure parser depth via {@link 
Parser#setMaxDepth(int)}. Will be removed in jsoup 1.24.1. */ + @Deprecated + public static final int MaxScopeSearchDepth = 100; private HtmlTreeBuilderState state; // the current state private HtmlTreeBuilderState originalState; // original / marked state @@ -306,9 +308,9 @@ void error(HtmlTreeBuilderState state) { Element createElementFor(Token.StartTag startTag, String namespace, boolean forcePreserveCase) { // dedupe and normalize the attributes: Attributes attributes = startTag.attributes; - if (!forcePreserveCase) - attributes = settings.normalizeAttributes(attributes); if (attributes != null && !attributes.isEmpty()) { + if (!forcePreserveCase) + settings.normalizeAttributes(attributes); int dupes = attributes.deduplicate(settings); if (dupes > 0) { error("Dropped duplicate attribute(s) in tag [%s]", startTag.normalName); @@ -332,7 +334,9 @@ Element insertElementFor(final Token.StartTag startTag) { if (startTag.isSelfClosing()) { Tag tag = el.tag(); tag.setSeenSelfClose(); // can infer output if in xml syntax - if (tag.isKnownTag() && (tag.isEmpty() || tag.isSelfClosing())) { + if (tag.isEmpty()) { + // treated as empty below; nothing further + } else if (tag.isKnownTag() && tag.isSelfClosing()) { // ok, allow it. effectively a pop, but fiddles with the state. handles empty style, title etc which would otherwise leave us in data state tokeniser.transition(TokeniserState.Data); // handles "; @@ -2114,4 +2182,151 @@ static void assertErrorsDoNotContain(String msg, ParseErrorList errors) { assertEquals("a < b", data.data()); assertEquals("a < b", data.outerHtml()); } + + @Test void dropsNullsFromBody() { + // https://github.com/jhy/jsoup/issues/2395 + String html = "

\u0000

\u0000\u0000

Hi\u0000

"; + + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + + Document doc = Jsoup.parse(html, parser); + assertEquals("

\n

\n

Hi

", doc.body().html()); + assertEquals("Hi", doc.body().text()); + + ParseErrorList errors = parser.getErrors(); + assertEquals(4, errors.size()); + assertEquals("<1:4>: Unexpected character '\u0000' in input state [Data]", errors.get(0).toString()); + assertEquals("<1:12>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString()); + assertEquals("<1:13>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString()); + assertEquals("<1:23>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString()); + // todo should we replace that null, for convenience? + } + + @Test void replacesNullsInForeign() { + String html = "\u0000\u0000\u0000Hi\u0000"; + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + + Document doc = Jsoup.parse(html, parser); + assertEquals("\n ��Hi�\n", doc.body().html()); + assertEquals("���Hi�", doc.body().text()); + + ParseErrorList errors = parser.getErrors(); + assertEquals(4, errors.size()); + assertEquals("<1:12>: Unexpected character '\u0000' in input state [Data]", errors.get(0).toString()); + assertEquals("<1:26>: Unexpected character '\u0000' in input state [Data]", errors.get(1).toString()); + assertEquals("<1:27>: Unexpected character '\u0000' in input state [Data]", errors.get(2).toString()); + assertEquals("<1:43>: Unexpected character '\u0000' in input state [Data]", errors.get(3).toString()); + } + + @Nested class DeepHtmlTrees { + private int depth(Element el) { + int depth = 0; + while ((el = el.parent()) != null) { + depth++; + } + return depth; + } + + /** + * Parse the HTML code in `contents`, wrapped in enough divs to ensure that the root elements + * of contents are at depth `startingDepth`. + */ + private Element parseDeepHtml(int startingDepth, String contents) { + StringBuilder html = new StringBuilder(); + html.append(""); + for (int i = 0; i < startingDepth - 4; i++) { + html.append("
"); + } + html.append("
"); + html.append(contents); + + Parser parser = Parser.htmlParser(); + Document doc = Jsoup.parse(html.toString(), parser); + Element container = doc.getElementById("container"); + assertNotNull(container); + assertEquals(startingDepth - 1, depth(container)); + + return container; + } + + @Test void nestedDivs() { + Element container = parseDeepHtml(511, "
"); + + assertEquals("
\n
\n
\n
", container.html()); + } + + @Test void closingTagOfTagClosedByDepthLimit() { + // The tag would be nested too deep, so it first closes the innermost . + // This means that the first will close the outer , as it's the only + // one that is currently open. The last is then just ignored, as there is no + // open left to close. + Element container = parseDeepHtml(511, ""); + + assertEquals("", container.html()); + } + + @Test void tableAtDepthLimitWithDirectTd() { + Element container = parseDeepHtml(512, "\n\n", container.html()); + } + + @Test void tableRightBeforeDepthLimitWithDirectTd() { + Element container = parseDeepHtml(511, "
"); + + assertEquals("
\n
"); + + assertEquals("\n \n \n \n
", container.html()); + } + + @Test void customDepthLimit() { + Parser parser = Parser.htmlParser().setMaxDepth(5); + String input = "
"; + + Document doc = Jsoup.parse(input, parser); + String expected = new StringBuilder() + .append("\n") + .append(" \n") + .append(" \n") + .append("
\n") + .append("
\n") + .append("
\n") + .append("
\n") + .append("
\n") + .append("
\n") + .append("
\n") + .append("
\n") + .append(" \n") + .append("") + .toString(); + + assertEquals(expected, doc.html()); + } + + @Test void formControlsDetachWhenFormTrimmed() { + Parser parser = Parser.htmlParser().setMaxDepth(3); + String input = "
"; + + Document doc = Jsoup.parse(input, "", parser); + Element formEl = doc.getElementById("f"); + assertNotNull(formEl); + assertTrue(formEl instanceof FormElement); + FormElement form = (FormElement) formEl; + assertEquals("", form.html()); + assertEquals(0, form.elements().size()); + } + + @Test void templateModesClearedWhenTrimmed() { + Parser parser = Parser.htmlParser().setMaxDepth(3); + String input = "

Two

"; + + Document doc = Jsoup.parse(input, "", parser); + Element template = doc.getElementById("tmpl"); + assertNotNull(template); + assertEquals("", template.html()); + Element paragraph = doc.selectFirst("p"); + assertNotNull(paragraph); + assertEquals("Two", paragraph.text()); + } + } } diff --git a/src/test/java/org/jsoup/parser/ParserIT.java b/src/test/java/org/jsoup/parser/ParserIT.java index e1904ddc20..368c772ef3 100644 --- a/src/test/java/org/jsoup/parser/ParserIT.java +++ b/src/test/java/org/jsoup/parser/ParserIT.java @@ -49,8 +49,16 @@ public void handlesDeepStack() { long start = System.currentTimeMillis(); Document doc = Parser.parseBodyFragment(longBody.toString(), ""); + int depth = 1; + Element el = doc.body(); + while (el.childrenSize() > 0) { + el = el.child(0); + depth++; + } + // Assert - assertEquals(2, doc.body().childNodeSize()); + assertEquals(1, doc.body().childrenSize()); + assertEquals(512, depth); assertEquals(25000, doc.select("dd").size()); assertTrue(System.currentTimeMillis() - start < 20000); // I get ~ 1.5 seconds, but others have reported slower // was originally much longer, or stack overflow. 
diff --git a/src/test/java/org/jsoup/parser/ParserSettingsTest.java b/src/test/java/org/jsoup/parser/ParserSettingsTest.java index 7856287cb1..c21bf8341a 100644 --- a/src/test/java/org/jsoup/parser/ParserSettingsTest.java +++ b/src/test/java/org/jsoup/parser/ParserSettingsTest.java @@ -48,8 +48,7 @@ public void attributesCaseNormalization(Locale locale) { Attributes attributes = new Attributes(); attributes.put("ITEM", "1"); - Attributes normalizedAttributes = parseSettings.normalizeAttributes(attributes); - - assertEquals("item", normalizedAttributes.asList().get(0).getKey()); + parseSettings.normalizeAttributes(attributes); + assertEquals("item", attributes.asList().get(0).getKey()); } } diff --git a/src/test/java/org/jsoup/parser/ParserTest.java b/src/test/java/org/jsoup/parser/ParserTest.java index 1418de0ee9..bfebf6beba 100644 --- a/src/test/java/org/jsoup/parser/ParserTest.java +++ b/src/test/java/org/jsoup/parser/ParserTest.java @@ -9,7 +9,10 @@ import java.nio.charset.StandardCharsets; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; public class ParserTest { @@ -30,6 +33,24 @@ public void unescapeEntitiesHandlesLargeInput() { assertEquals(body, Parser.unescapeEntities(body, false)); } + @Test public void unescapeTracksErrors() { + Parser parser = Parser.htmlParser(); + parser.setTrackErrors(10); + + String s = parser.unescape("One &bogus; & > Two", false); + assertEquals("One &bogus; & > Two", s); + ParseErrorList errors = parser.getErrors(); + assertEquals(2, errors.size()); + assertEquals("<1:6>: Invalid character reference: invalid named reference [bogus]", errors.get(0).toString()); + assertEquals("<1:22>: Invalid character reference: missing semicolon on [>]", errors.get(1).toString()); + + // can 
reuse parser; errors will be reset + s = parser.unescape("One & &bogus; Two", false); + assertEquals("One & &bogus; Two", s); + assertEquals(1, parser.getErrors().size()); + assertEquals("<1:12>: Invalid character reference: invalid named reference [bogus]", parser.getErrors().get(0).toString()); + } + @Test public void testUtf8() throws IOException { // testcase for https://github.com/jhy/jsoup/issues/1557. no repro. @@ -58,4 +79,24 @@ public void testClone() { assertEquals(xmlParser.settings().preserveTagCase(), xmlClone.settings().preserveTagCase()); assertEquals(xmlParser.settings().preserveAttributeCase(), xmlClone.settings().preserveAttributeCase()); } + + @Test + public void testCloneCopyTagSet() { + Parser parser = Parser.htmlParser(); + parser.tagSet().add(new Tag("foo")); + parser.tagSet().onNewTag(tag -> tag.set(Tag.SelfClose)); + Parser clone = parser.clone(); + + // Ensure the tagsets are different instances + assertNotSame(clone.tagSet(), parser.tagSet()); + // Check that cloned tagset contains same tag + assertNotNull(clone.tagSet().get("foo", Parser.NamespaceHtml)); + // Ensure onNewTag customizers are retained + Tag custom = clone.tagSet().valueOf("qux", Parser.NamespaceHtml); + assertTrue(custom.isSelfClosing()); + // Check that cloned tagset does not observe modifications made to the original + assertNull(clone.tagSet().get("bar", Parser.NamespaceHtml)); + parser.tagSet().add(new Tag("bar")); + assertNull(clone.tagSet().get("bar", Parser.NamespaceHtml)); + } } diff --git a/src/test/java/org/jsoup/parser/TagSetTest.java b/src/test/java/org/jsoup/parser/TagSetTest.java index 8c8faf15c7..6ba19b8445 100644 --- a/src/test/java/org/jsoup/parser/TagSetTest.java +++ b/src/test/java/org/jsoup/parser/TagSetTest.java @@ -4,6 +4,10 @@ import org.jsoup.nodes.Element; import org.junit.jupiter.api.Test; +import java.lang.reflect.Field; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + import static 
org.jsoup.parser.Parser.NamespaceHtml; import static org.junit.jupiter.api.Assertions.*; @@ -182,4 +186,36 @@ public class TagSetTest { assertTrue(copy.valueOf("custom-tag", NamespaceHtml).is(Tag.Void)); assertFalse(source.valueOf("custom-tag", NamespaceHtml).is(Tag.Void)); } + + @Test void copyPullThroughDoesNotMutateSource() { + TagSet source = TagSet.Html(); + TagSet copy = new TagSet(source); + + int sourceNamespacesBefore = tagSetNamespaceCount(source); + assertNotNull(copy.get("div", NamespaceHtml)); + int sourceNamespacesAfter = tagSetNamespaceCount(source); + assertEquals(sourceNamespacesBefore, sourceNamespacesAfter); + } + + @Test void copyPullWithCustomizerThroughDoesNotMutateSource() { + TagSet source = TagSet.Html(); + TagSet copy = new TagSet(source); + + AtomicInteger sourceAdds = new AtomicInteger(); + source.onNewTag(tag -> sourceAdds.incrementAndGet()); + + assertNotNull(copy.get("div", NamespaceHtml)); + assertEquals(0, sourceAdds.get()); + } + + private static int tagSetNamespaceCount(TagSet tagSet) { + try { + Field tagsField = TagSet.class.getDeclaredField("tags"); + tagsField.setAccessible(true); + Map tags = (Map) tagsField.get(tagSet); + return tags.size(); + } catch (ReflectiveOperationException e) { + throw new RuntimeException(e); + } + } } diff --git a/src/test/java/org/jsoup/parser/TagTest.java b/src/test/java/org/jsoup/parser/TagTest.java index d9f7138980..4ed0e17977 100644 --- a/src/test/java/org/jsoup/parser/TagTest.java +++ b/src/test/java/org/jsoup/parser/TagTest.java @@ -81,6 +81,12 @@ public void canBeInsensitive(Locale locale) { assertFalse(p.isInline()); } + @Test public void brSemantics() { + Tag br = Tag.valueOf("br"); + assertTrue(br.isInline()); + assertFalse(br.isBlock()); + } + @Test public void imgSemantics() { Tag img = Tag.valueOf("img"); assertTrue(img.isInline()); diff --git a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java index dcba7986a1..ff70991896 
100644 --- a/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java +++ b/src/test/java/org/jsoup/parser/XmlTreeBuilderTest.java @@ -647,4 +647,41 @@ private static void assertXmlNamespace(Element el) { assertEquals("

Foo
", TextUtil.stripNewlines(doc.outerHtml())); // we infer that empty els can be represented with self-closing if seen in parse } + + @Test public void xmlParserHasUnlimitedDepthByDefault() { + Parser parser = Parser.xmlParser(); + Document doc = Jsoup.parse(deepXml(600), "", parser); + Element target = doc.selectFirst("target"); + assertNotNull(target); + assertTrue(depth(target) > 512); + } + + @Test public void xmlParserRespectsConfiguredMaxDepth() { + Parser parser = Parser.xmlParser().setMaxDepth(5); + Document doc = Jsoup.parse(deepXml(100), "", parser); + Element target = doc.selectFirst("target"); + assertNotNull(target); + assertEquals(parser.getMaxDepth(), depth(target)); + } + + private static String deepXml(int depth) { + StringBuilder xml = new StringBuilder(""); + for (int i = 0; i < depth; i++) { + xml.append(""); + } + xml.append(""); + for (int i = 0; i < depth; i++) { + xml.append(""); + } + xml.append(""); + return xml.toString(); + } + + private static int depth(Element el) { + int d = 0; + while ((el = el.parent()) != null) { + d++; + } + return d; + } } diff --git a/src/test/java/org/jsoup/select/EvaluatorTest.java b/src/test/java/org/jsoup/select/EvaluatorTest.java index ff456d757d..31c1cc75f5 100644 --- a/src/test/java/org/jsoup/select/EvaluatorTest.java +++ b/src/test/java/org/jsoup/select/EvaluatorTest.java @@ -1,6 +1,7 @@ package org.jsoup.select; import org.jsoup.Jsoup; +import org.jsoup.helper.Regex; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.junit.jupiter.api.Test; @@ -257,6 +258,34 @@ public void testMatchesWholeOwnTextToString() { assertEquals(":matchesWholeOwnText(example)", evaluator.toString()); } + @Test + public void testMatchesToStringRegex() { + Regex pattern = Regex.compile("example"); + Evaluator.Matches evaluator = new Evaluator.Matches(pattern); + assertEquals(":matches(example)", evaluator.toString()); + } + + @Test + public void testMatchesOwnToStringRegex() { + Regex pattern = 
Regex.compile("example"); + Evaluator.MatchesOwn evaluator = new Evaluator.MatchesOwn(pattern); + assertEquals(":matchesOwn(example)", evaluator.toString()); + } + + @Test + public void testMatchesWholeTextToStringRegex() { + Regex pattern = Regex.compile("example"); + Evaluator.MatchesWholeText evaluator = new Evaluator.MatchesWholeText(pattern); + assertEquals(":matchesWholeText(example)", evaluator.toString()); + } + + @Test + public void testMatchesWholeOwnTextToStringRegex() { + Regex pattern = Regex.compile("example"); + Evaluator.MatchesWholeOwnText evaluator = new Evaluator.MatchesWholeOwnText(pattern); + assertEquals(":matchesWholeOwnText(example)", evaluator.toString()); + } + @Test public void testMatchTextToString() { Evaluator.MatchText evaluator = new Evaluator.MatchText(); diff --git a/src/test/java/org/jsoup/select/SelectorIT.java b/src/test/java/org/jsoup/select/SelectorIT.java index 71f0d6f0b1..1b8f59269c 100644 --- a/src/test/java/org/jsoup/select/SelectorIT.java +++ b/src/test/java/org/jsoup/select/SelectorIT.java @@ -57,40 +57,4 @@ public void uncaughtException(Thread t, Throwable e) { exceptionCount.incrementAndGet(); } } - - @Test public void streamParserSelect() throws Exception { - // https://github.com/jhy/jsoup/issues/2277 - // The memo in the StructuralEvaluator was not getting reset correctly, and so would run out of memory - // Test tracks memory consumption. Will be interesting to see how it behaves on the CI workers. 
- - String xml = "1"; - Evaluator query = QueryParser.parse("A B C"); - Runtime runtime = Runtime.getRuntime(); - - System.gc(); - Thread.sleep(100); - long initialUsed = runtime.totalMemory() - runtime.freeMemory(); - - for (int i = 0; i < 50_000; i++) { // Before fix, would exceed 10MB in ~ 9000 iters - try (StreamParser parser = new StreamParser(Parser.xmlParser())) { - parser.parse(xml, ""); - parser.selectFirst(query); - parser.stop(); - } - - if (i % 1000 == 0) { - System.gc(); - Thread.sleep(100); - long currentUsed = runtime.totalMemory() - runtime.freeMemory(); - long delta = currentUsed - initialUsed; - - // Fail if we grow + 10MB - if (delta > 10_000_000) { - fail(String.format("Memo leak detected. Memory increased by %,d bytes after %,d iterations", - delta, i)); - } - } - } - } - } diff --git a/src/test/java/org/jsoup/select/SelectorTest.java b/src/test/java/org/jsoup/select/SelectorTest.java index 6207a6cc57..8fe7d00eaa 100644 --- a/src/test/java/org/jsoup/select/SelectorTest.java +++ b/src/test/java/org/jsoup/select/SelectorTest.java @@ -10,10 +10,12 @@ import org.jsoup.nodes.TextNode; import org.jsoup.parser.Parser; import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; -import java.util.IdentityHashMap; import java.util.List; import java.util.Locale; +import java.util.Map; import java.util.stream.Collectors; import static org.jsoup.select.EvaluatorDebug.sexpr; @@ -896,14 +898,12 @@ public void selectClassWithSpace() { Document doc = Jsoup.parse(html); Elements found = doc.select("div[class=value ]"); - assertEquals(2, found.size()); - assertEquals("class without space", found.get(0).text()); - assertEquals("class with space", found.get(1).text()); + assertEquals(1, found.size()); + assertEquals("class with space", found.get(0).text()); found = doc.select("div[class=\"value \"]"); - assertEquals(2, found.size()); - assertEquals("class without space", 
found.get(0).text()); - assertEquals("class with space", found.get(1).text()); + assertEquals(1, found.size()); + assertEquals("class with space", found.get(0).text()); found = doc.select("div[class=\"value\\ \"]"); assertEquals(0, found.size()); @@ -1194,7 +1194,7 @@ public void wildcardNamespaceMatchesNoNamespace() { Evaluator eval = QueryParser.parse("p ~ p"); CombiningEvaluator.And andEval = (CombiningEvaluator.And) eval; StructuralEvaluator.PreviousSibling prevEval = (StructuralEvaluator.PreviousSibling) andEval.evaluators.get(0); - IdentityHashMap> map = prevEval.threadMemo.get(); + Map> map = prevEval.threadMemo.get(); assertEquals(0, map.size()); // no memo yet Document doc1 = Jsoup.parse("

One

Two

Three"); @@ -1207,7 +1207,7 @@ public void wildcardNamespaceMatchesNoNamespace() { assertEquals(2, s2.size()); assertEquals("Two2", s2.first().text()); - assertEquals(1, map.size()); // root of doc 2 + assertEquals(0, map.size()); // reset after collect } @Test public void blankTextNodesAreConsideredEmpty() { @@ -1729,4 +1729,68 @@ public void testAncestorChain() { ); } + @Test void attributeSelectorQuotedWhitespace() { + // https://github.com/jhy/jsoup/issues/2380 + Document doc = Jsoup.parse( + "

" + + "
" + + "
" + ); + + // match: literal compare (no trimming) + assertSelectedIds(doc.select("div[data=\"foobar\"]"), "1"); + assertSelectedIds(doc.select("div[data=\" foobar \"]"), "2"); + + // prefix + assertSelectedIds(doc.select("div[data^=\"foo\"]"), "1"); + assertSelectedIds(doc.select("div[data^=\" foo\"]"), "2"); + + // suffix + assertSelectedIds(doc.select("div[data$=\"bar\"]"), "1"); + assertSelectedIds(doc.select("div[data$=\"bar \"]"), "2"); + + // contains + assertSelectedIds(doc.select("div[data*=\"foobar\"]"), "1", "2", "3"); + assertSelectedIds(doc.select("div[data*=\" foobar \"]"), "2"); + } + + @Test void canSelectBlankAttribute() { + Document doc = Jsoup.parse( + "
" + + "
" + + "
" + ); + + assertSelectedIds(doc.select("div[data]"), "1", "2", "3"); + assertSelectedIds(doc.select("div[data='']"), "1", "2"); + assertSelectedIds(doc.select("div[data=]"), "1", "2"); + + assertSelectedIds(doc.select("div[data^='']"), "1", "2", "3"); + assertSelectedIds(doc.select("div[data$='']"), "1", "2", "3"); + assertSelectedIds(doc.select("div[data*='']"), "1", "2", "3"); + } + + @ParameterizedTest + @ValueSource(strings = {"[abs:!=]", "[ abs:^=]"}) + void parseExceptionOnEmptyAbsKey(String query) { + Selector.SelectorParseException ex = assertThrows( + Selector.SelectorParseException.class, + () -> Selector.evaluatorOf(query) + ); + assertEquals("Absolute attribute key must have a name", ex.getMessage()); + } + + @Test void parseExceptionOnEmptyKeyVal() { + // was previously firing at match time, not eval time + String q = "[\"=\"]"; + boolean threw = false; + try { + Evaluator e = Selector.evaluatorOf(q); + } catch (Selector.SelectorParseException ex) { + threw = true; + assertEquals("Quoted value must have content", ex.getMessage()); + } + assertTrue(threw); + } + } diff --git a/src/test/java/org/jsoup/select/StructuralEvaluatorTest.java b/src/test/java/org/jsoup/select/StructuralEvaluatorTest.java new file mode 100644 index 0000000000..87c50540cb --- /dev/null +++ b/src/test/java/org/jsoup/select/StructuralEvaluatorTest.java @@ -0,0 +1,104 @@ +package org.jsoup.select; + +import org.jsoup.Jsoup; +import org.jsoup.nodes.Document; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Stream; + +import static org.junit.jupiter.api.Assertions.*; + +class StructuralEvaluatorTest { + private static final String Html = + "
"; + + @ParameterizedTest + @MethodSource("selectorMemoData") + void selectorMemoIsClearedOnReset(String selector, boolean expectMemos) { + // test that the structural evaluator memos are used, and are reset + + Document doc = Jsoup.parse(Html); + Evaluator evaluator = Selector.evaluatorOf(selector); + + // collect all StructuralEvaluator instances from the parsed evaluator tree + List structuralEvals = new ArrayList<>(); + collectEvals(evaluator, structuralEvals); + + // use Collector.stream vs Selector.select(), as the later is able to reset after executing + Collector.stream(evaluator, doc).count(); // consume stream to populate memos + assertFalse(structuralEvals.isEmpty()); + + boolean hadMemos = false; + for (StructuralEvaluator se : structuralEvals) { + if (!se.threadMemo.get().isEmpty()) { + hadMemos = true; + break; + } + } + + evaluator.reset(); + + // verify all structural evaluator thread-local maps are cleared + for (StructuralEvaluator se : structuralEvals) { + assertTrue(se.threadMemo.get().isEmpty()); + } + + assertEquals(expectMemos, hadMemos); + } + + private static Stream selectorMemoData() { + return Stream.of( + Arguments.of("div:not(.b)", true), // Not (uses memoMatches) + Arguments.of("div p", true), // Ancestor (ancestor chain checks) + Arguments.of("span ~ a", true), // PreviousSibling + Arguments.of("span + a", true), // ImmediatePreviousSibling + Arguments.of("div > span > a", false), // ImmediateParentRun does not use memoMatches + Arguments.of("div:has(p)", false) // Has (coverage; does not use memo for these inputs) + ); + } + + private static void collectEvals(Evaluator evaluator, List out) { + // recursive traversal of evaluator trees to find StructuralEvaluator instances + if (evaluator instanceof CombiningEvaluator) { + CombiningEvaluator ce = (CombiningEvaluator) evaluator; + for (Evaluator inner : ce.evaluators) { + collectEvals(inner, out); + } + return; + } + + if (evaluator instanceof StructuralEvaluator.ImmediateParentRun) { 
+ StructuralEvaluator.ImmediateParentRun run = (StructuralEvaluator.ImmediateParentRun) evaluator; + out.add(run); + for (Evaluator inner : run.evaluators) { + collectEvals(inner, out); + } + return; + } + + if (evaluator instanceof StructuralEvaluator) { + StructuralEvaluator se = (StructuralEvaluator) evaluator; + out.add(se); + collectEvals(se.evaluator, out); + } + + } +} diff --git a/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java b/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java index 475c67888d..07a853b280 100644 --- a/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java +++ b/src/test/java11/org/jsoup/helper/HttpClientExecutorTest.java @@ -2,17 +2,19 @@ import org.jsoup.internal.SharedConstants; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertInstanceOf; +import java.io.IOException; +import java.net.*; +import java.util.Collections; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.*; public class HttpClientExecutorTest { @Test void getsHttpClient() { try { enableHttpClient(); - RequestExecutor executor = RequestDispatch.get(null, null); - //assertInstanceOf(HttpClientExecutor.class, executor); - assertEquals("org.jsoup.helper.HttpClientExecutor", executor.getClass().getName()); - // Haven't figured out how to get Maven to allow this mjar code to be on the classpath for the surefire tests, hence not instanceof + RequestExecutor executor = RequestDispatch.get(new HttpConnection.Request(), null); + assertInstanceOf(HttpClientExecutor.class, executor); } finally { disableHttpClient(); // reset to previous default for JDK8 compat tests } @@ -20,8 +22,8 @@ public class HttpClientExecutorTest { @Test void getsHttpUrlConnectionByDefault() { System.clearProperty(SharedConstants.UseHttpClient); - RequestExecutor executor = RequestDispatch.get(null, null); - assertEquals("org.jsoup.helper.HttpClientExecutor", 
executor.getClass().getName()); + RequestExecutor executor = RequestDispatch.get(new HttpConnection.Request(), null); + assertInstanceOf(HttpClientExecutor.class, executor); } public static void enableHttpClient() { @@ -31,4 +33,86 @@ public static void enableHttpClient() { public static void disableHttpClient() { System.setProperty(SharedConstants.UseHttpClient, "false"); } + + @Test void proxyWrapUsesSystemDefaultProxySelector() { + ProxySelector originalSelector = ProxySelector.getDefault(); + InetSocketAddress defaultProxy = new InetSocketAddress("system.proxy", 8080); + + try { + ProxySelector.setDefault(new ProxySelector() { + @Override + public List select(URI uri) { + return Collections.singletonList( + new Proxy(Proxy.Type.HTTP, defaultProxy) + ); + } + + @Override + public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {} + }); + + HttpClientExecutor.ProxyWrap wrap = new HttpClientExecutor.ProxyWrap(); + List proxies = wrap.select(URI.create("http://example.com")); + + assertEquals(1, proxies.size()); + assertSame(defaultProxy, proxies.get(0).address()); + } finally { + ProxySelector.setDefault(originalSelector); + } + } + + @Test void proxyWrapConnectFailedOnlyForSystemProxy() { + HttpClientExecutor.ProxyWrap wrap = new HttpClientExecutor.ProxyWrap(); + HttpClientExecutor.perRequestProxy.set(new Proxy(Proxy.Type.HTTP, new InetSocketAddress("custom", 9090))); + wrap.connectFailed(URI.create("http://example.com"), + new InetSocketAddress("custom", 9090), + new IOException("test")); + HttpClientExecutor.perRequestProxy.remove(); + } + + @Test + void perRequestProxyOverridesSystemDefault() { + ProxySelector original = ProxySelector.getDefault(); + InetSocketAddress sysProxy = new InetSocketAddress("system.proxy", 8080); + InetSocketAddress perReqProxy = new InetSocketAddress("per.request", 9999); + try { + ProxySelector.setDefault(new ProxySelector() { + @Override + public List select(URI uri) { + return Collections.singletonList( + new 
Proxy(Proxy.Type.HTTP, sysProxy)); + } + @Override + public void connectFailed(URI uri, SocketAddress sa, IOException ioe) {} + }); + + HttpClientExecutor.perRequestProxy.set( + new Proxy(Proxy.Type.HTTP, perReqProxy)); + + HttpClientExecutor.ProxyWrap wrap = new HttpClientExecutor.ProxyWrap(); + List proxies = wrap.select(URI.create("http://example.com")); + assertSame(perReqProxy, proxies.get(0).address()); + } finally { + HttpClientExecutor.perRequestProxy.remove(); + ProxySelector.setDefault(original); + } + } + + @Test void connectFailedDelegatesToSystemDefault() { + ProxySelector original = ProxySelector.getDefault(); + final boolean[] called = {false}; + try { + ProxySelector.setDefault(new ProxySelector() { + @Override + public List select(URI uri) { return Collections.singletonList(Proxy.NO_PROXY); } + @Override + public void connectFailed(URI uri, SocketAddress sa, IOException ioe) { called[0] = true; } + }); + new HttpClientExecutor.ProxyWrap() + .connectFailed(URI.create("http://example.com"), new InetSocketAddress("x", 80), new IOException("x")); + assertTrue(called[0]); + } finally { + ProxySelector.setDefault(original); + } + } } diff --git a/src/test/resources/fuzztests/2374.html.gz b/src/test/resources/fuzztests/2374.html.gz new file mode 100644 index 0000000000..1541e0ef07 Binary files /dev/null and b/src/test/resources/fuzztests/2374.html.gz differ diff --git a/src/test/resources/fuzztests/2393.html.gz b/src/test/resources/fuzztests/2393.html.gz new file mode 100644 index 0000000000..02213d2950 Binary files /dev/null and b/src/test/resources/fuzztests/2393.html.gz differ diff --git a/src/test/resources/fuzztests/2397.html.gz b/src/test/resources/fuzztests/2397.html.gz new file mode 100644 index 0000000000..81900aa368 Binary files /dev/null and b/src/test/resources/fuzztests/2397.html.gz differ diff --git a/src/test/resources/fuzztests/48116.html.gz b/src/test/resources/fuzztests/48116.html.gz index 37367dc8cc..748c5efddd 100644 Binary files 
a/src/test/resources/fuzztests/48116.html.gz and b/src/test/resources/fuzztests/48116.html.gz differ diff --git a/src/test/resources/fuzztests/9056.html.gz b/src/test/resources/fuzztests/9056.html.gz new file mode 100644 index 0000000000..21c10af80d Binary files /dev/null and b/src/test/resources/fuzztests/9056.html.gz differ diff --git a/src/test/resources/fuzztests/as-replace.html.gz b/src/test/resources/fuzztests/as-replace.html.gz new file mode 100644 index 0000000000..5770cd6834 Binary files /dev/null and b/src/test/resources/fuzztests/as-replace.html.gz differ diff --git a/src/test/resources/fuzztests/ex-inselect16.html b/src/test/resources/fuzztests/ex-inselect16.html new file mode 100644 index 0000000000..75cefea18d --- /dev/null +++ b/src/test/resources/fuzztests/ex-inselect16.html @@ -0,0 +1 @@ +
" + + "
" + + "

One

" + + "

Two

" + + "
" + + "
" + + "
" + + "
" + + "

Target

" + + "
" + + "
" + + "