Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
127 changes: 124 additions & 3 deletions src/main/java/org/codelibs/nekohtml/sax/SimpleHTMLScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.codelibs.nekohtml.HTMLEntities;
import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.DTDHandler;
Expand Down Expand Up @@ -353,11 +354,12 @@ protected void parseHTML(final String html) throws SAXException {
// Text content
final int nextTag = html.indexOf('<', pos);
final int endPos = nextTag >= 0 ? nextTag : length;
final String text = html.substring(pos, endPos);
final String rawText = html.substring(pos, endPos);

// Always emit text content, including whitespace
// This preserves spacing between elements for proper text extraction
if (text.length() > 0) {
if (rawText.length() > 0) {
final String text = resolveEntities(rawText);
fContentHandler.characters(text.toCharArray(), 0, text.length());
}

Expand Down Expand Up @@ -398,7 +400,7 @@ protected AttributesImpl parseAttributes(final String attrString) {
value = ""; // No value
}

attrs.addAttribute("", name, name, "CDATA", value);
attrs.addAttribute("", name, name, "CDATA", resolveEntities(value, true));
}

return attrs;
Expand Down Expand Up @@ -436,6 +438,125 @@ protected String normalizeAttributeName(final String name) {
return "upper".equals(fAttributeCase) ? name.toUpperCase() : "lower".equals(fAttributeCase) ? name.toLowerCase() : name;
}

// Matches a single HTML character reference beginning at '&'. Exactly one capture group is set per match:
//   group 1 - the digits of a decimal reference (&#214;)
//   group 2 - the hex digits of a hexadecimal reference (&#xD6; or &#XD6;)
//   group 3 - the name of a named reference (&Ouml;): an ASCII letter followed by ASCII letters/digits
// The trailing semicolon is optional so that common malformed HTML is still matched.
// The name part is greedy, so a semicolon-less name absorbs every following ASCII letter or digit.
private static final Pattern ENTITY_PATTERN = Pattern.compile("&(?:#([0-9]+)|#[xX]([0-9a-fA-F]+)|([a-zA-Z][a-zA-Z0-9]*));?");

/**
 * Decodes HTML character references found in text (non-attribute) content.
 * This is a convenience overload of {@link #resolveEntities(String, boolean)} that
 * passes {@code false} for the attribute flag, so the attribute-only rule that leaves
 * some semicolon-less named references untouched is never applied here.
 *
 * @param text The text that may contain character references
 * @return The result of {@code resolveEntities(text, false)}
 */
protected String resolveEntities(final String text) {
    // Text content is, by definition, not an attribute value.
    final boolean inAttribute = false;
    return resolveEntities(text, inAttribute);
}

/**
 * Resolves HTML character references in the given text.
 * Handles numeric decimal (&#214;), numeric hex (&#xD6;), and named (&Ouml;) references;
 * the terminating semicolon is optional for all three forms.
 *
 * <p>Numeric references are validated by {@code resolveCodePoint}; a numeric value too large
 * to fit in an {@code int} is treated like any other out-of-range code point and becomes U+FFFD.
 * Named references that {@link HTMLEntities#get(String)} does not know are copied through unchanged.
 *
 * <p>In attribute context, a semicolon-less named reference that is immediately followed by
 * {@code =} or an ASCII letter/digit is not decoded (HTML5 attribute value rule). This prevents
 * corruption of URLs such as {@code ?a=1&not=2} or {@code &copy=}.
 *
 * @param text The text containing character references; may be {@code null}
 * @param inAttribute Whether this text is an attribute value
 * @return The text with references resolved, or {@code text} itself when it is {@code null}
 *         or contains no {@code '&'}
 */
protected String resolveEntities(final String text, final boolean inAttribute) {
    // Fast path: nothing to decode.
    if (text == null || text.indexOf('&') < 0) {
        return text;
    }

    final Matcher m = ENTITY_PATTERN.matcher(text);
    final StringBuilder sb = new StringBuilder(text.length());
    int lastEnd = 0;

    while (m.find()) {
        // Copy the literal run between the previous reference and this one.
        sb.append(text, lastEnd, m.start());
        lastEnd = m.end();

        final String matched = m.group(0);
        if (m.group(1) != null) {
            // Numeric decimal: &#214;
            sb.append(decodeNumericReference(m.group(1), 10, matched));
        } else if (m.group(2) != null) {
            // Numeric hex: &#xD6;
            sb.append(decodeNumericReference(m.group(2), 16, matched));
        } else if (m.group(3) != null) {
            // Named entity: &Ouml;
            final boolean hasSemicolon = matched.endsWith(";");

            // HTML5 attribute value rule: without a semicolon, a following '=' or ASCII
            // alphanumeric means the '&' was most likely a query-string separator, so keep it raw.
            if (inAttribute && !hasSemicolon && lastEnd < text.length()
                    && isAttributeContinuation(text.charAt(lastEnd))) {
                sb.append(matched);
                continue;
            }

            final int c = HTMLEntities.get(m.group(3));
            if (c != -1) {
                sb.appendCodePoint(c);
            } else {
                // Unknown name: leave the source text as written.
                sb.append(matched);
            }
        }
    }

    // Copy whatever follows the last reference.
    sb.append(text, lastEnd, text.length());
    return sb.toString();
}

/**
 * Decodes the digit part of a numeric character reference.
 * The digits come from ENTITY_PATTERN and are therefore non-empty and valid for the radix,
 * so the only way parsing can fail is integer overflow; such a value is necessarily beyond
 * U+10FFFF and is replaced with U+FFFD.
 */
private static String decodeNumericReference(final String digits, final int radix, final String original) {
    try {
        return resolveCodePoint(Integer.parseInt(digits, radix), original);
    } catch (final NumberFormatException overflow) {
        return "\uFFFD";
    }
}

/**
 * Tells whether the character following a semicolon-less named reference in an attribute
 * value blocks decoding: '=' or an ASCII letter/digit.
 */
private static boolean isAttributeContinuation(final char next) {
    return next == '='
            || (next >= '0' && next <= '9')
            || (next >= 'A' && next <= 'Z')
            || (next >= 'a' && next <= 'z');
}

/**
 * Turns a numeric character reference value into the string it stands for.
 * Any value that must not appear in the output is mapped to the replacement character U+FFFD:
 * everything below 0x20 other than tab, line feed and carriage return (this covers the null
 * character and negative values), the surrogate range, values above U+10FFFF, and the
 * Unicode noncharacters (U+FDD0..U+FDEF and every code point ending in FFFE or FFFF).
 *
 * @param codePoint The parsed numeric value of the reference
 * @param original The source text of the reference (currently unused)
 * @return The character(s) for {@code codePoint}, or U+FFFD when the value is rejected
 */
private static String resolveCodePoint(final int codePoint, final String original) {
    final String replacement = "\uFFFD";

    // Tab, LF and CR are the only characters below 0x20 that are let through.
    final boolean permittedControl = codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD;
    final boolean belowSpace = codePoint < 0x20 && !permittedControl;
    final boolean surrogate = codePoint >= 0xD800 && codePoint <= 0xDFFF;
    final boolean beyondUnicode = codePoint > 0x10FFFF;
    // Noncharacters: the contiguous FDD0..FDEF block plus the last two code points of every plane.
    final boolean nonCharacter = (codePoint >= 0xFDD0 && codePoint <= 0xFDEF) || (codePoint & 0xFFFE) == 0xFFFE;

    if (belowSpace || surrogate || beyondUnicode || nonCharacter) {
        return replacement;
    }
    return String.valueOf(Character.toChars(codePoint));
}

@Override
public boolean getFeature(final String name) throws SAXNotRecognizedException, SAXNotSupportedException {
throw new SAXNotRecognizedException("Feature not recognized: " + name);
Expand Down
Loading
Loading