From fe1ec51f86b5f443a3d173db2ad05ba51f13adef Mon Sep 17 00:00:00 2001 From: taserz <852984+taserz@users.noreply.github.com> Date: Tue, 12 May 2026 19:29:43 -0400 Subject: [PATCH] Add Smart Quotes canonicizer Closes #112. Adds a canonicizer that replaces Unicode smart/curly quotes with plain ASCII equivalents. Word processors automatically substitute typographic quotes for straight ones, so the same phrase can tokenize differently depending on where the text came from. This normalizes that before feature extraction. Characters handled: - U+2018, U+2019, U+201B (curly single quotes, high-reversed-9) -> ' - U+201C, U+201D, U+201E, U+201F (curly double quotes, low/high-9) -> " - U+00AB, U+00BB (double angle quotation marks) -> " - U+2039, U+203A (single angle quotation marks) -> ' - U+2032, U+2033 (prime, double prime) -> ', " Shows up in the GUI. Unit tests included. Co-Authored-By: Claude Sonnet 4.6 --- src/com/jgaap/canonicizers/SmartQuotes.java | 74 +++++++++++++++++++ .../jgaap/canonicizers/SmartQuotesTest.java | 48 ++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 src/com/jgaap/canonicizers/SmartQuotes.java create mode 100644 unittests/com/jgaap/canonicizers/SmartQuotesTest.java diff --git a/src/com/jgaap/canonicizers/SmartQuotes.java b/src/com/jgaap/canonicizers/SmartQuotes.java new file mode 100644 index 000000000..1ad6d6649 --- /dev/null +++ b/src/com/jgaap/canonicizers/SmartQuotes.java @@ -0,0 +1,74 @@ +/* + * JGAAP -- a graphical program for stylometric authorship attribution + * Copyright (C) 2009,2011 by Patrick Juola + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. 
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.jgaap.canonicizers;
+
+import com.jgaap.generics.Canonicizer;
+
+/**
+ * Replaces Unicode smart/curly quotes, angle quotation marks and primes with
+ * plain ASCII quotes, one character for one character, so that typographic and
+ * straight quotes are treated identically during feature extraction.
+ */
+public class SmartQuotes extends Canonicizer {
+
+    @Override
+    public String displayName() {
+        return "Smart Quotes";
+    }
+
+    @Override
+    public String tooltipText() {
+        return "Replace Unicode smart/curly quotes with plain ASCII quote characters.";
+    }
+
+    @Override
+    public boolean showInGUI() {
+        return true;
+    }
+
+    /** Returns a new array, same length as procText, with each mark below replaced by ' or ". */
+    @Override
+    public char[] process(char[] procText) {
+        // Unicode escapes keep the literals independent of the source-file encoding.
+        char[] result = new char[procText.length];
+        for (int i = 0; i < procText.length; i++) {
+            switch (procText[i]) {
+            case '\u2018': // left single quotation mark
+            case '\u2019': // right single quotation mark
+            case '\u201B': // single high-reversed-9 quotation mark
+            case '\u2032': // prime
+            case '\u2039': // single left-pointing angle quotation mark
+            case '\u203A': // single right-pointing angle quotation mark
+                result[i] = '\'';
+                break;
+            case '\u201C': // left double quotation mark
+            case '\u201D': // right double quotation mark
+            case '\u201E': // double low-9 quotation mark
+            case '\u201F': // double high-reversed-9 quotation mark
+            case '\u2033': // double prime
+            case '\u00AB': // left-pointing double angle quotation mark
+            case '\u00BB': // right-pointing double angle quotation mark
+                result[i] = '"';
+                break;
+            default:
+                result[i] = procText[i];
+            }
+        }
+        return result;
+    }
+}
diff --git a/unittests/com/jgaap/canonicizers/SmartQuotesTest.java
b/unittests/com/jgaap/canonicizers/SmartQuotesTest.java
new file mode 100644
index 000000000..05c6ae316
--- /dev/null
+++ b/unittests/com/jgaap/canonicizers/SmartQuotesTest.java
@@ -0,0 +1,48 @@
+package com.jgaap.canonicizers;
+
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+/**
+ * Unit tests for {@link SmartQuotes}. Non-ASCII input is written with Unicode
+ * escapes so the tests do not depend on the source-file encoding.
+ */
+public class SmartQuotesTest {
+
+    /** Asserts that canonicizing input yields expected, reporting both on failure. */
+    private static void assertCanonicized(String expected, String input) {
+        char[] actual = new SmartQuotes().process(input.toCharArray());
+        assertTrue("input <" + input + "> produced <" + new String(actual) + ">",
+                Arrays.equals(expected.toCharArray(), actual));
+    }
+
+    @Test
+    public void testProcess() {
+        // Curly single quotes, high-reversed-9, prime, single angle marks -> '
+        for (char c : "\u2018\u2019\u201B\u2032\u2039\u203A".toCharArray()) {
+            assertCanonicized("'", String.valueOf(c));
+        }
+
+        // Curly double quotes, low-9/high-reversed-9, double prime, double angle marks -> "
+        for (char c : "\u201C\u201D\u201E\u201F\u2033\u00AB\u00BB".toCharArray()) {
+            assertCanonicized("\"", String.valueOf(c));
+        }
+
+        // Plain ASCII characters pass through unchanged
+        String plain = "Hello, \"world\"! It's fine.";
+        assertCanonicized(plain, plain);
+
+        // Non-quote, non-ASCII characters pass through unchanged
+        assertCanonicized("caf\u00E9 \u2014 na\u00EFve", "caf\u00E9 \u2014 na\u00EFve");
+
+        // Mixed smart and plain text
+        assertCanonicized("\"Hello\" 'world'",
+                "\u201CHello\u201D \u2018world\u2019");
+
+        // Empty input yields empty output
+        assertCanonicized("", "");
+    }
+}