diff --git a/src/com/jgaap/canonicizers/SmartQuotes.java b/src/com/jgaap/canonicizers/SmartQuotes.java
new file mode 100644
index 000000000..1ad6d6649
--- /dev/null
+++ b/src/com/jgaap/canonicizers/SmartQuotes.java
@@ -0,0 +1,89 @@
+/*
+ * JGAAP -- a graphical program for stylometric authorship attribution
+ * Copyright (C) 2009,2011 by Patrick Juola
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as
+ * published by the Free Software Foundation, either version 3 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+package com.jgaap.canonicizers;
+
+import com.jgaap.generics.Canonicizer;
+
+/**
+ * Replaces Unicode smart/curly quotes with plain ASCII equivalents so that
+ * typographic and straight quotes are treated identically during feature
+ * extraction.
+ *
+ * <p>Single-quote-like characters become {@code '} and double-quote-like
+ * characters become {@code "}. Each input character yields exactly one output
+ * character, so the result always has the same length as the input.
+ */
+public class SmartQuotes extends Canonicizer {
+
+    @Override
+    public String displayName() {
+        return "Smart Quotes";
+    }
+
+    @Override
+    public String tooltipText() {
+        return "Replace Unicode smart/curly quotes with plain ASCII quote characters.";
+    }
+
+    @Override
+    public boolean showInGUI() {
+        return true;
+    }
+
+    /**
+     * Returns a copy of the text with every recognized typographic quote
+     * replaced by its ASCII counterpart; all other characters are copied as-is.
+     *
+     * @param procText the characters to canonicize; not modified
+     * @return a new array of the same length as {@code procText}
+     */
+    @Override
+    public char[] process(char[] procText) {
+        // The mapping is strictly one-to-one, so the output can be sized up
+        // front and filled directly without an intermediate StringBuilder.
+        char[] canonical = new char[procText.length];
+        for (int i = 0; i < procText.length; i++) {
+            canonical[i] = toAsciiQuote(procText[i]);
+        }
+        return canonical;
+    }
+
+    /** Maps one character to its ASCII quote, or returns it unchanged. */
+    private static char toAsciiQuote(char c) {
+        switch (c) {
+            case '\u2018': // U+2018 left single quotation mark
+            case '\u2019': // U+2019 right single quotation mark
+            case '\u201A': // U+201A single low-9 quotation mark
+            case '\u201B': // U+201B single high-reversed-9 quotation mark
+            case '\u2032': // U+2032 prime
+            case '\u2039': // U+2039 single left-pointing angle quotation mark
+            case '\u203A': // U+203A single right-pointing angle quotation mark
+                return '\'';
+            case '\u201C': // U+201C left double quotation mark
+            case '\u201D': // U+201D right double quotation mark
+            case '\u201E': // U+201E double low-9 quotation mark
+            case '\u201F': // U+201F double high-reversed-9 quotation mark
+            case '\u2033': // U+2033 double prime
+            case '\u00AB': // U+00AB left-pointing double angle quotation mark
+            case '\u00BB': // U+00BB right-pointing double angle quotation mark
+                return '"';
+            default:
+                return c;
+        }
+    }
+}
diff --git a/unittests/com/jgaap/canonicizers/SmartQuotesTest.java b/unittests/com/jgaap/canonicizers/SmartQuotesTest.java
new file mode 100644
index 000000000..05c6ae316
--- /dev/null
+++ b/unittests/com/jgaap/canonicizers/SmartQuotesTest.java
@@ -0,0 +1,50 @@
+package com.jgaap.canonicizers;
+
+import static org.junit.Assert.*;
+
+import java.util.Arrays;
+
+import org.junit.Test;
+
+/** Unit tests for {@link SmartQuotes}. */
+public class SmartQuotesTest {
+
+    @Test
+    public void testProcess() {
+        // Single-quote-like characters -> '
+        assertCanonicized("'", "\u2018"); // left single
+        assertCanonicized("'", "\u2019"); // right single
+        assertCanonicized("'", "\u201A"); // single low-9
+        assertCanonicized("'", "\u201B"); // single high-reversed-9
+        assertCanonicized("'", "\u2032"); // prime
+        assertCanonicized("'", "\u2039"); // single left angle
+        assertCanonicized("'", "\u203A"); // single right angle
+
+        // Double-quote-like characters -> "
+        assertCanonicized("\"", "\u201C"); // left double
+        assertCanonicized("\"", "\u201D"); // right double
+        assertCanonicized("\"", "\u201E"); // double low-9
+        assertCanonicized("\"", "\u201F"); // double high-reversed-9
+        assertCanonicized("\"", "\u2033"); // double prime
+        assertCanonicized("\"", "\u00AB"); // left double angle
+        assertCanonicized("\"", "\u00BB"); // right double angle
+
+        // Plain ASCII characters pass through unchanged
+        String plain = "Hello, \"world\"! It's fine.";
+        assertCanonicized(plain, plain);
+
+        // Empty input yields empty output
+        assertCanonicized("", "");
+
+        // Mixed smart and plain text
+        assertCanonicized("\"Hello\" 'world'", "\u201CHello\u201D \u2018world\u2019");
+    }
+
+    /** Asserts that canonicizing {@code input} yields exactly {@code expected}. */
+    private static void assertCanonicized(String expected, String input) {
+        char[] source = input.toCharArray();
+        char[] actual = new SmartQuotes().process(source);
+        assertArrayEquals("canonicizing " + Arrays.toString(source),
+                expected.toCharArray(), actual);
+    }
+}