Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions src/com/jgaap/canonicizers/SmartQuotes.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
/*
* JGAAP -- a graphical program for stylometric authorship attribution
* Copyright (C) 2009,2011 by Patrick Juola
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package com.jgaap.canonicizers;

import com.jgaap.generics.Canonicizer;

/**
 * Canonicizer that replaces Unicode typographic ("smart"/curly) quotation
 * marks, angle quotation marks and prime marks with their plain ASCII
 * equivalents, so that typographic and straight quotes are treated
 * identically during feature extraction.
 *
 * <p>Every mapping is one character to one character, so the processed text
 * always has the same length as the input.
 *
 * <p>All non-ASCII characters are written as Unicode escapes so that this
 * source file compiles identically regardless of the encoding the compiler
 * assumes for source files.
 */
public class SmartQuotes extends Canonicizer {

    @Override
    public String displayName() {
        return "Smart Quotes";
    }

    @Override
    public String tooltipText() {
        return "Replace Unicode smart/curly quotes with plain ASCII quote characters.";
    }

    @Override
    public boolean showInGUI() {
        return true;
    }

    /**
     * Replaces every typographic quote in the given text with an ASCII quote.
     *
     * @param procText the characters to canonicize; not modified
     * @return a new array of the same length as {@code procText} in which
     *         each single-style quote has become {@code '} and each
     *         double-style quote has become {@code "}; all other characters
     *         are copied unchanged
     */
    @Override
    public char[] process(char[] procText) {
        // The mapping is strictly 1:1, so the result can be sized up front
        // and filled in place without an intermediate builder or String.
        char[] result = new char[procText.length];
        for (int i = 0; i < procText.length; i++) {
            result[i] = toAsciiQuote(procText[i]);
        }
        return result;
    }

    /**
     * Maps a single character to its ASCII quote equivalent.
     *
     * @param c the character to examine
     * @return {@code '} for single-style quotes, {@code "} for double-style
     *         quotes, otherwise {@code c} itself
     */
    private static char toAsciiQuote(char c) {
        switch (c) {
        case '\u2018': // U+2018 left single quotation mark
        case '\u2019': // U+2019 right single quotation mark
        case '\u201A': // U+201A single low-9 quotation mark
        case '\u201B': // U+201B single high-reversed-9 quotation mark
        case '\u2032': // U+2032 prime
        case '\u2039': // U+2039 single left-pointing angle quotation mark
        case '\u203A': // U+203A single right-pointing angle quotation mark
            return '\'';
        case '\u201C': // U+201C left double quotation mark
        case '\u201D': // U+201D right double quotation mark
        case '\u201E': // U+201E double low-9 quotation mark
        case '\u201F': // U+201F double high-reversed-9 quotation mark
        case '\u2033': // U+2033 double prime
        case '\u00AB': // U+00AB left-pointing double angle quotation mark
        case '\u00BB': // U+00BB right-pointing double angle quotation mark
            return '"';
        default:
            return c;
        }
    }
}
48 changes: 48 additions & 0 deletions unittests/com/jgaap/canonicizers/SmartQuotesTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
package com.jgaap.canonicizers;

import static org.junit.Assert.*;

import java.util.Arrays;

import org.junit.Test;

/**
 * Unit tests for {@link SmartQuotes}.
 *
 * <p>Non-ASCII input characters are written as Unicode escapes so the test
 * does not depend on the encoding the compiler assumes for source files.
 */
public class SmartQuotesTest {

    @Test
    public void testProcess() {
        SmartQuotes canon = new SmartQuotes();

        // Single-style marks -> '
        assertMapsTo(canon, '\'', '\u2018'); // left single quotation mark
        assertMapsTo(canon, '\'', '\u2019'); // right single quotation mark
        assertMapsTo(canon, '\'', '\u201B'); // single high-reversed-9 quotation mark
        assertMapsTo(canon, '\'', '\u2032'); // prime
        assertMapsTo(canon, '\'', '\u2039'); // single left-pointing angle quotation mark
        assertMapsTo(canon, '\'', '\u203A'); // single right-pointing angle quotation mark

        // Double-style marks -> "
        assertMapsTo(canon, '"', '\u201C'); // left double quotation mark
        assertMapsTo(canon, '"', '\u201D'); // right double quotation mark
        assertMapsTo(canon, '"', '\u201E'); // double low-9 quotation mark
        assertMapsTo(canon, '"', '\u201F'); // double high-reversed-9 quotation mark
        assertMapsTo(canon, '"', '\u2033'); // double prime
        assertMapsTo(canon, '"', '\u00AB'); // left-pointing double angle quotation mark
        assertMapsTo(canon, '"', '\u00BB'); // right-pointing double angle quotation mark

        // Empty input yields empty output
        assertArrayEquals(new char[0], canon.process(new char[0]));

        // Plain ASCII characters pass through unchanged
        String plain = "Hello, \"world\"! It's fine.";
        assertArrayEquals(plain.toCharArray(), canon.process(plain.toCharArray()));

        // Mixed smart and plain text
        String input = "\u201CHello\u201D \u2018world\u2019";
        String expected = "\"Hello\" 'world'";
        assertArrayEquals(expected.toCharArray(), canon.process(input.toCharArray()));
    }

    /**
     * Asserts that canonicizing the one-character text {@code input} yields
     * the one-character text {@code expected}.
     */
    private static void assertMapsTo(SmartQuotes canon, char expected, char input) {
        assertArrayEquals(new char[] {expected}, canon.process(new char[] {input}));
    }
}