From e47759f084e356ccf2b8442fb8fc887831814fa4 Mon Sep 17 00:00:00 2001 From: Maximilian Rudolph Date: Mon, 13 Oct 2014 15:18:54 +0200 Subject: [PATCH 1/2] changed datacruncher (and depended classes) to support processing without lowercasing --- src/iitb/Segment/DataCruncher.java | 88 +++++++++++++++---- src/iitb/Segment/Segment.java | 17 ++-- .../Segment/DataCruncherGetTokenListTest.java | 11 +++ .../DataCruncherReadRowFixedColTest.java | 50 ++++++++++- .../DataCruncherReadRowVarColTest.java | 50 ++++++++++- .../Segment/DataCruncherReadTaggedTest.java | 23 +++++ 6 files changed, 212 insertions(+), 27 deletions(-) diff --git a/src/iitb/Segment/DataCruncher.java b/src/iitb/Segment/DataCruncher.java index 8b86265..af3140d 100644 --- a/src/iitb/Segment/DataCruncher.java +++ b/src/iitb/Segment/DataCruncher.java @@ -140,8 +140,11 @@ class TestData { String seq[]; String fname; String delimit, impDelimit; - TestData(String file,String delimitP,String impDelimitP, String grpDelimit) { - try { + final boolean lowerCase; + TestData(String file, String delimitP, String impDelimitP, String grpDelimit, + final boolean lowerCase) { + this.lowerCase = lowerCase; + try { fname = file; rin =new BufferedReader(new FileReader(file+".raw")); delimit = delimitP; @@ -170,7 +173,8 @@ int[] groupedTokens() { String[] nextRecord() { try { if ((line=rin.readLine())!=null) { - StringTokenizer tok=new StringTokenizer(line.toLowerCase(),delimit,true); + final String correctedLine = lowerCase ? line.toLowerCase() : line; + StringTokenizer tok=new StringTokenizer(correctedLine,delimit,true); int len = tok.countTokens(); if ((seq == null) || (seq.length < len)) seq =new String[len]; @@ -298,16 +302,29 @@ void close() { public class DataCruncher { + /** + * This is the old interface to keep compatibility + * @param text + * @param delimit A set of delimiters used by the Tokenizer. + * @param impDelimit Delimiters to be retained for tagging. + * @return an Array of tokens. 
+ */ + protected static String[] getTokenList(String text, String delimit, + String impDelimit) { + return getTokenList(text,delimit,impDelimit,true); + } + /** * * @param text * @param delimit A set of delimiters used by the Tokenizer. - * @param impDelimit Delimiters to be retained for tagging. + * @param impDelimit Delimiters to be retained for tagging. + * @param lowerCase convert tokens to lower case * @return an Array of tokens. */ protected static String[] getTokenList(String text, String delimit, - String impDelimit) { - text = text.toLowerCase(); + String impDelimit,boolean lowerCase) { + text = lowerCase ? text.toLowerCase() : text; StringTokenizer textTok = new StringTokenizer(text, delimit, true); //This allocates space for all tokens and delimiters, //but will make a second pass through the String unnecessary. @@ -322,7 +339,27 @@ protected static String[] getTokenList(String text, String delimit, //Finally, the storage is trimmed to the actual size. return tokenList.toArray(new String[tokenList.size()]); } - + + /** + * Reads a block of text ended by a blank line or the end of the file. + * The block contains lines of tokens with a label. + * + * NOTE: This is the old interface which always lowercases the input + * @param numLabels The maximal number of labels expected + * @param tin + * @param tagDelimit Separator between tokens and tag number + * @param delimit Used to define token boundaries + * @param impDelimit Delimiters to be retained for tagging + * @param t Stores the labels + * @param cArray Stores the tokens + * @return number of lines read + * @throws IOException + */ + public static int readRowVarCol(int numLabels, BufferedReader tin, + String tagDelimit, String delimit, String impDelimit, int[] t, + String[][] cArray) throws IOException { + return readRowVarCol(numLabels,tin,tagDelimit,delimit,impDelimit,t,cArray,true); + } /** * Reads a block of text ended by a blank line or the end of the file. 
* The block contains lines of tokens with a label. @@ -333,19 +370,21 @@ protected static String[] getTokenList(String text, String delimit, * @param impDelimit Delimiters to be retained for tagging * @param t Stores the labels * @param cArray Stores the tokens + * @param lowerCase lowercase tokens before processing * @return number of lines read * @throws IOException */ public static int readRowVarCol(int numLabels, BufferedReader tin, String tagDelimit, String delimit, String impDelimit, int[] t, - String[][] cArray) throws IOException { + String[][] cArray,boolean lowerCase) throws IOException { int ptr = 0; String line; while(true) { line = tin.readLine(); StringTokenizer firstSplit=null; if (line!=null) { - firstSplit=new StringTokenizer(line.toLowerCase(),tagDelimit); + final String modifiedLine = lowerCase ? line.toLowerCase() : line; + firstSplit=new StringTokenizer(modifiedLine,tagDelimit); } if ((line==null) || (firstSplit.countTokens()<2)) { // Empty Line @@ -354,17 +393,23 @@ public static int readRowVarCol(int numLabels, BufferedReader tin, String w = firstSplit.nextToken(); int label=Integer.parseInt(firstSplit.nextToken()); t[ptr] = label; - cArray[ptr++] = getTokenList(w,delimit,impDelimit); + cArray[ptr++] = getTokenList(w,delimit,impDelimit,lowerCase); } } + static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit, + String delimit, String impDelimit, int[] t, String[][] cArray, int labels[]) + throws IOException { + return readRowFixedCol(numLabels,tin,tagDelimit,delimit,impDelimit,t,cArray,labels,true); + } static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit, - String delimit, String impDelimit, int[] t, String[][] cArray, int labels[]) + String delimit, String impDelimit, int[] t, String[][] cArray, int labels[],boolean lowerCase) throws IOException { String line=tin.readLine(); if (line == null) return 0; - StringTokenizer firstSplit=new StringTokenizer(line.toLowerCase(),tagDelimit,true); + 
final String modifiedLine = lowerCase ? line.toLowerCase() : line; + StringTokenizer firstSplit=new StringTokenizer(modifiedLine,tagDelimit,true); int ptr = 0; for (int i = 0; (i < labels.length) && firstSplit.hasMoreTokens(); i++) { int label = labels[i]; @@ -378,7 +423,7 @@ static int readRowFixedCol(int numLabels, BufferedReader tin, String tagDelimit, } if ((label > 0) && (label <= numLabels)) { t[ptr] = label; - cArray[ptr++] = getTokenList(w,delimit,impDelimit); + cArray[ptr++] = getTokenList(w,delimit,impDelimit,lowerCase); } } return ptr; @@ -416,10 +461,14 @@ protected static int[] readHeaderInfo(int numLabels, BufferedReader tin, return labels; } - + public static TrainData readTagged(int numLabels, String tfile, + String rfile, String delimit, String tagDelimit, String impDelimit, + LabelMap labelMap) { + return readTagged(numLabels,tfile,rfile,delimit,tagDelimit,impDelimit,labelMap,true); + } public static TrainData readTagged(int numLabels, String tfile, String rfile, String delimit, String tagDelimit, String impDelimit, - LabelMap labelMap) { + LabelMap labelMap,boolean lowerCase) { try { ArrayList td = new ArrayList(); BufferedReader tin = new BufferedReader(new FileReader(tfile @@ -447,10 +496,10 @@ public static TrainData readTagged(int numLabels, String tfile, int ptr = 0; if (fixedColFormat) { ptr = readRowFixedCol(numLabels, tin, tagDelimit, delimit, - impDelimit, t, cArray, labels); + impDelimit, t, cArray, labels,lowerCase); } else { ptr = readRowVarCol(numLabels, tin, tagDelimit, delimit, - impDelimit, t, cArray); + impDelimit, t, cArray,lowerCase); } if (ptr == 0) { break; @@ -471,12 +520,13 @@ public static TrainData readTagged(int numLabels, String tfile, return null; } - public static void readRaw(Vector data,String file,String delimit,String impDelimit) { + public static void readRaw(Vector data,String file,String delimit,String impDelimit,boolean lowerCase) { try { BufferedReader rin=new BufferedReader(new FileReader(file+".raw")); 
String line; while((line=rin.readLine())!=null) { - StringTokenizer tok=new StringTokenizer(line.toLowerCase(),delimit,true); + final String modifiedLine = lowerCase ? line.toLowerCase() : line; + StringTokenizer tok=new StringTokenizer(modifiedLine,delimit,true); String seq[]=new String[tok.countTokens()]; int count=0; for(int i=0 ; i s = new Vector(); - TrainData tdMan = DataCruncher.readTagged(nlabels,baseDir+"/data/"+inName+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap); - TrainData tdAuto = DataCruncher.readTagged(nlabels,baseDir+"/out/"+outDir+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap); - DataCruncher.readRaw(s,baseDir+"/data/"+inName+"/"+inName+".test","",""); + TrainData tdMan = DataCruncher.readTagged(nlabels,baseDir+"/data/"+inName+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap,lowerCase); + TrainData tdAuto = DataCruncher.readTagged(nlabels,baseDir+"/out/"+outDir+"/"+inName+".test",baseDir+"/data/"+inName+"/"+inName+".test",delimit,tagDelimit,impDelimit,labelMap,lowerCase); + DataCruncher.readRaw(s,baseDir+"/data/"+inName+"/"+inName+".test","","",lowerCase); int len=tdAuto.size(); int truePos[]=new int[nlabels+1]; int totalMarkedPos[]=new int[nlabels+1]; diff --git a/test/iitb/Segment/DataCruncherGetTokenListTest.java b/test/iitb/Segment/DataCruncherGetTokenListTest.java index 99b9cb8..74f1d88 100644 --- a/test/iitb/Segment/DataCruncherGetTokenListTest.java +++ b/test/iitb/Segment/DataCruncherGetTokenListTest.java @@ -39,4 +39,15 @@ public void testGetTokenList() { assertEquals(tokens[1], "goldfield"); assertEquals(tokens[4], "|3"); } + @Test + public void testGetTokenListWithoutLowerCasing() { + String tokenString = "West Goldfield Avenue, |3"; + String delimit = ",\t/ -():.;'?#`&\"_"; + String impDelimit = ","; + String[] tokens = DataCruncher.getTokenList(tokenString, delimit,
impDelimit,false); + + assertEquals(tokens.length, 5); + assertEquals(tokens[1], "Goldfield"); + assertEquals(tokens[4], "|3"); + } } diff --git a/test/iitb/Segment/DataCruncherReadRowFixedColTest.java b/test/iitb/Segment/DataCruncherReadRowFixedColTest.java index 9e81384..eae59de 100644 --- a/test/iitb/Segment/DataCruncherReadRowFixedColTest.java +++ b/test/iitb/Segment/DataCruncherReadRowFixedColTest.java @@ -48,7 +48,29 @@ public void testReadRowFixedCol() { e.printStackTrace(); } } - + + + @Test + public void testReadRowFixedColWithoutDowncasing() { + int numLabels = 7; + BufferedReader reader = new BufferedReader(new StringReader(tagged)); + String tagDelimit = "|"; + String delimit = ",\t/ -():.;'?#`&\"_"; + String impDelimit = ","; + int[] t = new int[numLabels]; + String[][] cArray = new String[numLabels][0]; + try { + int[] labels = DataCruncher.readHeaderInfo(numLabels, reader, tagDelimit); + int ptr = DataCruncher.readRowFixedCol(numLabels, reader, + tagDelimit, delimit, impDelimit, t, cArray, labels,false); + assertEquals(ptr, 4); + assertEquals(cArray[0][2], "Road"); + assertEquals(cArray[3][0], "99603"); + } catch (IOException e) { + e.printStackTrace(); + } + } + @Test public void testReadRowFixedColSecond() { int numLabels = 7; @@ -73,4 +95,30 @@ public void testReadRowFixedColSecond() { e.printStackTrace(); } } + + @Test + public void testReadRowFixedColSecondWithoutDowncasing() { + int numLabels = 7; + BufferedReader reader = new BufferedReader(new StringReader(tagged)); + String tagDelimit = "|"; + String delimit = ",\t/ -():.;'?#`&\"_"; + String impDelimit = ","; + int[] t = new int[numLabels]; + String[][] cArray = new String[numLabels][0]; + try { + int[] labels = DataCruncher.readHeaderInfo(numLabels, reader, tagDelimit); + int ptr = DataCruncher.readRowFixedCol(numLabels, reader, + tagDelimit, delimit, impDelimit, t, cArray, labels,false); + t = new int[numLabels]; + cArray = new String[numLabels][0]; + ptr = 
DataCruncher.readRowFixedCol(numLabels, reader, + tagDelimit, delimit, impDelimit, t, cArray, labels,false); + assertEquals(ptr, 4); + assertEquals(cArray[0][2], "Center"); + assertEquals(cArray[3][0], "36201"); + } catch (IOException e) { + e.printStackTrace(); + } + } + } diff --git a/test/iitb/Segment/DataCruncherReadRowVarColTest.java b/test/iitb/Segment/DataCruncherReadRowVarColTest.java index 04f0f41..e8ae50e 100644 --- a/test/iitb/Segment/DataCruncherReadRowVarColTest.java +++ b/test/iitb/Segment/DataCruncherReadRowVarColTest.java @@ -47,8 +47,32 @@ public void testReadRowVarCol() { e.printStackTrace(); } } - - @Test + + @Test + public void testReadRowVarColWithoutDowncasing() { + String file = "testdata" + File.separator + "us50-short.tagged"; + try { + BufferedReader tin = new BufferedReader(new FileReader(file)); + int numLabels = 7; + int[] t = new int[7]; + String[][] cArray = new String[7][0]; + String tagDelimit = "|"; + String delimit = ",\t/ -():.;'?#`&\"_"; + String impDelimit = ","; + int ptr = DataCruncher.readRowVarCol(numLabels, tin, tagDelimit, delimit,impDelimit,t,cArray,false); + assertEquals(ptr, 4); + assertEquals(t[3], 7); + assertEquals(cArray[1][1], ","); + assertEquals(cArray[0][2], "road"); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + + + @Test public void testReadRowVarColEof() { String file = "testdata" + File.separator + "us50-short.tagged"; try { @@ -69,4 +93,26 @@ public void testReadRowVarColEof() { e.printStackTrace(); } } + @Test + public void testReadRowVarColEofWithoutDowncasing() { + String file = "testdata" + File.separator + "us50-short.tagged"; + try { + BufferedReader tin = new BufferedReader(new FileReader(file)); + int numLabels = 7; + int[] t = new int[7]; + String[][] cArray = new String[7][0]; + String tagDelimit = "|"; + String delimit = ",\t/ -():.;'?#`&\"_"; + String impDelimit = ","; + int ptr = DataCruncher.readRowVarCol(numLabels, 
tin, tagDelimit, delimit,impDelimit,t,cArray,false); + //Will run into end of file + ptr = DataCruncher.readRowVarCol(numLabels, tin, tagDelimit, delimit,impDelimit,t,cArray,false); + assertEquals(ptr, 4); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + } diff --git a/test/iitb/Segment/DataCruncherReadTaggedTest.java b/test/iitb/Segment/DataCruncherReadTaggedTest.java index 37315c5..c10cb8d 100644 --- a/test/iitb/Segment/DataCruncherReadTaggedTest.java +++ b/test/iitb/Segment/DataCruncherReadTaggedTest.java @@ -43,4 +43,27 @@ public void testReadTagged() { assertEquals("road", seq.x(2)); assertEquals(",", seq.x(3)); } + + @Test + public void testReadTaggedWithoutLowerCasing() { + String file = "testdata" + File.separator + "us50-short"; + DataCruncher.createRaw(file, "|"); + File raw = new File("testdata" + File.separator + "us50-short.raw"); + raw.deleteOnExit(); + + int numLabels = 7; + String tfile = file; + String rfile = file; + String tagDelimit = "|"; + String delimit = ",\t/ -():.;'?#`&\"_"; + String impDelimit = ","; + LabelMap labelMap = new LabelMap(); + TrainData data = DataCruncher.readTagged(numLabels, tfile, rfile, delimit, tagDelimit, impDelimit, labelMap,false); + assertTrue(data.hasNext()); + assertEquals(2, data.size()); + DataSequence seq = data.next(); + assertEquals(8, seq.length()); + assertEquals("Road", seq.x(2)); + assertEquals(",", seq.x(3)); + } } From aa05c72c4ae66f4ef8f5978ca0cf9fa45704feaf Mon Sep 17 00:00:00 2001 From: Maximilian Rudolph Date: Mon, 13 Oct 2014 15:54:00 +0200 Subject: [PATCH 2/2] fixed test --- test/iitb/Segment/DataCruncherReadRowVarColTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/iitb/Segment/DataCruncherReadRowVarColTest.java b/test/iitb/Segment/DataCruncherReadRowVarColTest.java index e8ae50e..ad0f279 100644 --- a/test/iitb/Segment/DataCruncherReadRowVarColTest.java +++ 
b/test/iitb/Segment/DataCruncherReadRowVarColTest.java @@ -63,7 +63,7 @@ public void testReadRowVarColWithoutDowncasing() { assertEquals(ptr, 4); assertEquals(t[3], 7); assertEquals(cArray[1][1], ","); - assertEquals(cArray[0][2], "road"); + assertEquals(cArray[0][2], "Road"); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) {