From 17cc866cc97a60991b00dd8f47d4bf5abbd9d725 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 17 Aug 2015 11:27:43 +0200
Subject: [PATCH 01/56] Kleinigkeiten

---
 my_tmw.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/my_tmw.py b/my_tmw.py
index 71076df..88ce0aa 100644
--- a/my_tmw.py
+++ b/my_tmw.py
@@ -97,10 +97,10 @@
 #tmw.average_topicscores(corpuspath, mastermatrixfile, metadatafile, topics_in_texts, targets, mode, number_of_topics, outfolder)
 
 ### 5b make_topic_distribution_plot
-aggregates = wdir+"/7_aggregates/avg*decade.csv" # if mode == lineplot, use only bydecade data!
+aggregates = wdir+"/7_aggregates/avg*decade.csv" # if mode == lineplot / areaplot, use only bydecade data!
 outfolder = wdir+"/8_visuals/"
 topicwordfile = wdir+"/6_mallet/topics-with-words.csv"
-rows_shown = 200 # if mode == lineplot, set to maximum number of topics
+rows_shown = 200 # if mode == lineplot / areaplot, set to maximum number of topics
 font_scale = 1.0
 dpi = 300
 mode = "areaplot" # heatmap|lineplot|areaplot

From 08fd0426894a30bfa05b1a25770ac048a9782466 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?=
 <daniel.schloer@informatik.uni-wuerzburg.de>
Date: Wed, 19 Aug 2015 09:36:57 +0200
Subject: [PATCH 02/56] Flexible paragraph-respecting segmenter function with
 tolerance factor added

---
 tmw.py | 87 +++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 52 insertions(+), 35 deletions(-)

diff --git a/tmw.py b/tmw.py
index f25d8a6..fe0e5d3 100644
--- a/tmw.py
+++ b/tmw.py
@@ -68,53 +68,70 @@ def tei5reader_fulldocs(inpath, outfolder):
             output.write(outtext)
     print("Done.")
 
-
-def segmenter(inpath, outfolder, target):
+# Utility function for writing segments
+def writesegment(segment, outfolder, filename, counter):
+    from os.path import join
+    segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+    with open(segname,"w") as output:
+        output.write(' '.join(segment))
+    output.close()
+
+# Parameters:
+#   - inpath:               path to search documents in
+#   - outfolder:            path to save segments in
+#   - target:               number of words per segment
+#   - sizetolerancefactor:  factor of which exceedance of target is tolerated before slicing paragraphs
+#                               1 for zero tolerance
+#                              -1 for infinity tolerance
+#   - preserveparagraphs:   if True, segments will contain linebreaks according to paragraphs
+#
+def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparagraphs = False):
     """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries."""
     print("\nLaunched segmenter.")
-
     import os
-    import glob
     import re
+    from os import listdir
+    from os.path import join
 
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-        
-    for file in glob.glob(inpath):
+    counter = 1
+    for relfile in listdir(inpath):
+        file = join(inpath, relfile)
         with open(file, "r") as infile:
             filename = os.path.basename(file)[:-4]
-            #print("File name: ", filename)
-            text = infile.read()
 
-            text = re.sub("[,;\.!?—]", " ", text)
-            text = re.sub("-", " ", text)
-            text = re.sub("\n", " ", text)
-            text = re.sub("[ ]{1,9}", " ", text)
-            words = re.split("\W", text)
-            #print("Number of words: ", filename, len(words))
-            #for word in words[0:31]:
-            #    print(word)
-
-            seg = ""
-            actual = 0
-            counter = 0
-            for i in range(len(words)-1):
-                if len(words[i]) > 1:
-                    if actual < target:
-                        seg = seg + words[i] + " "
-                        #print(words[i])
-                        segsplit = re.split(" ", seg)
-                        actual = len(segsplit)
-                    else:
-                        counter += 1
-                        actual = 0
-                        segname = outfolder + filename + "§{:04d}".format(counter) + ".txt"
-                        with open(segname,"w") as output:
-                            output.write(seg)
-                            seg = ""
+            segment = []
+            for line in infile:
+                text = line
+                text = re.sub("[,;\.!?—]", " ", text)
+                text = re.sub("-", " ", text)
+                text = re.sub("[ ]{1,9}", " ", text)
+                words = re.split("\W", text)
+                if preserveparagraphs:
+                    words.append("\n")
+                if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:
+                    print("Segment length extending size-constraints. Checking if segment length is sufficient yet.")
+                    if len(segment) * sizetolerancefactor < target:
+                        print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.")
+                        # wortweise auffüllen
+                        wordsliceindex = target - len(segment)
+                        segment.extend(words[0:wordsliceindex])
+                        words = words[wordsliceindex:len(words)]
+                    print("Segment length: \t", len(segment))
+                    writesegment(segment, outfolder, filename, counter)
+                    counter = counter + 1
+                    segment = []
+                segment.extend(words)
+                if len(segment) >= target:
+                    print("Segment length: \t", len(segment))
+                    writesegment(segment, outfolder, filename, counter)
+                    counter = counter + 1
+                    segment = []
+        print("Segment length: \t", len(segment))
+        writesegment(segment, outfolder, filename, counter)
     print("Done.")
 
-
 def segments_to_bins(inpath, outfile):
     """Script for sorting text segments into bins."""
     print("\nLaunched segments_to_bins.")

From 14ff8ade67f87acdfeb978aa9ab0eeb73c12c5dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Thu, 20 Aug 2015 11:50:29 +0200
Subject: [PATCH 03/56] Tokenization changed

---
 tmw.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index fe0e5d3..cae666b 100644
--- a/tmw.py
+++ b/tmw.py
@@ -89,9 +89,11 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
     """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries."""
     print("\nLaunched segmenter.")
     import os
+    import glob
     import re
     from os import listdir
     from os.path import join
+    from nltk.tokenize import word_tokenize
 
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
@@ -107,7 +109,9 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
                 text = re.sub("[,;\.!?—]", " ", text)
                 text = re.sub("-", " ", text)
                 text = re.sub("[ ]{1,9}", " ", text)
-                words = re.split("\W", text)
+               # words = re.split("\W", text)
+                words = word_tokenize(text)
+                print(words)
                 if preserveparagraphs:
                     words.append("\n")
                 if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:

From 3e7eca3bffe57ad089264e4ca24dfe08f3980a98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Thu, 20 Aug 2015 11:53:23 +0200
Subject: [PATCH 04/56] Code cleanup

---
 tmw.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tmw.py b/tmw.py
index cae666b..831dc84 100644
--- a/tmw.py
+++ b/tmw.py
@@ -89,7 +89,6 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
     """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries."""
     print("\nLaunched segmenter.")
     import os
-    import glob
     import re
     from os import listdir
     from os.path import join
@@ -106,12 +105,10 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
             segment = []
             for line in infile:
                 text = line
-                text = re.sub("[,;\.!?—]", " ", text)
+                text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text)
                 text = re.sub("-", " ", text)
                 text = re.sub("[ ]{1,9}", " ", text)
-               # words = re.split("\W", text)
                 words = word_tokenize(text)
-                print(words)
                 if preserveparagraphs:
                     words.append("\n")
                 if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:

From c0e101b1ab8106e414b71604adb87dc03b58cbed Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Thu, 20 Aug 2015 16:07:51 +0200
Subject: [PATCH 05/56] last segment added to previous if too small

---
 tmw.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tmw.py b/tmw.py
index 831dc84..f1a7500 100644
--- a/tmw.py
+++ b/tmw.py
@@ -69,10 +69,10 @@ def tei5reader_fulldocs(inpath, outfolder):
     print("Done.")
 
 # Utility function for writing segments
-def writesegment(segment, outfolder, filename, counter):
+def writesegment(segment, outfolder, filename, counter, mode="w"):
     from os.path import join
     segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
-    with open(segname,"w") as output:
+    with open(segname, mode) as output:
         output.write(' '.join(segment))
     output.close()
 
@@ -98,6 +98,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
         os.makedirs(outfolder)
     counter = 1
     for relfile in listdir(inpath):
+        counter = 1
         file = join(inpath, relfile)
         with open(file, "r") as infile:
             filename = os.path.basename(file)[:-4]
@@ -130,7 +131,13 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
                     counter = counter + 1
                     segment = []
         print("Segment length: \t", len(segment))
-        writesegment(segment, outfolder, filename, counter)
+        if sizetolerancefactor != -1 and len(segment) * sizetolerancefactor < target:
+            print("Segment length of last Segment too short. Adding text to previous segment.")
+            counter = counter - 1
+            writesegment(segment, outfolder, filename, counter, "a")
+        else:
+            writesegment(segment, outfolder, filename, counter)
+
     print("Done.")
 
 def segments_to_bins(inpath, outfile):

From 5d9e06fbde5c916c39ed4f4a68e6c0c104cf629f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Fri, 21 Aug 2015 16:42:24 +0200
Subject: [PATCH 06/56] Single lines with length > target segment size will be
 sliced

---
 tmw.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/tmw.py b/tmw.py
index f1a7500..2ad92ac 100644
--- a/tmw.py
+++ b/tmw.py
@@ -93,16 +93,18 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
     from os import listdir
     from os.path import join
     from nltk.tokenize import word_tokenize
+    import glob
 
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
     counter = 1
-    for relfile in listdir(inpath):
+    for relfile in glob.glob(inpath):
         counter = 1
         file = join(inpath, relfile)
         with open(file, "r") as infile:
             filename = os.path.basename(file)[:-4]
-
+            if filename == "rf0053":
+                print("now")
             segment = []
             for line in infile:
                 text = line
@@ -112,7 +114,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
                 words = word_tokenize(text)
                 if preserveparagraphs:
                     words.append("\n")
-                if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:
+                while sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:
                     print("Segment length extending size-constraints. Checking if segment length is sufficient yet.")
                     if len(segment) * sizetolerancefactor < target:
                         print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.")
@@ -125,17 +127,24 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
                     counter = counter + 1
                     segment = []
                 segment.extend(words)
-                if len(segment) >= target:
+                if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
+                    print("Segment length of last Segment too short. Adding text to previous segment.")
+                    counter = counter - 1
+                    writesegment(segment, outfolder, filename, counter, "a")
+                    counter = counter + 1
                     print("Segment length: \t", len(segment))
+                    segment = []
+                elif len(segment) > 0:
                     writesegment(segment, outfolder, filename, counter)
+                    print("Segment length: \t", len(segment))
                     counter = counter + 1
                     segment = []
-        print("Segment length: \t", len(segment))
-        if sizetolerancefactor != -1 and len(segment) * sizetolerancefactor < target:
+        if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
             print("Segment length of last Segment too short. Adding text to previous segment.")
             counter = counter - 1
             writesegment(segment, outfolder, filename, counter, "a")
-        else:
+            counter = counter + 1
+        elif len(segment) > 0:
             writesegment(segment, outfolder, filename, counter)
 
     print("Done.")
@@ -319,7 +328,7 @@ def nltk_stanfordpos(inpath, outfolder):
 
     import os
     import glob
-    from nltk.tag.stanford import POSTagger
+    from nltk.tag.stanford import StanfordPOSTagger as POSTagger
 
     for file in glob.glob(inpath):
         st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8")
@@ -522,7 +531,7 @@ def get_color_scale(word, font_size, position, orientation, random_state=None):
         #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf"
         wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text)
         default_colors = wordcloud.to_array()
-        plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3))
+        plt.imshow(wordcloud.recolor(random_state=3)) #color_func=get_color_scale
         plt.imshow(default_colors)
         plt.imshow(wordcloud)
         plt.title(figure_title, fontsize=24)

From e1214dd2ee2e167d67501d3b9aff22b33dfaf423 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Fri, 21 Aug 2015 16:45:39 +0200
Subject: [PATCH 07/56] cleanup to avoid conflicts

---
 tmw.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tmw.py b/tmw.py
index 2ad92ac..56e8bf8 100644
--- a/tmw.py
+++ b/tmw.py
@@ -103,8 +103,6 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
         file = join(inpath, relfile)
         with open(file, "r") as infile:
             filename = os.path.basename(file)[:-4]
-            if filename == "rf0053":
-                print("now")
             segment = []
             for line in infile:
                 text = line
@@ -328,7 +326,7 @@ def nltk_stanfordpos(inpath, outfolder):
 
     import os
     import glob
-    from nltk.tag.stanford import StanfordPOSTagger as POSTagger
+    from nltk.tag.stanford import POSTagger
 
     for file in glob.glob(inpath):
         st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8")
@@ -531,7 +529,7 @@ def get_color_scale(word, font_size, position, orientation, random_state=None):
         #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf"
         wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text)
         default_colors = wordcloud.to_array()
-        plt.imshow(wordcloud.recolor(random_state=3)) #color_func=get_color_scale
+        plt.imshow(color_func=get_color_scale, wordcloud.recolor(random_state=3))
         plt.imshow(default_colors)
         plt.imshow(wordcloud)
         plt.title(figure_title, fontsize=24)

From d2cd4502a98f9ce092348227d1ac6baa3514b69a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Fri, 21 Aug 2015 16:46:33 +0200
Subject: [PATCH 08/56] cleanup to avoid conflicts

---
 tmw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index 56e8bf8..a8d08eb 100644
--- a/tmw.py
+++ b/tmw.py
@@ -529,7 +529,7 @@ def get_color_scale(word, font_size, position, orientation, random_state=None):
         #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf"
         wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text)
         default_colors = wordcloud.to_array()
-        plt.imshow(color_func=get_color_scale, wordcloud.recolor(random_state=3))
+        plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3))
         plt.imshow(default_colors)
         plt.imshow(wordcloud)
         plt.title(figure_title, fontsize=24)

From 24e136dd38f3cd6bb62c754cab64e30ec455550f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Mon, 24 Aug 2015 13:06:11 +0200
Subject: [PATCH 09/56] keeping track of last segment size

---
 tmw.py | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 136 insertions(+), 12 deletions(-)

diff --git a/tmw.py b/tmw.py
index a8d08eb..0c19e38 100644
--- a/tmw.py
+++ b/tmw.py
@@ -57,10 +57,10 @@ def tei5reader_fulldocs(inpath, outfolder):
             ### Some cleaning up
             text = re.sub("  ", "", text)
             #text = re.sub("    ", "", text)
-            text = re.sub("\n{1,6}", " ", text)
+          #  text = re.sub("\n{1,6}", " ", text)
             #text = re.sub("\n{1,6}", "\n", text)
-            text = re.sub("\n \n", "\n", text)
-            text = re.sub("\t\n", "", text)
+          #  text = re.sub("\n \n", "\n", text)
+           # text = re.sub("\t\n", "", text)
 
             outtext = str(text)
             outfile = outfolder + filename + ".txt"
@@ -76,6 +76,71 @@ def writesegment(segment, outfolder, filename, counter, mode="w"):
         output.write(' '.join(segment))
     output.close()
 
+def write(segment, file, mode = "w"):
+    with open(file, mode) as output:
+        output.write(' '.join(segment))
+        output.close()
+
+
+counter = 0
+currentsegmentsize = 0
+
+# Utility function for writing segments
+def writesegment(segment, outfolder, filename, target, tolerancefactor):
+    from os.path import join
+    global currentsegmentsize
+    global counter
+
+    segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+
+    # case: last segment is too small => fill with (slice of) new segment
+    if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split
+            #split segment
+            wordsliceindex = target - len(currentsegmentsize)
+
+            # if it's too big: slice!
+            if len(segment) > wordsliceindex:
+                write(segment[0:wordsliceindex], segname, "a")
+                currentsegmentsize += wordsliceindex
+                segment = segment[wordsliceindex:len(segment)]
+            else:
+                # segment fits so append
+                write(segment, segname, "a")
+                currentsegmentsize += len(segment)
+                # done
+                return
+
+
+    # # case: new segment is too big, last segment is too small
+    # while currentsegmentsize + len(segment) > target * tolerancefactor: # max size limit exceeded
+    #     if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split
+    #         #split segment
+    #         wordsliceindex = target - len(currentsegmentsize)
+    #         write(segment[0:wordsliceindex], segname, "a")
+    #         segment = segment[wordsliceindex:len(segment)]
+    #         currentsegmentsize += wordsliceindex
+
+
+    # case: new segment is too big
+    # if segment > targer: slice segment
+    while len(segment) > target * tolerancefactor:
+        counter += 1
+        currentsegmentsize = 0
+        write(segment[0:target], segname)
+        segment = segment[target:len(segment)]
+
+    # now #segment is < target
+    if (len(segment) == 0):
+        #segment was perfectly sliced so we are done
+        return
+
+    # there's some part of segment left, write this to file
+    counter += 1
+    currentsegmentsize = len(segment)
+    write(segment, segname)
+
+
+
 # Parameters:
 #   - inpath:               path to search documents in
 #   - outfolder:            path to save segments in
@@ -98,52 +163,111 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
     counter = 1
+    # work on files in inpath
     for relfile in glob.glob(inpath):
         counter = 1
+
+        # get absolut filename
         file = join(inpath, relfile)
+        # track size of last segment in order to avoid exceeding size constraints while appending
+
+        lastsegment = 0
         with open(file, "r") as infile:
             filename = os.path.basename(file)[:-4]
+            # segment contains words assigned to the current segment
             segment = []
+
+            # go thru paragraphs one by one
             for line in infile:
                 text = line
+                # remove special characters and space-chains
                 text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text)
                 text = re.sub("-", " ", text)
                 text = re.sub("[ ]{1,9}", " ", text)
+
+                # tokanize text
                 words = word_tokenize(text)
+
                 if preserveparagraphs:
                     words.append("\n")
+
+                # while current #segment and #remaining-words exceed limitation (e.g. line too long)
                 while sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:
                     print("Segment length extending size-constraints. Checking if segment length is sufficient yet.")
+
+                    # if #segment is yet too small, extend to desired limit with part of words (e.g. slice line)
                     if len(segment) * sizetolerancefactor < target:
                         print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.")
                         # wortweise auffüllen
                         wordsliceindex = target - len(segment)
                         segment.extend(words[0:wordsliceindex])
                         words = words[wordsliceindex:len(words)]
+
+                    # Possible states: Line: <line still too long> / <line meeting constraints>
+                    #               Segment: <segment within size constraints> => save segment
+
                     print("Segment length: \t", len(segment))
                     writesegment(segment, outfolder, filename, counter)
                     counter = counter + 1
+                    lastsegment = len(segment)
                     segment = []
+
+                    # dealing with state: <line still too long>:
+                    if len(words) > target * sizetolerancefactor:
+                        segment.extend(words[0:target])
+                        words = words[target:len(words)]
+
+                # line completely processed
+                if len(words) == 0:
+                    continue
+
+                # possible states:  <line meeting size constraints> / <line too small>
+
                 segment.extend(words)
+
+                # dealing with state: <line too small>:
+                # if words is too small for its own segment: append to previous segment ignoring further limitations
                 if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
                     print("Segment length of last Segment too short. Adding text to previous segment.")
-                    counter = counter - 1
-                    writesegment(segment, outfolder, filename, counter, "a")
+
+                    # avoid appending beyond size constraints:
+                    if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor:
+                        print("Segment length of last segment exceeded. Starting new segment")
+                        lastsegment = len(segment)
+                        writesegment(segment, outfolder, filename, counter)
+                    else:
+                        print("Segment length of last segment: " + str(lastsegment))
+                        counter = counter - 1
+                        lastsegment += len(segment)
+                        writesegment(segment, outfolder, filename, counter, "a")
                     counter = counter + 1
                     print("Segment length: \t", len(segment))
                     segment = []
+                # otherways just save remaining words in own segment
                 elif len(segment) > 0:
                     writesegment(segment, outfolder, filename, counter)
                     print("Segment length: \t", len(segment))
                     counter = counter + 1
+                    lastsegment = len(segment)
                     segment = []
-        if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
-            print("Segment length of last Segment too short. Adding text to previous segment.")
-            counter = counter - 1
-            writesegment(segment, outfolder, filename, counter, "a")
-            counter = counter + 1
-        elif len(segment) > 0:
-            writesegment(segment, outfolder, filename, counter)
+        # # following code might be obsolete. TODO: check!
+        # if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
+        #     print("Segment length of last Segment too short. Adding text to previous segment.")
+        #     # avoid appending beyond size constraints:
+        #     if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor:
+        #         print("Segment length of last segment exceeded. Starting new segment")
+        #         lastsegment = 0
+        #         writesegment(segment, outfolder, filename, counter)
+        #     else:
+        #         counter = counter - 1
+        #         lastsegment += len(segment)
+        #         writesegment(segment, outfolder, filename, counter, "a")
+        #     counter = counter + 1
+        #     print("Segment length: \t", len(segment))
+        #     segment = []
+        # elif len(segment) > 0:
+        #     writesegment(segment, outfolder, filename, counter)
+        #     lastsegment = segment
 
     print("Done.")
 

From e1e6e695c92f2cde557f2c90347e64468bea8ba9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Mon, 24 Aug 2015 16:20:58 +0200
Subject: [PATCH 10/56] complete refactoring of segmentation

---
 tmw.py | 207 +++++++++++++++++++++++++--------------------------------
 1 file changed, 89 insertions(+), 118 deletions(-)

diff --git a/tmw.py b/tmw.py
index 0c19e38..d9eea14 100644
--- a/tmw.py
+++ b/tmw.py
@@ -76,68 +76,110 @@ def writesegment(segment, outfolder, filename, counter, mode="w"):
         output.write(' '.join(segment))
     output.close()
 
+# Utility function for writing into files
 def write(segment, file, mode = "w"):
     with open(file, mode) as output:
         output.write(' '.join(segment))
         output.close()
 
 
+# global segment counter
 counter = 0
+
+# global current segment size
 currentsegmentsize = 0
 
 # Utility function for writing segments
-def writesegment(segment, outfolder, filename, target, tolerancefactor):
+def writesegment(segment, outfolder, filename, target, tolerancefactor, preserveparagraphs):
     from os.path import join
     global currentsegmentsize
     global counter
 
+
+
+
+    # ignore empty segments
+    if segment == ["\n"] or len(segment) < 1:
+        return
+
+
+    # workaround for easy inter line-spacing in case of paragraph removal for lines combined into one segment
+    if not preserveparagraphs and segment[-1] == "\n":
+        segment = segment[0:len(segment) - 1]
+        segment[-1] += " "
+
+
     segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+    relname = filename + "§{:04d}".format(counter) + ".txt"
+
+
 
     # case: last segment is too small => fill with (slice of) new segment
     if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split
-            #split segment
-            wordsliceindex = target - len(currentsegmentsize)
-
-            # if it's too big: slice!
-            if len(segment) > wordsliceindex:
-                write(segment[0:wordsliceindex], segname, "a")
-                currentsegmentsize += wordsliceindex
-                segment = segment[wordsliceindex:len(segment)]
-            else:
-                # segment fits so append
-                write(segment, segname, "a")
-                currentsegmentsize += len(segment)
-                # done
-                return
-
-
-    # # case: new segment is too big, last segment is too small
-    # while currentsegmentsize + len(segment) > target * tolerancefactor: # max size limit exceeded
-    #     if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split
-    #         #split segment
-    #         wordsliceindex = target - len(currentsegmentsize)
-    #         write(segment[0:wordsliceindex], segname, "a")
-    #         segment = segment[wordsliceindex:len(segment)]
-    #         currentsegmentsize += wordsliceindex
-
+        #split segment
+        wordsliceindex = target - currentsegmentsize
+
+        # if it's too big: slice!
+        if currentsegmentsize + len(segment) > target * tolerancefactor:
+            print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(wordsliceindex) + "\t for a total of " + str((currentsegmentsize + wordsliceindex)))
+            write(segment[0:wordsliceindex], segname, "a")
+            currentsegmentsize += wordsliceindex
+            segment = segment[wordsliceindex:len(segment)]
+
+            # segment is filled. continue with next one
+            counter += 1
+            currentsegmentsize = 0
+            segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+            relname = filename + "§{:04d}".format(counter) + ".txt"
+            if os.path.isfile(segname):
+                os.remove(segname)
+        # else just add text to current segment
+        else:
+            print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment))))
+            # segment fits so append
+            write(segment, segname, "a")
+            currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account!
+            # done
+            return
 
     # case: new segment is too big
-    # if segment > targer: slice segment
+    # if segment > target: slice segment
     while len(segment) > target * tolerancefactor:
-        counter += 1
-        currentsegmentsize = 0
+        print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(target) + "\t for a total of " + str((currentsegmentsize + target)))
         write(segment[0:target], segname)
         segment = segment[target:len(segment)]
 
-    # now #segment is < target
+        # segment is filled. continue with next one
+        counter += 1
+        currentsegmentsize = 0
+        segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+        relname = filename + "§{:04d}".format(counter) + ".txt"
+        if os.path.isfile(segname):
+            os.remove(segname)
+        print(relname + "\t New segment with size \t0")
+
+    # now size of segment is < target
     if (len(segment) == 0):
         #segment was perfectly sliced so we are done
         return
 
-    # there's some part of segment left, write this to file
-    counter += 1
-    currentsegmentsize = len(segment)
-    write(segment, segname)
+    # there's some part of segment left, write this into file
+
+
+    # if the remaining part is exceeding current segment's capacity start new segment
+    if currentsegmentsize + len(segment) > target * tolerancefactor:
+        # segment is filled. continue with next one
+        counter += 1
+        currentsegmentsize = 0
+        segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+        relname = filename + "§{:04d}".format(counter) + ".txt"
+        if os.path.isfile(segname):
+            os.remove(segname)
+        print(relname + "\t New segment with size \t0")
+
+    print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment))))
+    currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account!
+    write(segment, segname, "a")
 
 
 
@@ -162,18 +204,25 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
 
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    counter = 1
+    global counter
+    global currentsegmentsize
     # work on files in inpath
     for relfile in glob.glob(inpath):
-        counter = 1
+
 
         # get absolut filename
         file = join(inpath, relfile)
-        # track size of last segment in order to avoid exceeding size constraints while appending
 
-        lastsegment = 0
         with open(file, "r") as infile:
             filename = os.path.basename(file)[:-4]
+
+            counter = 0
+            currentsegmentsize = 0
+            segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt")
+            relname = filename + "§{:04d}".format(counter) + ".txt"
+            if os.path.isfile(segname):
+                os.remove(segname)
+
             # segment contains words assigned to the current segment
             segment = []
 
@@ -188,86 +237,8 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag
                 # tokanize text
                 words = word_tokenize(text)
 
-                if preserveparagraphs:
-                    words.append("\n")
-
-                # while current #segment and #remaining-words exceed limitation (e.g. line too long)
-                while sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor:
-                    print("Segment length extending size-constraints. Checking if segment length is sufficient yet.")
-
-                    # if #segment is yet too small, extend to desired limit with part of words (e.g. slice line)
-                    if len(segment) * sizetolerancefactor < target:
-                        print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.")
-                        # wortweise auffüllen
-                        wordsliceindex = target - len(segment)
-                        segment.extend(words[0:wordsliceindex])
-                        words = words[wordsliceindex:len(words)]
-
-                    # Possible states: Line: <line still too long> / <line meeting constraints>
-                    #               Segment: <segment within size constraints> => save segment
-
-                    print("Segment length: \t", len(segment))
-                    writesegment(segment, outfolder, filename, counter)
-                    counter = counter + 1
-                    lastsegment = len(segment)
-                    segment = []
-
-                    # dealing with state: <line still too long>:
-                    if len(words) > target * sizetolerancefactor:
-                        segment.extend(words[0:target])
-                        words = words[target:len(words)]
-
-                # line completely processed
-                if len(words) == 0:
-                    continue
-
-                # possible states:  <line meeting size constraints> / <line too small>
-
-                segment.extend(words)
-
-                # dealing with state: <line too small>:
-                # if words is too small for its own segment: append to previous segment ignoring further limitations
-                if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
-                    print("Segment length of last Segment too short. Adding text to previous segment.")
-
-                    # avoid appending beyond size constraints:
-                    if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor:
-                        print("Segment length of last segment exceeded. Starting new segment")
-                        lastsegment = len(segment)
-                        writesegment(segment, outfolder, filename, counter)
-                    else:
-                        print("Segment length of last segment: " + str(lastsegment))
-                        counter = counter - 1
-                        lastsegment += len(segment)
-                        writesegment(segment, outfolder, filename, counter, "a")
-                    counter = counter + 1
-                    print("Segment length: \t", len(segment))
-                    segment = []
-                # otherways just save remaining words in own segment
-                elif len(segment) > 0:
-                    writesegment(segment, outfolder, filename, counter)
-                    print("Segment length: \t", len(segment))
-                    counter = counter + 1
-                    lastsegment = len(segment)
-                    segment = []
-        # # following code might be obsolete. TODO: check!
-        # if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target:
-        #     print("Segment length of last Segment too short. Adding text to previous segment.")
-        #     # avoid appending beyond size constraints:
-        #     if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor:
-        #         print("Segment length of last segment exceeded. Starting new segment")
-        #         lastsegment = 0
-        #         writesegment(segment, outfolder, filename, counter)
-        #     else:
-        #         counter = counter - 1
-        #         lastsegment += len(segment)
-        #         writesegment(segment, outfolder, filename, counter, "a")
-        #     counter = counter + 1
-        #     print("Segment length: \t", len(segment))
-        #     segment = []
-        # elif len(segment) > 0:
-        #     writesegment(segment, outfolder, filename, counter)
-        #     lastsegment = segment
+                words.append("\n")
+                writesegment(words, outfolder, filename, target, sizetolerancefactor, preserveparagraphs)
 
     print("Done.")
 

From fc64ddffe6defb346ee8f4e7eb57e3b82964bd16 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Mon, 24 Aug 2015 17:16:46 +0200
Subject: [PATCH 11/56] default value changed to 1

---
 tmw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index d9eea14..645e239 100644
--- a/tmw.py
+++ b/tmw.py
@@ -192,7 +192,7 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve
 #                              -1 for infinity tolerance
 #   - preserveparagraphs:   if True, segments will contain linebreaks according to paragraphs
 #
-def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparagraphs = False):
+def segmenter(inpath, outfolder, target, sizetolerancefactor = 1, preserveparagraphs = False):
     """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries."""
     print("\nLaunched segmenter.")
     import os

From 9f3e5b0b044a7542725ef155013424eced4b6929 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Tue, 25 Aug 2015 16:27:59 +0200
Subject: [PATCH 12/56] Removed defaults from segmenter, added parameters
 instead

---
 my_tmw.py | 4 +++-
 tmw.py    | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/my_tmw.py b/my_tmw.py
index 88ce0aa..1afa717 100644
--- a/my_tmw.py
+++ b/my_tmw.py
@@ -25,7 +25,9 @@
 inpath = wdir + "1_txt/*.txt"
 outpath = wdir + "2_segs/"
 segment_length = 1000
-#tmw.segmenter(inpath,outpath,segment_length)
+sizetolerancefactor = 1 # 1 = zero tolerance; 1.1 = +/- 10% tolerance.
+preserveparagraphs = False
+#tmw.segmenter(inpath,outpath,segment_length, sizetolerancefactor, preserveparagraphs)
 
 ### 1c - segments_to_bins: inpath, outfile
 inpath = wdir + "2_segs/*.txt"
diff --git a/tmw.py b/tmw.py
index d9eea14..51663d7 100644
--- a/tmw.py
+++ b/tmw.py
@@ -192,7 +192,7 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve
 #                              -1 for infinity tolerance
 #   - preserveparagraphs:   if True, segments will contain linebreaks according to paragraphs
 #
-def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparagraphs = False):
+def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs):
     """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries."""
     print("\nLaunched segmenter.")
     import os

From e6ec126f0edac68f11b7ac1b603714717c41b709 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Tue, 25 Aug 2015 17:55:16 +0200
Subject: [PATCH 13/56] make_lemmatext: with mode for POS to be chosen

---
 my_tmw.py |  4 +++-
 tmw.py    | 28 ++++++++++++++++++++--------
 2 files changed, 23 insertions(+), 9 deletions(-)

diff --git a/my_tmw.py b/my_tmw.py
index 1afa717..d295979 100644
--- a/my_tmw.py
+++ b/my_tmw.py
@@ -50,7 +50,9 @@
 ### 2c - make_lemmatext
 inpath = wdir + "4_tagged/*.trt"
 outfolder = wdir + "5_lemmata/"
-#tmw.make_lemmatext(inpath,outfolder)
+mode = "N" # N=nouns, NV=nouns+verbs, NVAA=nouns+verbs+adj+adverbs
+stoplist = ["<unknown>", "unknown"]
+#tmw.make_lemmatext(inpath, outfolder, mode, stoplist)
 
 
 
diff --git a/tmw.py b/tmw.py
index 51663d7..bb1829c 100644
--- a/tmw.py
+++ b/tmw.py
@@ -472,7 +472,7 @@ def call_treetagger(infolder, outfolder, tagger):
 
 
 
-def make_lemmatext(inpath,outfolder):
+def make_lemmatext(inpath, outfolder, mode, stoplist):
     """Function to extract lemmas from TreeTagger output."""
     print("\nLaunched make_lemmatext.")
 
@@ -496,13 +496,24 @@ def make_lemmatext(inpath,outfolder):
                 if len(splitline) == 3:
                     lemma = splitline[2]
                     pos = splitline[1]
-                    word = splitline[0]
-                    if "|" in lemma:
-                        lemmata.append(word.lower())
-                    elif "NOM" in pos and "|" not in lemma and "<unknown>" not in lemma:
-                    #elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "<unknown>" not in lemma:
-                        lemmata.append(lemma.lower())
-            stoplist = ["les","suis","est","un", "pas", "abord", "rien", "fait", "ton", "moi","être"]
+                    token = splitline[0]
+                    ## Select subset of lemmas according to parameter "mode"
+                    if mode == "N":
+                        if "|" in lemma:
+                            lemmata.append(token.lower())
+                        elif "NOM" in pos and "|" not in lemma and "<unknown>" not in lemma:
+                            lemmata.append(lemma.lower())
+                    elif mode == "NV":
+                        if "|" in lemma:
+                            lemmata.append(token.lower())
+                        elif "NOM" in pos or "VER" in pos and "|" not in lemma and "<unknown>" not in lemma:
+                            lemmata.append(lemma.lower())
+                    elif mode == "NVAA":
+                        if "|" in lemma:
+                            lemmata.append(token.lower())
+                        elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "<unknown>" not in lemma:
+                            lemmata.append(lemma.lower())
+            ## Continue with list of lemmata, but remove undesired leftover words         
             lemmata = ' '.join([word for word in lemmata if word not in stoplist])
             lemmata = re.sub("[ ]{1,4}"," ", lemmata)
             newfilename = os.path.basename(file)[:-4] + ".txt"
@@ -514,6 +525,7 @@ def make_lemmatext(inpath,outfolder):
 
 
 
+
 ##################################################################
 ###  3. Importing and modeling with Mallet                     ###
 ##################################################################

From cae9b1b518fedbdf303c169c3ba335ec1e44ef5c Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Tue, 25 Aug 2015 17:57:10 +0200
Subject: [PATCH 14/56] make_lemmatext: add option esN with criterion NC to
 mode

---
 my_tmw.py |  2 +-
 tmw.py    | 11 ++++++++---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/my_tmw.py b/my_tmw.py
index d295979..88f1e0f 100644
--- a/my_tmw.py
+++ b/my_tmw.py
@@ -50,7 +50,7 @@
 ### 2c - make_lemmatext
 inpath = wdir + "4_tagged/*.trt"
 outfolder = wdir + "5_lemmata/"
-mode = "N" # N=nouns, NV=nouns+verbs, NVAA=nouns+verbs+adj+adverbs
+mode = "esN" # esN=nouns, frN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs
 stoplist = ["<unknown>", "unknown"]
 #tmw.make_lemmatext(inpath, outfolder, mode, stoplist)
 
diff --git a/tmw.py b/tmw.py
index bb1829c..f14a7b4 100644
--- a/tmw.py
+++ b/tmw.py
@@ -498,21 +498,26 @@ def make_lemmatext(inpath, outfolder, mode, stoplist):
                     pos = splitline[1]
                     token = splitline[0]
                     ## Select subset of lemmas according to parameter "mode"
-                    if mode == "N":
+                    if mode == "frN":
                         if "|" in lemma:
                             lemmata.append(token.lower())
                         elif "NOM" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())
-                    elif mode == "NV":
+                    elif mode == "frNV":
                         if "|" in lemma:
                             lemmata.append(token.lower())
                         elif "NOM" in pos or "VER" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())
-                    elif mode == "NVAA":
+                    elif mode == "frNVAA":
                         if "|" in lemma:
                             lemmata.append(token.lower())
                         elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())
+                    if mode == "esN":
+                        if "|" in lemma:
+                            lemmata.append(token.lower())
+                        elif "NC" in pos and "|" not in lemma and "<unknown>" not in lemma:
+                            lemmata.append(lemma.lower())
             ## Continue with list of lemmata, but remove undesired leftover words         
             lemmata = ' '.join([word for word in lemmata if word not in stoplist])
             lemmata = re.sub("[ ]{1,4}"," ", lemmata)

From e723daa748b820e7227e30f9aac27bd12a00ff7d Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Tue, 25 Aug 2015 18:02:10 +0200
Subject: [PATCH 15/56] make_lemmatext: bugfix elif

---
 tmw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index f14a7b4..48fd57d 100644
--- a/tmw.py
+++ b/tmw.py
@@ -513,7 +513,7 @@ def make_lemmatext(inpath, outfolder, mode, stoplist):
                             lemmata.append(token.lower())
                         elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())
-                    if mode == "esN":
+                    elif mode == "esN":
                         if "|" in lemma:
                             lemmata.append(token.lower())
                         elif "NC" in pos and "|" not in lemma and "<unknown>" not in lemma:

From 32c6b6eba2a9d23dca25e4636f01390ae983ad60 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Thu, 27 Aug 2015 19:04:04 +0200
Subject: [PATCH 16/56] all deactivated

---
 __pycache__/tmw.cpython-34.pyc | Bin 27799 -> 27799 bytes
 tmw_config.py                  |   8 ++++----
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 80f234cfab053dd7b20755136322a872ecafec74..526672e938f5e42d98e93819dc9fc0123f1d2f43 100644
GIT binary patch
delta 8311
zcma)Bd6Zk#dDqo8l18)dc*e7NJdy1gjE(Ksn6beN9)kyL4;ViTLYCfW9wX_g_i4P0
z0pYZ17SfR9CeT0+w5MrGLvs?2x`aYoa?*1e=#oQ9n({&p{R`I#g`^F%7#jNhzN?Ys
z@i@Ug`ptcJ{qA>v-*Vr3<Q4JAD<b@6v?0=P{cE>A^mO=rzc5rVu$piz@O@!?A-MfY
zBYgXTn{G)+$CPd$kH1F5DM%|HFC@~A(WuL2#>|?zf@7$;;L+}Y$Kx^TPvzss?bPXl
zd8*+0frffD*eeFryTM{`n7YW2y1(wuV|7%F`DHU|x&UniZXlVl9W&;4>l`zUx#bn8
z_Gj&sDF@JPumVFUVd(AjbAkjO(JDHH>?&;urF_kpP@?+0K&ehWRo~wdqhc)T*4fT@
zK2eyJZR$t$<2zV@di07CxIxRAu`;F`lxEgGVUkq#e6A3S$Psku0j%JDg@jC-w8MST
zkqs<A#LgwMrdub?3<^f$Nn56zjFmL4T*^FEE5CXo`Ws?%sionFSF|asY2P%IazTtO
zgzgx)^UPK2w%*6`F9demIdfcwsWsLiBUpb8081JTxlCbhykO6nIcvd`ThLJ1ifr(x
z!E{$~47KBcb$}}Y>jCY64gxor$mh*mN^YiySc~jN18;Q$TH73&ykKWcnaCy0nMAT+
zOSut!Sj||~z@DKueP%%`#}Cldn7kTYHvynS*PpR+rd)|)fE9F+UcTGBUrdzRTE@iY
zQ4Hbp!@<YAu6KOB>zDRPr*;H2U5IQ?S;<1u&gLi1sS~YVX=kQ@ByMPXCXqd!N=&?=
z#@nvm3eLH{q;moXoFHPc7Th|Fh4_4uzHB1z22CfK$eXTz&OGhN4)s`D&s`AO2s!h|
zoxR|QA0@dHQwF4&E}w8r$Q!6i^78F@Q_k2j8_&*X3RXU29;PAxPR$P3=JB_RCea|e
zeB)kusMOkC5XBHxmrp$w)d${0RctM^dlFh6!gKYUTkzNbXaw+~fYq^(KFOO=%f|_W
zV$Dm8F&dH?Gr=bXy2=5x_Oj}#`m4`NI(tp9H`QS0pqMW0>P!hSs-EoHT4Y{^wU{gs
zWOXB?n7p);rsJ?x)kuehU4Jr>$&e|Ah$nUirvUNXAUT*cJya%=w?QP((WujuawOCF
z8r1iDJZn9CblC)vcZuLlSR-DLrm1K3sRz4b;*QdvcW?0D&eVc`xFEvkef0Odm%#PS
z$SHX}l?>n1%?B33+vjt0Is0U8!thheZHDi*J%@}Siq~Cd_^#W%A3)WnGi7+Ec9Eai
z(YJGJMUa?R?!dHu&^#vGkW;Yp8H*ffMZZK{R4{c9t8ew~79sV2eZ5+*ty-^ar&XPR
zI5i4oFM1fu1RR>YAlIUOD_3OflcY{21nitoU3=V4dW^N#So&%VXj&Ao7^iWzbhMAC
zuFZPq6R1$h2=lV+?&~NhWE12Wx6^Fp23Wfh;0D*1w;eaYbTaC5=1Dk-D)pcgGM)te
z>D1QKm`%)?aXl%8lOZ+MZ;9E`BmJKi2X3Wq(+fd%kC9lNRu%Jw#a*JMBbYnv)@O+n
zW+Bf0AF)H*MZd}ocHU7LY}B)eac$EEx>e3p3)afyDo<u*^R_J9Bgm**sjtTy5mEZz
zDw@4P?`m(GkaMNi2G4o-E-Nxi%S$gOa<jB(MmKEb^794usXZz>a%i|R%IE^Q;zTSf
z>BKo$IdOC})Y*~lQKQA4FVIlkAr4<wsY=<&@;F01KhnNtxsA5hZWNK2jx$|UzZ@AB
zQVp#>BO>bY)mz4vs3%I1F`gyEOA#6V1d7oiI^tdFm1j!c(Gl-1hxRo^CNXD`ry!S3
zy*#o2I7VR5jwxi)hB6YydY*ltA0^)UUFf6kiLKg(sBZoIgAYF77|A8;ZF-C~Yf7A;
zb~gmS87H@IgscLnO$%;S-;MPKPVz+OWBq|5mm1crYc?9RI;|X7O)1Xgnw6`JFpY|{
zRIZPT<zQx|ayB_GR==ShT(e@*h>*`Or!YoCx!@#iX*w<u3+KcSOF7j9rPZ{5Q@yoj
zhj_Sj<=U&n&WA`i`3b-SfU^YCl$3Bjz^bW|p2I;Bz2F2S_AgJW-(NRS{0xdPT-Qfp
zxjx#Q#Kdlxs6cirt&4ILv@}4dO6$MjDB+HY<KCbMQpPn%;j3A+P#BAdKCgU&+LFiy
z@ejeFj()r7=bZ7V9b-Q!@<tw@idaxfe}>9DOg@YfA6I~)QQC}!<)f%N4fr&H+nh}}
zbbMM=wuT4Mwg7k>@Fnz$*7jPW<xw}nbH~9<B!>?Cr^grZ!epBQ#Lysd^sA-sjrWTn
z({Wz?;>tZm)^t_L)o}Y&EJ*|ayT}F4QqDl$hy_PblV1T;wjf#HKSJGL1vpb^OA<#5
zX1rE-K~c#;Q1_^Bt#1@xEB*a?C5B)*HRdu;k5jHcamsRB|7kNJ<*2%M!%FeT>ah)5
zVwaVh<Ax>&<~ya{+b~pol8Qtt$n4hX4A;>vl084mjhxV^OG|q`Z>Ef<#EEn*YmG+Y
zIpsQwuD$k|z3!52_mYE2&S$Xm#0m9#8=FO|`ufHZQ7Zk*#y#F*#UV)c7wik<QxF&)
zK!ROb6(6P2f_LB}7evhUa~d9*v7}Qtf)w1)Wu0x-CRm-CgH>I83==*L_$@#YP64Nj
zAnp3dA-KMbos?`|o#??f_HmSco4^PjHl<}cMpWvYKaOLm$<?a7VO2+6?P?(9UgV!l
znH89h6@Kh5FlmIn0`EkA4sF*06yURHZ?5iCFOSsd5`r~3i4!J;hDD`}DY<2<c((MX
zTYl`lQERuVZFV(e9%Vg4<8C<NB<Yqw0bqJBUEjbRBf^fgkT}liFoi++b@lSLk*!7>
zlpWWeE?y?fQp&gT;BpkzaVk~XaLtXPIJ&s9tIs&*H^T5lJdr+{XVB#@35c&1eHbmQ
zs(8)fMh8`u!(VN+cE#M&k<ZgSMm?G{CgG}-BR`M%?*@F4fKs_{qx`o3rtpv&xVBwn
z)TV2{FYZ>MiHQ}unmV}b$|-c&k~EOWm}gXCVmsZjAD>uD_~OJ!n-S5?GxNEmd=Zo0
zsk}Q@-vD!U!w`gS!}BTm9Gd<CP!Cv=BK#)m)&ampH<UK%W<{oCD0@d|5tlMvg;Fl7
zK;+_6tOE^Zth8K3Lt;&OM^#oa0sje8z6{{?zk||00>C-9iAK*$P)TQRMzma^SHUGo
z{($E87>y~qE|N1q7)Le<4aIzVKUH;7TDP-T998>wZt}kBDV^Q9Q^Y}lg$_>c1}3wK
zwCM~arA@cg)HZz=*0Im^-7<NT{4rLnMnSe_))Yb^T{dVS-PLH@v|U6m*`o%o?>>@3
zamm<9ZtI3@X{D`PBBSs7zT@^OqpmE&QX<!*K5ns`zpEa)e%SlEN3ENFK_@xdK|%T2
zeVwJ!GzW86TQ-i{i3Riv5tx;%x}h(t=XZ5Zurq)!Sh{i}tfFeu-$cLwhdjPk(II*e
zlFxfo`|f`pW83CfLH9zA3RSO%BuJhjz~!8-Cq>n?XEcCC4Ry_)!A;C;h9Pv)nF_!E
zm@JrC=dUWOdK)S2?e?~M<y)T8=l8S;?ZA0itp?r%dzZhasrHibE+dBgB(;&?1nh=?
ziqhW@xM7F(%SjmR;tf>)2dW#%#-};8Iy_7FHP=sBUKTF&8j53-^6$o_1WUUS1paCJ
zmW*A#g;lr3x~w+;ox1CWKJVL}(tS64TwDP{QW|L&5(Uf7oj0)bOMv@T@8lKYKDA{s
zwv9ctmLtC4AJ6B~Y(?c%)b;1F>JKrKpY}oT7)}+e<eVd4@u){9M~^LM;x5);Qw_;q
zp!}bJ|H2&M8Jk|f!N0h{S<_0-7PQT>3I7P)YR*md?O!SWRc+rtDng~y{xxEpIlxx)
z7W%#o2njOKMwfy*`H{2<wUM9>DOo*t<4E!6=o6$qvJSu=f>+~{V9DM@7l@~&t&xE0
z+(u{s&R8qy4G@R#O#TSvk_||~2s^}DA)~ZHlG^#7pRnGe?!M{CgE~?%ji?9G$mIyc
z3<Gy4P=O&jS}tow>ga*qE{~5SfzR-e;dG0rw}J9m*`Pji;MH0RGiRB|l;bV8E)y5)
zx&bn3Q&P7oOA+hYV3?#eBK<cc<kMZEuP!ogb#AJ?rylZgCg^MRc8jpLhwqoNxzv91
zABy5TnBac^|4T5n;3xeK%ws<TGv9em4$#6@asc^}{4?}o=H7?Wdw}-=V7Po2Z~<^X
z02)rykN?SE(jOzdxQQBu7xC7R`P4~W<OcxIoQO@}rCFp8-~kAL9}ociYSM?bTKKq`
zUx+}LP_P}9moWZ?gMFe$ojW)xT1riaZWlU-XU6{o12}p%K!#>O3jqa)Fv?77=1l~T
z^?)Y867pAh|2fRa!m+H&9cEL*BKw&H>($$bH&M2`cKXF*n$)y_M<=&@7H8>E#|e3k
zikAxzX!2>2FVF9Tx31_gz#p8Dry3aAN@s61Ydmz#(x0G?+FxtNx8eB@0FY#g)O0!W
z)77$&lAl6XIBcCu>nxh?Zl~#L{0*vh1C&Z{zFM{k&ut$0GpeCLP%#wAeug!V9Z}+$
z$sFc8vk|1(%@Yp2VYIWRu4BJ`iS?*wkE{?W^}{2>qNnuUkv7p`1nKP)-5%*OKz>sv
zW`-I*dX4ub@+?OOqpb9@0jS?OI#7HWa<$Noa*k1tz>}nb_^eSkOII~|NnAs6<~K9F
zO7hF{o2{hk+3U3QylW;ov+QLiA;Trq-eU}lv&u3C#bD_{qfrzeAW;{Z1`yT?l%NmL
zJLhc!Q{?=rUcLwX9VCd=Ymzg+d4A?M>$~dVv3Y$YEYfm>?qxowC}U9F$J)fBYTdC>
zF+_y!QQ2cJ6h9QH!!=C$7?yk*@MQp4BRe29C?h)|`mZ87b#mb_<MiD|PFt(8wYQyT
zY8^ahMwF+>;nFL24%V2!RE-T>%=uVDji8l$Nhd1LTt-?s4B^=pnBw6o#oZ`AaM4W>
zp<rK5U+dU>yD+93a9KxEvq{=xZN~ELc%K+4y%v8=G`$0%wnHc0%t^Xkd@3;(d<k`Z
z>bXR3_exs9jqFd%$^~<lSiR`})z1>0;-hN)%*0x-wEPNNd$vIgXBeW=h46OVH%{f{
z#Oo@5e3H#&YVoz(hm8TtxqsGt)jZ0H4>Of5gMBUY6KTleOSR6M635NVrD0$y*++>_
zCOIO;N?%C+K<JM{m2Jp<w3oNk8FN$gDeRBEz&n)hn4^pOI&XfYANrCp0R8BD8cOe?
zv<4+s*vPL@m|RO;-zQb|s&CAUmDS~~&R341Zh*eVWlmo{igN6%dYKQQ!Loz7NP~mr
z{B0fNU+Q3JV{W&4Vy0XCRDCHuA~u$uP2VeuOs8PVOybMgRWz{tp=yd^Q`OOoV8$`P
zMgsbX#UZuoZP?Lh7Y7A$Dv8ez+6XubtitF^Tn@?awk^g`dX0KB+j-U$e)T<To!F$_
zx9)9Tst)zZdsec0)Iog^3o(GhZqy<Bi<e*0j#8Rg`Iy`+<W`8kdC4~rH<+idO$kT8
zSRKZ6U4SvbIAAki0x$_UMAsaToJR2uz${=6APL++Za$kotzFz1)IJP&0)SVG@@s$+
z;8`^{*I(qk>Qy|y4&Y1D#R)v8&hp~KMz+#k4EjW!IBv_F9H1uzU&?d<hzfEDz#i-3
z1C*Q4uo*xKX)SAS5Q{SmsTG-6!_`r5v@zNd?T_w@cBq4yuE46MP;`gNX4d>aG@ivG

delta 8329
zcma)BdvH|OdEc`yX;-TUk`MwUv;qWPfdm!?%)<f<!ia}>n}=mDSG!kYt=+xrb5|e)
z&eoZ@6W8f<nlmn`Nu4Hj+B9*JraMj3+MPNR&&0{J^^8s0_#_$s(QzD7_mw7XL*lsK
z?>j5)vWV0${5X4_-}%n(dz^ExzA3K0DMJ4isSkhU-nUME>?@&P`h`}(z$(JN!0SSP
zJh*+G4xK)9@Q5W{Tk1d#|Ld@um#JJlZ%w$m!IbT^ow2idSF6e3nf4lw$D`|}bMY}J
zc`0vC=e3_veLvVCdezT^h2UE1BE#xKH4l%5s2KCh7St>QEFsW=MA~ufm|xep_5|jZ
z-Kh3woTM#d=+-+2Yf!?_)AY{`5_m+jXcKZ-adU0b*MtdWEx`?eQb>KFu4};n6=M-y
z<GB4fD}P=tQQxcU-@yVjpjRV-4yN4kRNB@-X=j`ZHc917W%IGH>_?X<U^(~8TXMqA
zYqc-ZGr;mg?5vfsb&a&sTxhuH$fTQ2CG1o-X-`+muRaxdOzbKys6XWuOH`_HUja(F
zDf(t=ABsMF`R=t_9%uPy0(+dS-7o8?HMUSju>J-BFIk^W=O_E~&ZM19&DipOG|X*9
zHsR+2Kr?{V-HOsCz-GWkz`cON7T{7q2Z0V+xtyI%%3bs$wm^2FAqv<=pj%v8FYlyn
zX=M}kxRuB|Qr?GV);t7ZH9(c-7=x)S|A6x3AYcaoTGjq^D!bU0t56QG&TdiFt4;gG
zfnv*oKCx*%#_*AX3o)<u_OI7|>0ERxM^Mwv@b+XXkxw|8+~5z@h34m5nLOZ)uHBxt
zGGj?=@LyDa%RO757wt>97jWhlagDX0YcLk#v!;C+E2o3Do3L`W_D|ZET-l+nwJd&w
zg|2m{#@xN2jUOd>2vY{6JyAOJn2;x^O7ilpIa`i9G84~CrSqv=+CEN0-lAp~r|a=I
zi$+l|7Ww+U@?5dGH7^QbsxA>B7BL){J5ky_2`$f2mx`n3V=kXR9}0*bt2HM%itbSW
zY>YK8F-F%X(zeAX1qK_l>$uhKbX`u;*&ELNtLkp+7K!5Swxkg2)#ny%DKJ$-MocCn
zWF;3eHI|c3!ggIYv<i0e!v2JnPLp-k5>K2oP9ch6LGn2h^kZ%!X;U?CLf05dxrWdC
zP=A-lvxb;QN0+gPyj6lXVvTrS+P0ZhsHfXwVzT%b?F0VPOf9&Kn<8|>NB`gO5@_GJ
z9Fd2)G;;XROlbR5b~5W+%noWlwLGYO4-OsEK@@lG(!O1L_5-Nec1N^#WH-5<9i2P3
z%n1_n%7d8951Pk>u66THE}bHu%3VMZqAn_!x<}O4I`@dMdatv?=(XACmF={w6L_u)
zg@V!2!ZHDuCeO=zP%*?6Y3CxTlL-Mk=TkRMxRZXyS}QDl9R@Vc3RsBKI7hn1N0irQ
zy|YJ|qmmBsvK#?wC{AQ7a+brG%`}j?lVjxu?aMi?4ltc`UDmz`CsC#zltRXHiT-Od
z*3*!&ChfSHl;TX6>g!60CyQ6ReoGumQnwQ`LH3y8SdCE?^M%E&($YBQ4(YlKk;2Z$
zIV4O_ow19%RJObAp}E1jjzx?cn|>7C=FZdz*39H8O_s{!9GRaWPvwIB9&cDg=x?)V
z@&>)Dye&dLReY=a9q-<GMP_Mv>G?!%lD|na>QE}1o656KjjBk`u@!Tp^dgWePQ*$j
z+&ITCH;#^4J<-#?S}$;>@-)<Rh{Ko7RVAH7X`EIs_q6uTx6#(hjUqVHaZX%Tzv@{b
zo>9wJT^14b+^WreRqBaQu#9KO@RCG^&q6UlC`N?6%e?ZrqIY$VclYm6#R!?iWQsfm
zxpeB~kyim}0_>QgC~YV$VXQY;IewJja%2{L)T6PLTL-9=`p@^@f1kr9mr_5YpRp!G
ziQh-hTKLU4+<t8;4{FnbL+aI7N8nkW=tit7@I065d)GGUdZW%%7ObYMXSjFSN*$t6
zah6K^s91_-sbtn6$HnT`>gnF4!#Yepzm(PJ`clD7IMQ}OESwYnsOD57lvdIHg8KK~
z9paVZx;1x;oqtWj$u9vu3;1J#6O^ZLiooH43BgL@aBvyDh5*>Vd{+J8+GydcD8g{H
zkHjLkHJz83ScixTWVbSPOWJ~#dI&YwdL2jEPW8lbZ%_m&3G1ft)g%^B7z>L|ul##z
zO9aq^{vgXivbdZ6xtBd^N8gVNyb<gn7Btd7M`a!+UqOkFD?rg`!j6UHtEl=U;Lizk
zQ^s=X_>8D*4Mnv50pRZd-$1WOWv?nNuhwCnI}T>PjqM<CdVFCoOtvvV3=I-TuNPnI
z?-D_#<0sTl*9{d|(`6->!|in}Nd#ba1>EzLGmtl8!K2h9UkA)>L9)PapsvbhS`x(3
zydAF;{yhkYMLp_k>l?&(i{D(Y#Bx|pg}Ho!evW9rHJx&`|B`J<xn5lvSSG%st_^IC
z-BIoq%woua(lzyqf#rpNq$1G@GV2<X^tv*NcJ}-%H|K;k6VjQ=*-71KU6`n3t!^Nm
zQ?4`X+ADvv*R9%i2RWF;R2n<CE~rm#Xc7(TiyL~x+r>9F40#K44neZNU|%5Lgg{vU
z7-m!fj?H+Zcij{*?dOC%Jf4zn{uHuuZOSIw45O#Xv}0|CgJoU(2_{5-CO;1-uv4JZ
zMUd7$atPX&b`p}!3o+E#*cVWGfj|e3+cHIl9+Bo65XZ4p<Z5N!u&N^^R|6^cBL9qH
zCQh4K;iEsoq+#|7yc78%I_wAhE#R-v-c;VHSstm;Z3Js{tqV4VhFPV=l-#^U{G|B8
z<{x+u810s|{UuC5>?`RR8rLDqP0%fY0>FvAbfW`zbeJ7$-Wua{n8Kj^p1QuZXNzuu
zvg5|n#Y<$VrhGFGE=5rjr_L1zwjB@!xP0T|%^A~0F$7P<6PcrV7hQf#KzyCkhhD&{
zidQVI7gALz{8d&PSIj+K`M=mp9h%cN;mV{d{|56<170SeRPH9q{{!d+tWnYXTE%&_
z@xIr^w5lB(TxzPRO|vVf(B(+dfJIX(Yj8VVyPq3eL-_5%o)#T8&Er$qgsde+%Z&2w
zSoHwRRfix5U67}e^1rZD2<`Lvo<Bt^U+cg{T{~fC>5`VG*zfFUE8tSbt5C|7$`iTx
z6l*|(@zjLuqam?Iv!gPrqL?y*1s4K%eSuOK0M6+~8a*XJC6m4BNU6fCvI;Y_2~W(U
z8<KQgBxisyj%*Sdiuufb%IaLB`gV4RlWPCYjoxlid}8NL5eETg7IJbI9nM%2wi``I
zhi<9Kt>$`cVxRUM89peNL$q=fWNV%ah1!pSbXTKo({{TcT~tNyZ$EVf#j3HJP}fbM
zYaN-INM)_Gc>?f_In#94FfvpVxeoR56wCRH`knh%h_!0XiEo=6$CwdxZ@d@BZjR+8
z!^Ua5v4FX=!>VN23w=|)yt{3XeF0oS^}-D>in2+6k!~?=nEH8q&0?WgjEKBWsMbCI
z&}Tv!wT#fUkb?rNiI;m8GhZXX<(zIO&#T6v)d4K3)wZGTjZEw+cAMZZoVPRXU(T)S
zZJ?~T-P`Pyn?&(<hgyX3-yETg8h8`zTz*Q^?<L!vM-2I82(%8T`gxS@M(MvHPROPG
zat=nj7(jJDfo>oZKf#&R@$+<D(|$_wG6-4QD2!3YzXx{`EUm+c`zM?uX{U4vD{qT+
zS#JKidgOsl?{-mq{DCLM67Z3-NGEUQQ%?5AS%}q)^*^IJhL?y>tIfl)t$dzFj`)nf
zKbM_gBPwO0+MmO!iy^R|_CZe!)A>|lk{%a?dUkmA=zJF5i8a_#YvneSHvqO^j?lP6
zPjYZCI(XhrO`OjglZ`^QW$0FMZmM(tGVzAmzJIkiQcUje75&TsHkwW7yB%<ZAl87G
zPHtqvhT2F_mz1ntI?z+N7kx$mhXL#$cr`u=mTV&$AfA!7LIRRXH$VgM#YRcEbek)a
zKTN4)Ju)!Dg<=goq0k%&YUe-5`|CXF+`-QsHGzt0L_Ls2&PO0-7`Q`m3Jfv9k_h2p
z8r-6pGlx1By$4C45f2$oyNGz}DV>!k)yEEfzmmetStc^2c=N5x#KpQgKt@eZDm4`>
z#jS!3hDll>($+iaYY~~ZdS|3{aV@=3m_*Rm>}?kz?_$1Q%G1Tx!+#_S_hEuTzz%|u
z89(VaI)(j2$G`ECjMBoXL=@?f{2^5u=6)Kb`!Q%YfJ5$1ly(6=gH|eda7*Sta)^HF
z(Cj8E7+%1mA@hk-q6aV@G$&#c><1hG>;nt~4gwAVem&_!hDLnc%r8!Qswvn?^4-q(
zZyoIvZR(w)tHs%3<FV7ir0~r6^;nsM=Sh@K1I`jqd^n176caLUj^XEVz$rl0%4Obj
z0A=A=RyNft@;BDcvY$!Nr+#*Pqj(5$V$_hD7Vwzdmd~ObCuAQr-6=rOAPT80?EL&b
zdUEN)Q{YdSx_ZiiVYK+fNxQ;BS1iru!{QjmFM(vQ0U*f;sp(GSr<-LyDgO*z;jm3A
zZIWoZx?Q51@ein42Pl)?bdMYro(Ji18oLXZ4Mnn_Va;Plly|1n$IbmYNVD4)D4%zo
zjBV<SL02nWy?AP=u+%@DT0!rXzc|$*7V029KGEfo?gHdDO<tzeiZk21%|e|!(;cZa
z0QK8vq6PT3*aF&7mM-oHJP8_z&mA@A>83_+#1$lGelyd{B)>Dip_#I0uQ4R&T{Fp<
zWeeGYqUd^?Gaov;LVQT2&UTAj@#(V-qJRtaOk)&bEl>G*l%AZoMn}l`mA!m|b|Z7x
zQk@|=Q-<efezU&I9-f_hncvm=rS)_N^EpWwgK8gb5m(jP(beKHBJ^UF8U0q_1CcsZ
z!K8|;2fGGq<Rg$8l#y|W{_BVik>D`n^d(20NUh9NUUt3;1DJL2oEcG`B4zQ-hr26G
zV5GtZZsmNezCzHse90s#@7_ULDIq-D0#iIxrnrvalNVhS5eoKY&8?2j_Z-HI0q*EX
zDmF=btjt)x9`6+S;#=`+qLD+wFm&QgU!=>$Q&wNF8Ph(hUa~scC#k5z`>peG#y(H1
zo^}7~Kdd&fTdf}-TmxTTdciia>Q;YtGqfGojp>{m{AZOL8)kDEnSFKppfO-R_s^Oy
zn@1_}i9*Y48SHDBpGZSy-_+U;T4Q$lc3T-qbW-AzPV|T?#g8YxBh0rU-U}Y!<w=5f
z)n$8Qq@1GPNBNGudRAXI?7O<4FF6gMe;)ZbN*pXNp~MR7`F|89XQ=DXNmU){%j10|
zb$QG4m13w4(8sv+C3!~;a_lU7nGc}Bl7opt@>vabm-4sEkiXi&P)FUaJ~iGhex#n6
z=n)?-zBqA36qruIq@BQrv)`bBrDv28icMullgEr#03RlxZ&(~s%iiV~npljvGnDHf
z8lUR`T3x_xE@v@G)iN7H=@s=b+j++Rl~28vS}Q)Hewn(`RILv6(MOlDdo)4)Ru1ck
zOZFEpz0t0uG;{9ra+8oBh4`DQK7mjep^r_LYo5;5LT>)3ya}bvfL(wCfCmBR=$hk^
z2^1#*PXe9-JVT%Z*{MwKl5uhT6#5F9@Qf<21HKD*8}Jh~IoVa<sL&1ltOf8T>DC0E
zQ)hV+t)~G<Y~(`(_||TXIWj9VsG{IYasdz(<YNG^QQmrh@*}8w5<m)RuDn$kBT`G#
gvHA@WZ=@m88d(<E7im>T(~AN<jWv;-DwFR0KlZrdl>h($

diff --git a/tmw_config.py b/tmw_config.py
index 60ec739..6594241 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -48,10 +48,10 @@
 
 ### pretokenize
 ### Perform some preliminary tokenization.
-inpath = wdir + "2_test/*.txt"
+inpath = wdir + "2_segs/*.txt"
 substitutionsFile = "./extras/fr_pretokenize_subs.csv"
-outfolder = wdir + "3_test/"
-tmw.pretokenize(inpath, substitutionsFile, outfolder)
+outfolder = wdir + "3_segs/"
+#tmw.pretokenize(inpath, substitutionsFile, outfolder)
 
 ### call_treetagger
 ### Perform lemmatization and POS tagging.
@@ -205,7 +205,7 @@
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
-topics = ["48","67","199"] # list of one or several topics
+topics = ["56"] # list of one or several topics
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 

From 9267827bacaa3939d472af807bb1f13979f81626 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 10:59:27 +0200
Subject: [PATCH 17/56] topItems: save with fix number of digits

---
 __pycache__/tmw.cpython-34.pyc | Bin 27799 -> 27784 bytes
 tmw.py                         |   6 +++---
 tmw_config.py                  |  11 ++++++-----
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 526672e938f5e42d98e93819dc9fc0123f1d2f43..8167bdc2aec72c92aeb5b9094fd02337acb34840 100644
GIT binary patch
delta 7642
zcma)B32a=)d497OxyzdrNgb59lub!1o~9Hjk}T?=D2Y0#i`G@CtEb(^)r!k~n|Vu#
zq9|{iz(Jg(Et+ZK)JmNua9h_7+O*pQXzZc}5+F`cH$dVP@g#wRqN%GkXxh}hoD}Z=
z{j=gOB`UB4eVR9KX8wQv`Tk?xKK+LH=o=#ZueEiNu~k2Q>^J``{L6sQDimBt*cbec
z(4P!FI-tX6j~+R0OV5!yn8W`%;^k#J7th-%PuH8alX0e<Y~IsqDs*mnm1SAFW;PeU
z>?UvIo!Pt&FsdJhI>bixzo9~?mxjoQdc5l5g)lW^0oj6{HGq`_I+)0~o)Zh`D$hw_
zZP|tHz_gomWDLVLE<g`jn0l7}c_9KzG>bMN*A%x_C;g3BP*xM%5om?gi#451L)47b
z>MGak&)NAaa;5q~P5&-dpdO<d2y`gzO{O!B4oPR)z2=Zs?o2ixi^zTqi2~O0xV$Y>
zPF|~nwcSIkKh)0J(~ho^PKF!x*Ik+PGU<eq&L*ANO8wR6YCj<M6r1YK_{2(;Za7$g
zQQi@KbJf2Vy?FDH-fg#7|GD5kH|zAv8hRURm9>z+3BV%jvYGr;f8L#PvgtWT?!}9R
zt;j|^HUXLeZ0==fZ2@crybthxKw%qj6`+GahwNO=$tL9<dWbd2PP~W$b`a<mkL2au
zj3e!A!kM%ac~{Dvc+HlFB5Vej(i~$bo#h7@PYwfi0bo@f$fUC?9JvndAlqz_sa|b7
zBn}r_n)<}LLCoPJ1s7sI?du=Z0qI`%7R{J!{)-TkrAI|u_HM(@wLjrq!>QWDCf=v6
zO1RTGD9E<-Pun>ia=e6{b97+Jx#7tUb-QInKdV*k&0O~OgBSs{<WbgDI;qmJ#)O=J
zZawPnTL#3XV)L@RC`72Ibbzs1b9M_SL0Xe4@)9PMkBpC}d~R$^5Hwb8PT&HDM*;8+
zwwS~mU6;r>Hd`3XGwxDT;V-JOZAc`FBW+0`2G#TJ+X~F6u+fSYfNJFFGnaGHO*o#%
z{!l?8mJ>+WnG88mH65p$fhnVy7NXFQqKAcrq(j}Tv#v5H?-|-1ME?fMY9oS>{iki>
zSf!rVmJf@m;+K~X1<vwe!-?M!;X8i%|BjD9`zPhNJjSi@V@K!0kIrPLvhMZluny3h
z)7pP}^rQ}<xo3~|@7Z?<K;4cvu6^Sp6byE)-W^+z8S}{_SS$eI#e}Z*@@_7ZrYOfl
zderl)CqzX3-|7xyvSwp4_LTB=5j!eu1;$Am%LYAKEH5{sW0X5G?sc*hA1yqM&(DOx
z)$|x!R^e*{n9y)np+cPIxzaOUT9(ZwWuIOkg$}b=j=@zFsi$oUS7TVsyda08(8e#c
zKj(Tn$U#KcWS#49(=v%*1S0nh`p-w~UeS3=JXw6Y^JC&@lDa15LL8qWu_|L0K6X~^
z9*az3ov^N%CjL12I0xVq^%*vAP}#1w^$U}A4XYD3{&f|@7S4;5c(H4fc&hkj*IT~*
zODe(gd9kIGXp|qJ1$8)`&CTRFCPh_k_sNGAX6bfNAx<<%C%ibve=m-KT0POde7$aR
zXYw@F42UCAEOaH^L}{K@uXHbKyifIi=-w$lrPi*yDQeZT>$Wa((I2nt63-TW>$`m;
zpC%2*iHcKcaz+Yn9P*xq9Wo`|DW8yP83`x3!`lp?1+SJ_j8Ru(4{t}xOaJ@3@4m|k
zgj?zVq{mpJIj-Nu$ZABQIKpFfIuDAGz)|&TtRwggFLWo?8GMdgbsKvdb)7LxItzAD
z{vX@8=3yPCS#i#C+E2|=Tu&#nF2x2mq*fo_xN5VG<Q%FfIKS(<Qo~EQ((!QUusA;~
zI&ZykLVUeA(DR7c{Z-OK{t4g@0Dnj@LCKl(BF9!fMYadW(wi7H3cyz7GwSzyqXS<-
z6TYJTqya^S*}TNUI!xzBt|HTmbM?=9H;dPb5BCK{f#pMVEM$Z}i?v<@d>z0B4N?S7
zIkB*O6<wbN{5gScoVGnOu#v#bE8^Yn0saQ?HH@mQ993xtda0i8UngEKe!IU@gqRMW
zQEv~7UgYhUcV12)uVX<v06yKIcZG5rc`B<FrC0JhfCU95tMgs-Rl3`!2M5LXi{BVj
zVl8-CLB`L}<G2pkvuRHUZaB7-gKB<gjrg9rJ+w9UK&@J^ia8IoZmVAmtu6c=HR&Xw
zk*+eul_#Tk&+(15;@YDsCEb~vlhh6NwN%O541wy2MKc-pX;XidpE=%CYzk63D3>K>
zGIrjH+t<`*H*FC=DSmg;sIRc#Xp;SqeN4UqRp40?m6|ca2({*X(GBm2m=3VW$Yff2
z`7@~dv}v1qCW4WsmW#C+=9I1QM_BMpzzcu^I~$!EvXJ(ZqiKJ}O-SZ4a)crDi)ek3
zK!;8_GEELtD@~yp$AMH-6=goKk!wpX3@!Jg{)u8GF3i}}2mT3*M%Wv9C-Nl>I0X1>
zz+d8hV|k}WJhJ*?;#}N1E`DD8;?^Je4jV(34a+XfQH@Os<La>OC8*>~XHygV>3RTy
z=m^KAynUG~FIVQ}57g`1ySsD?>>4-0B3@!dC2NZ-cO*oCO9)8N=Gf?V5vHo$Ofc5|
zDYX7)0wUkS7IhQvK3*Y2w^COr=2v!`sKp~a`9Ba<gVz~{@ZqE<{|f8R0$w4YYU3TW
z{~NFoKqczVx5TWf9v;41vmO~fK^L-ThkFRWHQe2zBNXXf8K0cVCS)~PO-?J{u5}0C
zU^)zGbak3Z%KyN6VZ5(L>z?HI$LQmN9~{!vDJM(id!DkIyQ{5$a-F46DUi++DfmdL
zJo}n6nNG<*ni6X;YM0p%#gerU*a~3z0<8!DT+t0Qdqz5(KiNZSOC3fEifh>>teB<i
zlXQ_F_eEMml>_U<{ANF8oAs!^-6wrrqWHw_-6D?joonS1Bsw;2ryMVukS^U^lH1Ld
z$)pDDKR$Luu7y(NB*Ff-kSTNk6REnV4bw(Dp;uHz_b%T*kLEpdx1g_!Kv%mmol0lz
zjJX#2FT1l;*cu%w`KTV>yIJ(Ao{4Xn<Zc28jBLUSXEM%gCC|gLda<Cnw!s%<IZS**
zy)x3)%FYU3T_opUM%McVgj%-mZ~IJ!paJ7_f8c1$d*;QS!Mxi9xP4Kv`<!YRT_41f
zTJ0F^T2z>%MmPAjh~jULwg?j&IqsVsuzdFBw`uNva-t<<$UlX&0oeZqv>rk0zaT5@
zkvDSPKq?NQyPrVUldny1HE`+*mG(M7g~>D$!Foy#R6*`TJqyt~lBDK@dpzTo3igsy
z9Xjxc?@>{_b>K;{5-g!A$IaXMw41$i9x60L))!RA*h=wvwRLRckUu{Wp9}QovMDBW
zsY23$99CWdr2?cYT@h#V>BJOW4uyJVZ2g6$JZ&}BW{;_sJJ8+)*oHO2lP-NUp~OR9
z7_K<!)Rnw(ohWo#gJG<HX%njt?H9+2$wM1OKOY~HcMC>63OGg(t4AIrM@~5~3aRLk
z3DwJoyLY}HV~zsG0qmkIhwaQdy$>&-n9;LR_L?ID;*=Uc^3@Y2cJo2gNK`&c9Wis@
z1-uXN8Jbi<M`rOsE!x+)qeEgsed6d3E9u1iVNy|vvXlfS5hUm!`37Bbt11|Ss&tG<
z;xS<=gl}JJcUYvI5`lVaym!?Z8j+}`F8o1q579rQc5k!lKel&TSRWQvd$QU}hN&e)
z-fvwETFHoou9}6ekTvhSr>n}s*lO#;mX)lvRwV1JWIYdQpjkdKPpdRqQ+|u8PYeuO
z$tLx)V^Ptp8jrW0e%s2|SXb+<`6?(NvKCs&7OS1)v?rHYR~xK(KVG#bTdnq6)x1Wd
zHBVZ#S@WR<#hM^rB!kweSo>VJ`qIHR^|9l9g`F_WFklzK_*{S-E;>_3`;Jb2?PVDy
zgQgSPqsCJ{Pu+4a29E$ZukJ=`58w-UOAQM*I{qU^>2WTUAB$hU(KlDscXRvTrsS<k
z|0<L&(p=E&$0~3#aw~!{z#+f^z(K%az!AVwE@$!@-m>3?4WF5_OmwbW7WqPZ>FRl(
z(|z+q@5*k`LVq6;we(Ff;;*M~i;bd_dgR$+!^yLvQ1-?_$mG;Iffl$V&k|4$IgWOe
z%whcD1RhTUP6HMxV;Ls1s+?X~Usj(3NQGn4dRqPCseR%clF0?*oV1{2DpWq1ax#&9
z^y)!bgeFmfMZh&RHp{CJrA|CHAkG(`c+9DY`xRo?<yo}x-IdVd+W<%%Clfr#jp^c*
zPs%T0DAIr_eofg+SEL(sNBj|W>mb#>TQ<w{!a7YKEU}YF+2tk2H@50Q3=UIXIAtyZ
zAzIzJMisc{PCKU01fhG4sa`s>OW4I<oM{oQIz%5r=z2zVJH==!EfJ*EL+5t*nuWS_
zuB*0^^y)XxMGJ_iu_oF}mM*VIWC@yxkKa{S96L|n6DugqY-V1UDg8h;vrm+>NR?qT
z@0TgU+-YThiNbvMad`CnsK^#Se!gB5a9N#eh$2DeskDwxx^jDToPu<j!#3?hX0e$X
zLt3T;ugrX98<(>QCz2t|VP)>MDaq3v$eQsa6&7mwg&o2rs;*Ge7rt3|Pjn4e@TQ{Z
z!A`*uc>%IP5P1<w|0)VY83g(`ean{<Y4zoc4~bmyjf-6sj2^F`^kNFf>MAr@sF+MO
z^V0{YBXiJ>sm+WHml>;T@v8>i)R3ckvgTR^?#f@soJ)WueO^5m-!5eF&G>E6z+rm~
z`|@S3(?#GZyD!v?nOD@ycE|EGHFe~WeMQbWSBQ;wL$&%hyG@L!!O7tcu&ead)8xrY
zm$xsUWnPcp{Y3O$L-SHmnXOsoaj6=LV#iEj4s}c?R48{p0JR;lFFTq0Nj;tz5p%^)
zCcY;cD>TMpkE!d<w%T$ry^{(cXZ>BX+;KK^!YuL{fc{zXCR&`ouA{|v>H1Z6!K7LB
zTa!mi#_^Tcp<hSfN;*itm}G9q2il7hSlRjCgS|?AABFI{_UbAXayOv&y?zfhmaETA
zE?2*kS|>hOd?__A3e1&I(n;W#kH=|R={v+YrG;_~z<%T$;DZG8a|LIwav<OsVB!u2
z;k@Jk6yq}-!s7=3_eBHsmGr3ikoskMzVY5ItB+n?)5_oRc#9Q>s$RO<Rrm<47u#~r
z*KUf^^pk<@neRuv(13rz+k)0sz#hP1z-hoGKmsrYcoOgw;8O%Tn4Ou<-7vv~KjraP
zyw~u)@pa($0Y3r!Tun`N7C7j3VW+(SzSAtO$mJ=#SX>RrbJT|)Htox<%*rW@psYr}
j5dg9P7r?P&@lnVR;RRKBmL!8TS6&d<U_EM8X8!*HR1Xzo

delta 7611
zcma)BdvH|OdB11hE3KpzLJzP&y8=mGkOT${0c;5a0z*L9ARD-bte4%Z)uPqj?YS!+
zLa<QRO`3F?$4Q)N-Lcb|*73ylbka@Q$)l}1`J;*ZOzSk`8)xEnI&o<pZra$bowV-v
z`_7_Wf$-QKemeKu^Z3s9{l3S!`;k|~XI~NFpGKM*ns)!-p^twt{7auORlwg*IOP8a
zVSXxb&jvGm;_#sdQqr-c>o4N3+2E99u9z&PvX0rT+g9G1u?i)}RMUZzU4Fwb%*G4F
z<QY42v1DB+xjvw&UJZ1MxOzKK4y>UeGNi5qPoD}>GvSj_^mGE+2wZ<UZ#!1P=LQ`s
zi?!uSbo*xPj3r|j7GHo~v@rDq{c{2YhFB&#gzT(r3uU}fEGW@^U7!_I&o%b6Ca9T+
zxIx<)E~ZLTvQ2%jad?ykXvC-pfg8v<lexU*2BbA(pR-6Rd$v$YG{`;-Spm3%$CXkt
zYtaeskMwO~`5|^8HDkF!Y30!{o6p-a<K%N`D_6)^7wY9#Pe*=5Y^}659ruVfm20_w
z0!n#D49$feiJiW5*WmWcEdQK;uU)W)WtiS3+GPXwUkAXJW>X<wnjS9M(^erjXUT1N
zv2YaGWT@9pc9w_GI}8{EYyfNoEC;j`xPer$XcaPYD?KDyWfxxXQ8(djn?sA2?7SsY
zg|s!9N|$UYH)9N|8M~S|GxVly7RVL&0h$_;TQKx)0Ced3^0|T~SEK1?1ufFcH=_r|
zNTscHNc0b23fm8cPk3C<@J81s?eor}8I#Mt6kulbs#x3Z?O>JbO*`jc!W2=5&*=u!
z_Dm51vaP%`siGUOoOG&axxQ)Zq9fbYlWi-8A(k0(X3seLzycpyau=5LODkKoVM56L
z)Fpe>zqD--M=Q&gmqa;4JymNZBHB<b7{v1UAc{PS=bC+5@z?}t2Cy>0#zaV4;0Suz
z3~&V2n8X~jDV?`ctYOeh#_+b0RZ!JKv}2Q)sO;&;2r;0Z?c82w`h>MqEC57v8|ZXJ
zX{Rm6Ve;1rgMD0II+f3p$Ark7b{?vPG;V<6K$adB7LpG`0<gvmYRfy4S+^7YJ%-U?
zu>NOK#Ibrg&vlK6M=D?L+T=UI#)ikfBf{6c^!K`l!1Yebak-mXrg!|v;kocVvxVt`
zeZDYa`smHWruX5oqh<ij`|dNn_w79hpl-_<H$CHf$n!^6?Mf`{nefO_Eb9aD62c8R
zCA*l<QAn$5?%7onBBcIzRkv2zGOaT9l-h9-DC%?t3(I3z*6+|_CAl6Q+qolepC?VR
zY2j&XKOOTr=rOUp&e!@eq2-1^<s{9srK7#Hwl}MkmAgO+GtAp^zztGZo=H(`x`$S)
zzJR}(etzM4i?-wXIbxWN1?xQAv_>K*fykYrKOSQDik{12s`A;MkBY+&QP;#=fP+y(
zBB<5EW@piE(#UbF6LuSCh(A^-$<Y>(TC=%F72+KO3zN-87AL9w%fzq+dyx>&#W#!T
z$`9hNd-g3U1n<v_EhR)$=4e4ToGTP(OB|C{s7T+@H4C%MPEa99G{~i$BnNvZiGilN
z*4H&)w%W5LnyLpR5h)hBGIqK;&r~n=Esx$J`oH$wE2Qe}za$#eQ~lc(x#(;CadD~Q
z8R+xuaY({(qT+OpoRNZ?fC@^0Qv@c-PRWBr%S$-Pb&e`NwD=Go!WeZnv33XYTkdD?
zzWXj`3vT7!q{l>5o9i6CcSDFoNeYi<C|3f-Xv6L5n~84!d0yyxqQ_t6R@1t{sM(|y
zlPiE-l<W_zTfNo{)2t**<$9@E4ePl~!KT>2iZs>7*4?qiY$#e(K5#NOo2m^bZA;5>
z$>`8HKP<AhyKX`}QQ5HmF0t$5B!~Pk;40u6!31SuPJ*n38uvJsrZEbBCvjAHUVUyb
zw&6F?gs-?>lEC%SIV2W#!(=~l6`5O{nBN%OBA%|S9rBAZ?|+rXCIWiXOVpN61D*h|
zLj4qhvsNN3e+OL`0UsrBqcbUo6s&h(=6wwB<^WFtK7&z_`cd_IpqJ{|;ePQ<<?n`j
zM1biqtA4&=>@*+0cJf*Rc?JuT0bpGUMN^d9$WvJ?#BKTefCT|1i}MHQt9Q32Hf|JO
zsQl$dC3?ZjIx^1E<GAZfUC24E@1m8GazM>*S}i`Wp4_x8aa*Z4I%<1hy$k9Wn|jO7
zQj;tRiQJ&Bq#PX_IKHt|s6EVJR@$>gD`U2#&Sk6K#+)&miAA${_Gw*g)t))t)Ex@D
zbW<)%&*oDlE15c{es}X$QK|gn<}pus!O<l9A^Vtoj;2U>mJ9;eBR@c`IZy1)cSOSV
zVN17RGAEtVaTIx`ZtHrcz7*)t%&BSNSFzw10lx+)!`WapWFgl}j^=vvc3Lu*J1~L?
z{Ww~`PGAO(Su$riW<=_$G6{p!RTVWpu#zLLc40`l5A_dI;SMavs^0&{ShRt?kxwE&
zfp_-;6yUe;K3Y4e-X2+fF>y|B8y8=zJiqPxo)2n8))dPw%sOP1Lb+}@<)o?P%oVZ|
z`{*tJf|v~)n@XuOTzT2DBwtW3@92x0ZP05{2a9Bt5%sLCtlXIvWkg3dnyby0@R?yM
z(e(ta?XO_SpAZoF7LI7P^68Uxo0#p?RgL-e-8yRVNJqX%>zIvrowo?rW*qrRtp6zB
z(*#s)`~}*74q!e~iF)tr;!zbE8M#rj9vXR&ZeLH0tS5YFq_53vph$1a<m7B2EnmXS
z)5<g2zaI|fhOrynlV&sW>v;87fJVSg$?q}r4FbR+H<Y!g<|gu+${y_~qg>}*s1(SR
zh!kv+pp!ahP3E$4Elo+ZXt8T-VB-7@mV6n&`+ozizXpISZVSzxl@{kucFIV#L+`R0
z1HVIS8)kEcE)wLvNNcEapq+$QpQom?UNyAqsOMFqa&6Zxk%WEc+PMUY9hgaFEhm<i
zHr-q@JM{fXrv}&iz=1>Z``D|NB-sC0N5~XZ+0sO+?&-jEP<`J~vE5z!GMIML+=Xh(
z4cRi6%@tC4eFyZOu`igxstomfR9ALy@%+$Gt0umsv%8KU(6Wv%*QhnYsm;}{2fI26
zzrGX08DuR^{I+^=Pe(g@E1Y%FMnBp!;JI$7<$M2ji2aR&G+iJ#9M^&la=W}hfZG>U
zyJgifHsHsSrrJ3cUsRc7$L{pJVN^aj)+Tgt<TdpPcz<^1Kc%_*$cvWPLw*FiT?cT8
z`w?1yM&O1Waz`#4NW>qb``^&rOwKmJ<-oBis_k7LRVFhCPXm-4sD#{$f)-o54H;@q
z*bn6GYQ<jls)PIQ^1NwOF7JO_tOQM{%&|+UQqC@1H?hsP0asM_ftBL2+IC>wc4%F1
znVj<t7YkV?b+uA*eMM~WT`c4yk?E><p_EHcJMtAnsRIM2mhw0kTd?PZ<j>Lm3E)4m
zMtIVuFC~<K=(EC<mCH_*v<tCEeGkL<7}c|M9NZ^DmD2~;iD9-PllTpcdJ_;5<SETA
z<wJ7ltOebWhz@B|effiZ_x=oH0yIVj0qm;0538GH`YDD$FfAu<%h0Pj4{Z<;wg1pp
zAJj3N?M>^T0$R$_Oa&O3LO4vRQ^wL%J$iVPXi|?Kezl%eOe5wNl`2c^z+A!(uAf}Q
zk~FMF9Hh1b#1)5mQ@8nlK^C?`-RZDMCnW^++W4S|s-`0=#HboLvU`u$cuW|bnUK+0
z62`pHPhsk5FfxtwBt*$;ob?--h=H!Ag|2`x@6lb#KG8nVY^=z%7?~({wbCe$n5S9G
zjA<{TBo+x9nKt!LM`G#aM(5?AQED{KMvSvf#(WUFIst*6=IWy;7-(hMwO~~-J7{Kz
z7i%%*Lq?|4m=7#S6g4RFi;UkmmgrKCJ}_2(3!3<E!2b}8&-sY;u~|gI*yK09E@Py@
zTsnrjMt%vS*i$Z}^$Wl+0q`jKHsBq=6#z()rGNaN{6G3*hHqG-&ZWxup2Rfnpds>I
z0Q`zPiNHgvNH4$u2!IdZ2fVkJg|&T|p-gLJpj()~9G$ng*jFDMT)9oO(f?~iggzTK
zc$?|-VN`Tck8J&;qbEeU=6e5*B{-ipL8>UAm4FgQ7;W}0CVvAS8v!kVo4l~*fS<sM
zEG|pG6i?N6kL?x9RQJS7r?e;1f`%?j*&5gAQ6~`jIyG;XIq=G>J#%TN-2Ko7(N>WU
zS#`0#ZkrEdSKeAXY8#$^4}e|ANd>p_S-NeNGV(Vt6ro>Nyt>w<d(lO@82%x3yM8Ko
zw{DSb!g$z_|3N*JQWpGMaxi19LVBth^Z8@C`VY|R);WhhVc9d5?&Az`i_@#;kB^EK
zl^-8(6YXYzzJAc1jEZ#%$W%{KL^jp(lRG`%Hq?VB;}KSEmFViq$yoViNYqMaDL7^$
zl2@80;=^`u%1V{!BVrwyna9lN8kuj)WA=tx$_Q$?`MgYK=FZDZErwetyww~N@yfi}
zEXr3&$hnpnl46PK>DZ(#cf`giBG)*)f@~<`1c^pXTIMUS%zR}X*HQ<Lt<Au~EJe_6
zq{w83iU}1ywNvyGRadAFo%&Y!ebF^s$D6uB2it-pvK{h*AhH8OzZZpR5J&YSeXW-h
zIrYrxHDXQWrPJ{`MvvD~dNG9)O?8qiR7tvm`O0m?k!v6wQ=1tZt})h);CBnUo*^Nv
zFX+1zdtN8zbODz1dG(p(4$)Wn`{a|N<t^-c56<PupQq}7E;SV3u)IpWkm~MQO^dh<
z2UAmW&YB`N-U!d?pHm&;11df_(hYW1KXvLj6UUMaz3S+h6U^)J8{dWAZ%|$;9<w%U
zJg$~LOf04_hdQPc%8?sieL4=M&RF?d<ML>Fj~J>vmi~^2)=3Q6?^jdS_Q-Q+vXj1L
zs28k(8)|W_JA0rOnE=o~`oROOx6xXM7V9PcUV6dM_3G;6;i_^xwRNf~%=OdHB>9WC
zPhy<9YR>;Y^i}oyMf&QlR&9fj`xd_+O}DGhO@`Ft*?zIP@_cq)l$k1ljFrZ(9e2^J
z>dz44loe_rpbyIp0X7rRPZXTIYH@%=fDSuQ1y_&wB&|8jF?lU!-x3Ve=W}D?ZuOJg
zeDvlMtBYq>w_n8~e8M<d487|4v+?qF2(tC2FWhdRNIw&#9Q_S%4W@Jgh5*BWt$-20
z0RaE1GJ)13fGNN<KoYqA!t6}(qK+k((E9}7X+RVo%;XmU6~Ncjsp+0FS0}IH`G)|$
z#w@PBWdwRyTnxx%)MwI9fT=UKEXWu}P*S6V0}y#-FMvbGVpGVw@nS206tb+Up*S{Y
K=v9&Y{Qm=)68yIS

diff --git a/tmw.py b/tmw.py
index f84e357..9dc1e89 100644
--- a/tmw.py
+++ b/tmw.py
@@ -741,7 +741,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item,
     outfolder = outfolder+targetCategory+"/"
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = outfolder+"topTopics_"+item+".png"
+    figure_filename = outfolder+"tT_"+item+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
@@ -796,7 +796,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic,
     print("  Creating plot for topic: "+str(topic))
     ## Doing the plotting.
     dataToPlot.plot(kind="bar", legend=None) 
-    plt.title("Top "+targetCategory+" für topic "+str(topic)+" ("+str(firstWords)+")", fontsize=15)
+    plt.title("Top "+targetCategory+" für topic: "+str(firstWords), fontsize=15)
     plt.ylabel("Scores", fontsize=13)
     plt.xlabel(targetCategory, fontsize=13)
     plt.setp(plt.xticks()[1], rotation=90, fontsize = 11)   
@@ -808,7 +808,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic,
     outfolder = outfolder+targetCategory+"/"
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = outfolder+"topItems_"+str(topic)+".png"
+    figure_filename = outfolder+"tI_by-"+targetCategory+"-{:03d}".format(topic)+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
diff --git a/tmw_config.py b/tmw_config.py
index 6594241..92cd94e 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -49,8 +49,8 @@
 ### pretokenize
 ### Perform some preliminary tokenization.
 inpath = wdir + "2_segs/*.txt"
+outfolder = wdir + "3_tokens/"
 substitutionsFile = "./extras/fr_pretokenize_subs.csv"
-outfolder = wdir + "3_segs/"
 #tmw.pretokenize(inpath, substitutionsFile, outfolder)
 
 ### call_treetagger
@@ -174,7 +174,8 @@
 outfolder = wdir+"/8_visuals/topItems/"
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
 numberOfTopics = 250 # must be actual number of topics modeled. 
-targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender"] 
+targetCategories = ["segmentID"] 
+#targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender"] 
 # choose one or several from: author-name, decade, subgenre, gender, idno, title, segmentID
 topItemsShown = 30 
 fontscale = 0.8
@@ -205,7 +206,7 @@
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
-topics = ["56"] # list of one or several topics
+topics = ["115"] # list of one or several topics
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 
@@ -216,9 +217,9 @@
 
 ### 5c show segment
 ## To read a specific segment, better than looking in the folder.
-segmentID = "rf0546§000083"
+segmentID = "rf0166§0118"
 outfolder = wdir+"/9_sel-segs/"
-#tmw.show_segment(wdir,segmentID, outfolder)
+tmw.show_segment(wdir,segmentID, outfolder)
 
 ### 6b - create_topicscores_lineplot
 inpath = wdir + "7_aggregates/*-lp.csv"  # narrow down as needed

From 04b6191971a08048878881c37b481ce74a8da66d Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 11:07:12 +0200
Subject: [PATCH 18/56] make_wordle: reorganized with central function

---
 tmw.py | 95 +++++++++++++++++++++++++++-------------------------------
 1 file changed, 44 insertions(+), 51 deletions(-)

diff --git a/tmw.py b/tmw.py
index 9dc1e89..3de72f1 100644
--- a/tmw.py
+++ b/tmw.py
@@ -384,10 +384,8 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in
     """Function to perform topic modeling with Mallet."""
     print("\nLaunched call_mallet_modeling.")
 
-    ### Getting ready.
     import os
     import subprocess
-    
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
 
@@ -513,7 +511,8 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile,
 def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, 
                         topics_in_texts, number_of_topics):
     """Builds the mastermatrix uniting all information about texts and topic scores."""
-    print("\nLaunched create_mastermatrix. (This could take a while.)")
+    print("\nLaunched create_mastermatrix.")
+    print("(Warning: This is very memory-intensive and may take a while.)")
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
     mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, 
@@ -593,73 +592,67 @@ def save_firstWords(topicWordFile, outfolder, filename):
 import matplotlib.pyplot as plt
 
 
-
 #################################
 # make_wordle_from_mallet       #
 #################################
 
-def make_wordle_from_mallet(word_weights_file,topics,words,outfolder, 
+from wordcloud import WordCloud
+import random
+
+
+def read_mallet_output(word_weights_file):
+    """Reads Mallet output (topics with words and word weights) into dataframe.""" 
+    word_scores = pd.read_table(word_weights_file, header=None, sep="\t")
+    word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False])
+    word_scores_grouped = word_scores.groupby(0)
+    #print(word_scores.head())
+    return word_scores_grouped
+
+def get_wordlewords(words, word_weights_file, topic):
+    """Transform Mallet output for wordle generation."""
+    topic_word_scores = read_mallet_output(word_weights_file).get_group(topic)
+    top_topic_word_scores = topic_word_scores.iloc[0:words]
+    topic_words = top_topic_word_scores.loc[:,1].tolist()
+    word_scores = top_topic_word_scores.loc[:,2].tolist()
+    wordlewords = ""
+    j = 0
+    for word in topic_words:
+        word = word
+        score = word_scores[j]
+        j += 1
+        wordlewords = wordlewords + ((word + " ") * score)
+    return wordlewords
+        
+def get_color_scale(word, font_size, position, orientation, random_state=None):
+    """ Create color scheme for wordle."""
+    #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background.
+    return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background
+
+def make_wordle_from_mallet(word_weights_file, 
+                            topics,words,outfolder, 
                             font_path, dpi):
     """Generate wordles from Mallet output, using the wordcloud module."""
     print("\nLaunched make_wordle_from_mallet.")
-
-    from wordcloud import WordCloud
-    import random
-
-    if not os.path.exists(outfolder):
-        os.makedirs(outfolder)
-    
-    def read_mallet_output(word_weights_file):
-        """Reads Mallet output (topics with words and word weights) into dataframe.""" 
-        word_scores = pd.read_table(word_weights_file, header=None, sep="\t")
-        word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False])
-        word_scores_grouped = word_scores.groupby(0)
-        #print(word_scores.head())
-        return word_scores_grouped
-
-    def get_wordlewords(words,topic):
-        """Transform Mallet output for wordle generation."""
-        topic_word_scores = read_mallet_output(word_weights_file).get_group(topic)
-        top_topic_word_scores = topic_word_scores.iloc[0:words]
-        topic_words = top_topic_word_scores.loc[:,1].tolist()
-        word_scores = top_topic_word_scores.loc[:,2].tolist()
-        wordlewords = ""
-        j = 0
-        for word in topic_words:
-            word = word
-            score = word_scores[j]
-            j += 1
-            wordlewords = wordlewords + ((word + " ") * score)
-        return wordlewords
-        
-    def get_color_scale(word, font_size, position, orientation, random_state=None):
-        """ Create color scheme for wordle."""
-        #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background.
-        return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background
-
-# TODO: pack this into a proper separate function.
-
-    ## Creates the wordle visualisation, using results from the above functions.
     for topic in range(0,topics):
-        ## Defines filename and title for the wordle image.
-        figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png"
-        figure_title = "topic "+ str(topic)        
         ## Gets the text for one topic.
-        text = get_wordlewords(words,topic)
-        #print(text)
-        ## Generates, recolors and saves the wordcloud.
-        #original# wordcloud = WordCloud(background_color="white", margin=5).generate(text)
-        #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf"
+        text = get_wordlewords(words, word_weights_file, topic)
         wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text)
         default_colors = wordcloud.to_array()
+        figure_title = "topic "+ str(topic)        
         plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3))
         plt.imshow(default_colors)
         plt.imshow(wordcloud)
         plt.title(figure_title, fontsize=24)
         plt.axis("off")
+        
+        ## Saving the image file.
+        if not os.path.exists(outfolder):
+            os.makedirs(outfolder)
+        figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png"
         plt.savefig(outfolder + figure_filename, dpi=dpi)
         plt.close()
     print("Done.")
+
     
 def crop_images(inpath, outfolder, left, upper, right, lower):
     """ Function to crop wordle files."""

From 94b9bfdac9f106a2284aa9f59c3259124311689b Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 12:05:33 +0200
Subject: [PATCH 19/56] Move extras (stoplists) into wdir folder

---
 __pycache__/tmw.cpython-34.pyc | Bin 27784 -> 27692 bytes
 tmw_config.py                  |  10 +++++-----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 8167bdc2aec72c92aeb5b9094fd02337acb34840..aa754bd23bfce6f7330967bc0ad99dd9b9cfb517 100644
GIT binary patch
delta 3019
zcma)8YfN0n6+Y+g`@z#-z(80B3|?%5jf1fXU=r*)U~GedV6Ox8vU9<^UUt{B7h}jG
z6yfyoD{4oI6e+6OU#+9KQhZyfRh8C1s;bJ5#!{s=S@n@9O&h6AtJbQbrnS;@W-)et
zRO;QGv){})bKWy|zw$1={th<(HCz&0Zh3F&{%Zgqz$1@~@F0!c_ugop!3_Y>3aQ6n
zsR?3DfP&@-sR+a(Alg9eg&P2|-5{bM+BJTh1;joO`{4#ctQo`s5FMJ;0-_VdL5=r-
zI0T|g<5m!dK|G-`>Ffq^MB}|6HhMrD)e|ys3`DQS?I4~6(Wfz4>IX5P@qQ4;K@4h4
zR-Xd#w8kB<<##8<4uTj0aYC<?2PZ*%LE|nEr$9WTF?ll#VubK4#GU|g8pNn(kZWf^
zoYnXUh;tyuG$t>{L7dn4D2NFVlNyu9#f0@jtQW)-h-dXWd5<s=opxR^vdtbNjA2xz
ztsmgGoqx3c$A#Z@{?=Y;;IACxz-`1osqc3j-a(ZBTPxc{Kt_i8dq*`MQNGTeJuVP#
z5FQX-*hE9KeUSG;(g&LcmCaAq0xI4)HuhL>)Lc%-FIyrKmzHT;b4zB<w&ap&%f$7L
zwSm3Urc5W&i@lM_%ZXfszSk_d8d<WIGII4mB5hmgT;iG)G1DTlWUi{;b>1!c45?wt
z$wP1j(fQ%Qdj?iGw+{c*h5ME4>2Hc+AuyehVknha7I6~R54W11#*p)yV^g?s$si>>
z2csU_vCNR45{|PJKOq`!8bdU44iU;h9w}T%BIFGQ>n_N<AZc)R^8hazP`B=eyc?2~
zA6hkdNw{Ev7)dv50<3$s*&7e|?nn7v&g#04HLm;NdN1TXaAKwl)&r0aXzGQ$pX>zr
zhjjOlSe|kp;9At=gHTmN+pn&|G_6iVeQO<)(oE+TGIA+$#!RIwJCa$pv&*)c>+AG-
zbj<3dzIuO1j!=;D6LqWai^{#TT($I<V_bFjN7^J(3gaGP^TWT}iK(Iq`AWG-qPp2X
z9xw`CvL`Z2>fil?U9z3{MY4}jp={Bz=i-@EM$YBpX3COW0~%CF8Dt|~^~Pe=Mh`)5
zIh}62-heipaYLuypmc<A-3vC&=}SeS9&~ZeY|1)Sgc|cfZwhKha8c?h7hEq&l^>EG
zT?tB<A8g7dw=mRDLTw);wZuk}m;tbZAm|$+af3u~iyvV52NVDHBm|Uv5e890OQ`uY
zDJ}9#AXy5@GHw^zockDTS}EU}VwyrN15pmi3hmBu@<&vVPa*A7CClYQv_^=W-_pK=
z!@#S%V471!GxADQQO&E>_^~VM#{-9Uu6gZDHW7~$g2kLK&`Fd5b$_5yePf_DQgDfE
zV$E|RleRj}Myq6)i*=I;7hK*`=JA?#*+>foS2krAyosgU<;+UKCoQc{m#pBi6E;=y
zT~O~`tl8*hX<yE~W-TNZ<pG*S!}1f>`6DK}$0CeEY@(XUPk7o&q!3u4LmTHqTPPLQ
zg1Ma9>Xuv~OzM`UHK&ag%C=?;iIkN#m#pnRQ|*IItt;GBkCQ@h2XUuSi7j;Dm17f@
z&}RphxGHH@<E#!=8~8QnrKdB9H=WOh@`#R;Klwi5o6hX1A0Sq!n&DwAD08^6s+K%0
zxO0}BEqJfniTIVAtXE$j?!yN4_HYkIlyBq!zOA}P+HlO785uwvQ?H&L@}A`gbj
zpTrN<zR`M|bWV-7q46(6WzSUMl=I@54A!~n*2$H!$F54F76V2dl8=VG;FO%Z!;bu7
zY(9F5Rk0tkp2-N4(@dUWGR$O@$r)9#SnK6N$Og4C-c()mfmu4=5#~hH!Fav;*?0w3
zs&~fQ@Z#<BDeRtNp$kkd5{Z_{amIQ>2cz>$CYVe!*$VDk6DKgL{y5Q%F{fejDXgC0
zaDrq8DAm<ihmD53<jAS_vEdT$<MSO2$}YxQ?@>nQRc87E&ZwVHH{-nXm+3eMK6WwN
z^e|=}t`7WR%*AlIODWv@7k8CwuO5xGE!Kr=Di+&~87Ch*gst;zF2hcQiV<ojbeK9g
z9p+K>>O`&j^QA+mRO54Pm{kpvb<XT_r5Kvw{l8_4vub6gujV$#%cln(#EZ3tyz0C)
zlf?2zgn8Rlbz-i|JIBye`MK_<H+eJPqr91F@2aECVcd;n)CV%>+@1S1dMj8yp>CNC
zm{f0>FXQi(Isa_S|I-rG4UI<|c)vc4e)%Nh2-z#vPoGPju5-jFg!Gk>8#U*xl<t!K
z)YdccJ@~w{8UG0JBbBl)Kk%nu?cGJMLy#*>Xn>@5B*;M~*Hyzp+_OsXpX6+V`sPA!
z4|Agx`lcy(%xpHbD*tB;x|xf0<{0RtbuLyHoB5&7Yix;|xrOxm@ek@ti+k_|=k>*}
z;_L?Twy!r&@p`LdA9K9vc_uFqp$kg4`NJmA^?%q#oEO>28?{~Cwi4Byi5m4<qRRNx
zrS2s5<EHvJad&5gUX3{~$pW8XlwW28ed?FV?-bj(k#*f;U2aDFm!o*w237t_qZs4G
zNhY5^iZVuPbgx?Gd`6~aiY4?>+HPf;<+ZC!J|CW>7X!!}OxPLOu5jlLRh}xPj&4e|
p>}lq|gY+@NzVO~~MYuLx8Lkhvl=;dr><atCWo4f5E;XBK|1YG;kS72D

delta 3189
zcma)8T})f&8Ghem{DV1QLNJ&DsZ&UZ!%u+xkP<>jLJ|n!uec$QQF`!a2Rnx2d^Sx&
z&1jk}Z7x<#{ibQ#%{J|3m7=NL(WG^|SgYk~+C{Fq##*IKm7;Xrx{GQ`G*#N(_cKYz
zv`IVoJiOok^FGgej(zK6eDf|gePNdShwK0J;#;=>K7~z}i{b&gvzOj$n#1*-AX?xX
z0JGab>;e(d@v|UygJ{)p7(^S0b{%gA(E(zQj-Lb338G8KO(42K?A38Ih<zaT>v#u<
z10Z^IOa~nV@w|>(K=gt*MDaSz(y0IfAsvBPI<*hPVLho8L_dflIwl4KAdc#|9mFvZ
z$8}6>20@(A@g5K-L7dVtF&hFgtm7^ar$LP9m{^_xF{<Oed~Q7-yborH@)(G5J+PnI
zOhqP~KN^`Px64G+P?I}9#dn<jEnm6tU8lRX%D|7E6YW1o{7gOU*o!M_eP^u~;W|RL
zO-1+g>@YyMK$L)RQ>TGBO3?#(52U<s-2fC`I>D#j-7|sDE9~56hId65li6_0UP%iz
z(Rrr~4f3dhjdMBeU0)hl<!s+~+lB4wUeDoB%)=ve1?cwC&9-K5p!#u5P_+$IuHOX6
zl|af3DLU4ri4Ex<??w}siISUHk^46h-k^^G3F7Sq;e#6hYZR2g8vqFeQ3}EjHxSl*
zaODByd>{<SQ^E!e&fSAG8ZHB2LaG$<rF7al)QWPbUGsyefSkYBN$oPo69HOElMGlh
zSrQR|JPB0^Vhe~W5L@9oxKcFJWZG-xkS~W6#l(pAqPRl$1VB_no`?iN)X=CwlW}9X
zVx5;hYL!23R6?F;)`AF87wU03v+nVD6^oPi6ZvN#&(0O+FeAWAZ^1PCSg)s(G*gj=
z)qzv-<+!wR@$h9^igY|1PDpzxtp0L%DjdN==`{5oPunYEEQVweUu+dyHsu^U_#$p4
zAtB3cFzRtP28`9cD%>Ah?@QY;E1m7{ka0`QFInkyJU4Hz<T5L{7%kD4pRk1Pt#oG{
zf;Z`-%P<An9@3A*HXub7BIE(SX+Yf?<>-PGS*<_RC0y)$U5W~YR4(D+^{2*5Hjh))
zxM7Wqqusr5rI+m+oa^S=AU_!0Y;zq|1^uJFduSRN&1QL^mb|a7rRA5d!Zh_xMZBvW
zQE4q_6SiCmkLpCjw)%GOJ}gnc>#cYB<!SY|-j_=whWsbxC?BZFzO80aZ^mp|EguRS
ztDUS{?_=FAkY1Z6#oV-(FaG~nv}$Tl9;90e)GrQABC0^&KsQ<EAxgqtlDlb!Y$YfJ
zc*mHXw&i@5tckzM&S@)L#j`_CoxN$M`cZ$KZ`G5tGs#$39q#XXq2MnLCsvkY(nqBs
zm-q-@!d}j0ldJL1LfmxOiA2FAGRal<puHULh*V3H`%5{m7?+j&lG?JAHc%5i!S#2z
zwV$Dp=lL#dRA<4Lv*#@-t*ZrBCY>vIl1te|`*Oi6<C;goolEA@ae0?ld9&6l@kH{1
zY^PQvARq94zh*ei5N0r$a*#n2K#u%4Mfe*UE(B=#ij2=ElIi%ewG=Oy8y&@CcJdZA
zw5CuY;t6Xdy`gJXs~vIagdUmwvm@I(XISbsS_Lk@QAuy)Dy%e2Uoa5#)flxD2i(nQ
z;<H#ze_OHK`Nxr<f!}k=j@d|W>{pNH5r5@89Q-rlZKrVZmxxtL4GrV0`edlFx{jHY
zWaGI^!E+^-j9tvi230ZKhmC4vxCghX>%;B%k$P{q1;?Gc!~KZkDl~H3Gs;<yDQo0J
z<6oIz6%#D+V-1loF+do^R+LlDZ%10NmLL2q+Xt>H3{ei{=tmiKaFjTfsY_$EIPKgV
z`+&Kfo>*u*$-6T(*}yQ&aGK#1!w|y=!x;iLzqjZ+cWL8F<spvm(}8j?N9>F&a^|LX
zFU)=0)UAd3Dyq)mTzn}zzhKGOq9t>6DmvMM=bZdx8at!BWSZd|0ae8W$9j1ON0SUM
zFq~x=RpaMFs&lFfJJrI}ew<Msp086w$L^?W(R0CBmV#eiKvD#;4jT<Q>wFO<duG+0
zv(Mn9`uyxU6l!p~2`y)FI>w^>{#@p{Cwgcpv?x#XcqBz9SHEmzB3cS>QHq1yp{39&
zY!=`rGu?PieKfNjmz{?*UD#^z++~7DSus_erOsDpsq@tvj;iSSLHvRG{`nTPoev4k
zukqqfn8O^ZA4Z#0?OX`oSDkY~%s40KQdqfp22IPT4_@jvU*csiGu#JtbgV%&%$1f2
zD`!RRiL{-&tvco#Q95Vl|A3w<p0=nWRs$y0N$b0K@9ji`y0hR@&559@S!itjf9gkl
zE75rHiB7(vPC831k$&p6PGfXlp-jKQvS+M?cv@FY8$A|x7k1#P^RI>bh<{i2#o{As
zXXCr-lg#oJ2D(AM%FzJBWmFl<^2jUH`0DM%*&ZH>Y$?9p+*T%&zAFD`0=n#ru5$vd
zPZHQL`oAS`m5=30Z&nQnAAYWk3p+foQD>}FUF)lMj$X*%+}EkS@va?<xHTKePqk$J
za+QIXM9lxJ30)3Ps*2T;dw8K%u$!Bg)WBFs?Mzl1^u5-XY{S=8GI^(_g}#fj`sW<d
zOARIc$b!fLHJ<wEx-RzZytYm6$8dn5k71l)is9*JQ_fHyy~N|zf-RTjMV>Ls@VKO<
z&CLwM(|O6)xsQH20n<?5x!8;e_5Q`?`W^HunEwpZYwj|)m?66B&E|m5Jfe*B?tcU2
CNWilI

diff --git a/tmw_config.py b/tmw_config.py
index 92cd94e..9c52055 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -65,8 +65,8 @@
 inpath = wdir + "4_tagged/*.trt"
 outfolder = wdir + "5_lemmata/"
 mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs 
-stoplist_errors = "./extras/fr_stopwords_errors.txt" # in tmw folder
-#tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors)
+stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # in tmw folder
+tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors)
 
 
 
@@ -80,7 +80,7 @@
 infolder = wdir + "5_lemmata/"
 outfolder = wdir + "6_mallet/" 
 outfile = outfolder + "corpus.mallet"
-stoplist_project = "./extras/fr_stopwords_project.txt" # in tmw folder
+stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder
 #tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project)
 
 ### call_mallet_model
@@ -206,8 +206,8 @@
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
-topics = ["115"] # list of one or several topics
-#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+topics = ["25","60"] # list of one or several topics
+tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 
 

From 5a9c8e30a12588be90aa4ae5e7f42e81c63e4250 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 12:06:10 +0200
Subject: [PATCH 20/56] Delete extras in tmw folder (they are project specific)

---
 extras/fr_pretokenize_subs.csv  | 157 -------
 extras/fr_stopwords_errors.txt  | 738 --------------------------------
 extras/fr_stopwords_project.txt |  55 ---
 3 files changed, 950 deletions(-)
 delete mode 100644 extras/fr_pretokenize_subs.csv
 delete mode 100644 extras/fr_stopwords_errors.txt
 delete mode 100644 extras/fr_stopwords_project.txt

diff --git a/extras/fr_pretokenize_subs.csv b/extras/fr_pretokenize_subs.csv
deleted file mode 100644
index 273c5a8..0000000
--- a/extras/fr_pretokenize_subs.csv
+++ /dev/null
@@ -1,157 +0,0 @@
-"string§To§Find","string§To§Replace"
-’,'
-J,"Je "
-qu'elle,que elle
-"’","'"
-"J'","Je "
-"j'","je "
-"S'","Se "
-"s'","se "
-"C'","Ce "
-"c'","ce "
-"N'","Ne "
-"n'","ne "
-"D'","De "
-"d'","de "
-"L'","Le "
-"l'","la "
-"T'","tu "
-"t'","tu "
-"-le"," le"
-"-moi"," moi"
-"m'","me "
-"M'","Me "
-"-je"," je"
-"-il"," il"
-"-on"," on"
-"-lui"," lui"
-"-elle"," elle"
-"-nous"," nous"
-"-vous"," vous"
-"-nous"," nous"
-"-ce"," ce"
-"-tu"," tu"
-"-toi"," toi"
-"jusqu'à'","jusque à"
-"aujourd'hui","aujourdhui"
-"-t",""
-"-y"," y"
-"-en"," en"
-"-ci"," ci"
-"-là"," là"
-"Qu'","Que "
-"qu'","que "
-"-même"," même"
-" Il "," il "
-" Ils "," ils "
-" Elles "," elles "
-" Elle "," elle "
-" Je "," je "
-" Tu "," tu "
-" Toi "," toi "
-" Nous "," nous "
-" Vous "," vous "
-" Mais "," mais "
-" Ne "," ne "
-" Et "," et "
-" Pourquoi "," pourquoi "
-" Alors "," alors "
-" Aussi "," aussi "
-" Car "," car "
-" Au "," au "
-" Ses "," ses "
-" Se "," se "
-" Moi "," moi "
-" Toute "," toute "
-" Tout "," tout "
-" Hier "," hier "
-" Non "," non "
-" Comme "," comme "
-" Dans "," dans "
-" Pour "," pour "
-" Voilà "," voilà "
-" Son "," son "
-" Une "," une "
-" Un "," un "
-" Où "," où "
-" De "," de "
-" Qui "," qui "
-" Depuis "," depuis "
-" Ça "," ça "
-" Sur "," sur "
-" Ensuite "," ensuite "
-" Puis "," puis "
-" On "," on "
-" Si "," si "
-" Même "," même "
-" Toutefois "," toutefois "
-" Ainsi "," ainsi "
-" Aucun "," aucun "
-" Ce "," ce "
-" Ces "," ces "
-" Toutes "," toutes "
-" En "," en "
-" Après "," après "
-" Quel "," quel "
-" Quelle "," quelle "
-" Quand "," quand "
-" Celle "," celle "
-" Puisque "," puisque "
-" Tous "," tous "
-" Dès "," dès "
-" Cet "," cet "
-" Lorsque "," lorsque "
-" Lui "," lui "
-" Sauf "," sauf "
-" Moins "," moins "
-" Encore "," encore "
-" Cependant "," cependant "
-" Comment "," comment "
-" Assez "," assez "
-" Ma "," ma "
-" Quelques "," quelques "
-" Leurs "," leurs "
-" Ceux "," ceux "
-" Par "," par "
-" Devant "," devant "
-" Bien "," bien "
-" Personne "," personne "
-" Près "," près "
-" Avant "," avant "
-" Rien "," rien "
-" Partout "," partout "
-" Pourtant "," pourtant "
-" Déjà "," déjà "
-" Enfin "," enfin "
-" Maintenant "," maintenant "
-" Quoi "," quoi "
-" Eh "," eh "
-" Ah "," ah "
-" Oh "," oh "
-" Jamais "," jamais "
-" Mon "," mon "
-" Cela "," cela "
-" Du "," du "
-" Oui "," oui "
-" Ou "," ou "
-" Sa "," sa "
-" Celui "," celui "
-" Cette "," cette "
-" Des "," des "
-" Naturellement "," naturellement "
-" Sans "," sans "
-" Vos "," vos "
-" Votre "," votre "
-" Notre "," notre "
-" Peut-être "," peut-être "
-" Mes "," mes "
-" Celle "," celle "
-" Tant "," tant "
-" Demain "," demain "
-" Qu "," que "
-" qu "," que "
-" quelqu "," quelque "
-" jusqu "," jusque "
-" Jusqu "," jusque "
-" aujourd hui "," aujourd'hui "
-"  ","  "
diff --git a/extras/fr_stopwords_errors.txt b/extras/fr_stopwords_errors.txt
deleted file mode 100644
index 331dfad..0000000
--- a/extras/fr_stopwords_errors.txt
+++ /dev/null
@@ -1,738 +0,0 @@
-a
-à
-abord
-aboutissant
-achille
-adieu
-afin
-aglaé
-aglante
-ah
-ahi
-ai
-aidant
-aie
-ai-je
-ailler
-ailleurs
-ainsi
-ais
-aise
-aise
-al
-alexandre
-aller
-alors
-angélique
-annibal
-après
-arlequin
-arrivant
-assez
-assurément
-as-tu
-a-t-elle
-a-t-il
-a-t-on
-attends
-atys
-au
-aucun
-aucune
-aucuns
-aujour
-aujourd
-aujourd'hui
-auprès
-aussi
-aussitôt
-autant
-autre
-autrement
-autres
-aux
-avant
-avec
-avecque
-avez-vous
-avoir
-avoir
-baccarat
-bailli
-bajazet
-barbier
-bas
-bazile
-beaucoup
-bégayait
-bel
-ben
-bérénice
-bian
-biau
-bien
-bientôt
-bizarre
-blaise
-bon
-bonne
-bous
-bout
-brousse
-brute
-c
-ça
-çà
-cab
-calo
-canadien
-capucin
-car
-cassandre
-caton
-ce
-cé
-ceci
-cela
-celle
-celles
-celui
-cent
-cent
-cents
-cents
-cependant
-certain
-ces
-ces
-césar
-cesse
-cesse 
-cet
-cette
-ceux
-chacun
-chaque
-chatouilleuse
-che
-chère
-cheux
-chez
-chourineur
-ci
-cinq
-cinquante
-claudine
-clémence
-colette
-colin
-combien
-comme
-comment
-contre
-courant
-crois-moi
-croyez-moi
-croyez-vous
-cru
-crus 
-crût
-cynthia
-d
-da
-d'abord
-d'ailleurs
-damis
-dan
-dans
-daphné
-davantage
-de
-dé
-debout
-début
-dedans
-dehors
-déjà
-demain
-depuis
-dernier
-dernière
-des
-dès
-descendant
-désormais
-dessus
-deux
-devant
-di
-dire
-dis-je
-dis-moi
-dis-tu
-dites-moi
-dites-vous
-dit-il
-dix
-dix-huit
-dix-neuf
-dix-sept
-do 
-dois-je
-dom
-donc
-dont
-dorante
-dos
-douze
-drès
-droite
-du
-dur
-écoutez-moi
-effraya
-effrayait
-effrayant
-effrayé
-effrayée
-effrayer
-effrayés
-effrayons
-eh
-élise
-elle
-elle-même
-elles
-elles-mêmes
-embrun 
-émeri
-en
-encor
-encore
-enfin
-ensemble
-entendant
-entier
-entrait
-entre
-essai
-essaya
-essayai
-essayais 
-essayait
-essayant
-essayé
-essayer
-essayez
-est
-est-ce
-est-elle
-est-il
-es-tu
-et
-étai
-état
-été
-êtes-vous
-être
-êtres
-eu
-eun
-eune
-eus
-eûs
-eut
-eût
-eux
-eux-mêmes
-fa
-fade
-faible
-faire
-faites-vous
-falloir
-fatmé
-faut-il
-fi
-figaro
-fil
-fin
-fis
-fit
-fois
-folles
-force
-fort
-fossinde
-frontin
-fur
-fût
-gauche
-gerfaut
-gille
-gilles
-glisse
-goualeuse
-grave
-gris
-guère
-guise
-ha
-haut
-hé
-hélas
-heureux
-hi
-hier
-hippolyte
-ho
-holà
-homme-là
-hors
-hui
-huit
-hylas
-i
-ici
-ii
-il
-ils
-in
-indifférent 
-indispensable
-insu
-irai
-isabelle
-isolé
-itou
-j
-jamais
-jason
-jaune
-je
-jé
-jean
-jeté
-jj
-joyeux 
-juan
-jugé
-jusq
-jusqu
-jusque
-jusques
-juste
-justement
-l
-la
-là
-là-bas
-là-dedans
-là-dessus
-laisse-moi
-laisser
-laissez-moi
-large
-le
-lé
-léandre
-les
-leur
-leurs
-levant 
-levé
-levé
-li
-lire
-lis
-lisai
-lisaient
-lisais
-lisait
-lisant
-lisette
-lisez
-lisons
-loin
-long
-longtemps
-lors
-lorsq
-lorsque
-lucas
-lucile
-lui
-lui-même
-lut
-ly
-m
-ma
-mac
-magnier
-maintenant
-mais
-mal
-malgré
-manière
-manqué
-margot
-marie
-marié
-marmouset
-marton
-mathurin
-mauvais
-me
-mé
-méchant 
-même
-mêmes
-mêmes
-ménandre
-mes
-mettre
-mi
-mien
-mienne
-miens
-miens
-mieux
-mille
-mille
-mine
-mis
-mise
-moderne
-moi
-moi-même
-moins
-mon
-monsieu
-monsir
-morgué
-mot
-moue
-moujik
-muet
-muet 
-muette
-n
-ne
-né
-nérine
-ni
-no
-noir
-nommés
-non
-nos
-notre
-nôtre
-nous
-noute
-nouveaux
-nu
-nue
-nul
-oh
-on
-ons
-ont
-onze
-oronte
-ose
-ou
-où
-oublie
-oui
-ous
-ouvrait
-ouvrant
-ouvrons
-palmure
-palsangué
-paquier
-par
-parbleu
-parce
-pareil
-parfait
-pargué
-parler
-parmi
-parole
-parsonne
-part
-parton
-partout
-paru
-pas
-paya
-paya
-payai
-payais
-payait
-payât
-payé
-payées
-payer
-payés
-payez
-payons
-peignait
-pendant
-pensé
-per
-perdre
-personne
-personnes
-peu
-peut-être
-peut-il
-peut-on
-pierre
-pis
-pleurer
-plu
-plupart
-plus
-plusieurs
-plut
-plutôt
-point
-pompée
-porter 
-possible
-pou
-pour
-pourquoi
-pourtant
-pourvu
-poussa
-pouvez-vous
-pouvoir
-premier
-première
-prendre
-près
-presque
-priant
-prie
-pris
-prise
-promptement
-puis
-puis-je
-puisq
-puisque
-pût
-pyrrhus
-qu
-quaker
-quakeresse
-quand
-quant
-quarante
-quatorze
-quatre
-quatre-vingt
-quatre-vingt-dix
-que
-qué
-quel
-quelle
-quelles
-quelq
-quelque
-quelque
-quelquefois
-quelques
-quelqu'un
-quels
-queu
-queuque
-qui
-quinze
-quinze
-quoi
-quoiq
-quoique
-raide
-ram
-ramené
-ramener
-ramener 
-reçois
-refait
-rendre
-rendre
-rénine
-rentré
-reste
-rester
-rian
-riche
-rien
-rocambole
-rosine
-rouge
-s
-sa
-sachem
-sais-tu
-sanche
-sans
-saurer
-savez-vous
-savoir
-scipion
-se
-second
-seconde
-secouée
-seize
-selon
-semble
-sembler
-sen
-serra
-ses
-seul
-seule
-seulement
-si
-sien
-sienne
-signifiant
-sis
-sitot
-six
-soi-même
-soixante
-soixante-dix
-sommes
-son
-songez
-sont
-sophie
-sortant
-sortir
-soudain
-sous
-soutenant
-souvent
-ste
-sti
-suis
-suis
-suivre
-sujet
-sur
-sûr
-surtout
-sylla
-sylvanire
-t
-ta
-taisez-vous
-tandis
-tant
-tantôt
-tartarin
-tatigué
-te
-té
-tel
-téléga
-telle
-tellement
-tels
-tenez
-tenir
-tenons
-tente
-terrier
-tes
-thésée
-tien
-tienne
-tiennent
-tiens
-tient
-timar
-tirant
-tirer
-tirinte
-tirsis
-toi
-toi-même
-tom
-tombée
-ton
-tôt
-toujours
-tous
-tout
-toute
-toutefois
-toutes
-travers
-treize
-tremblant
-tremblante
-tremble
-trente
-très
-trois
-trop
-trouvé
-trouver
-tu
-tullie
-turc
-un
-une
-unknown
-<unknown>
-ur
-ursuline
-utile
-v
-vanda
-vela
-velà
-venir
-venu 
-vers
-veux-tu
-vi
-viant
-vingt
-vint
-vis
-vit
-vite
-vivant
-vla
-vlà
-voici
-voilà
-voir
-vois-je
-vois-je
-vois-tu
-voit
-vont
-vos
-votre
-vôtre
-vouloir
-vous
-vous-même
-voute
-voyant
-vraiment
-vue
-waterproof
-y
-zoé
diff --git a/extras/fr_stopwords_project.txt b/extras/fr_stopwords_project.txt
deleted file mode 100644
index 1a3f48d..0000000
--- a/extras/fr_stopwords_project.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-air
-ais
-an
-année
-bras
-brousse
-chose
-chott
-côté
-coup
-doute
-effet
-état
-été
-façon
-fait
-femme
-fois
-fond
-genre
-gens
-heure
-homme
-instant
-jour
-lieu
-main
-mal
-mètre
-milieu
-moment
-monde
-nom
-nu
-nue
-oeil
-œil
-parole
-pas
-peine
-personne
-personnes
-petit
-pied
-place
-sens
-sorte
-suite
-temps
-tête
-tour
-travers
-un
-vieux
-voix

From d2485660f3e176f8c8e7e803a61e64d7b7ea8e52 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 12:17:21 +0200
Subject: [PATCH 21/56] config: Moving and adding comments above function
 calls.

---
 tmw_config.py | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

diff --git a/tmw_config.py b/tmw_config.py
index 9c52055..2802c7c 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -22,7 +22,7 @@
 #print(help(topmod))
 
 ### Set the general working directory.
-wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash.
+wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740d/" # end with slash.
 
 ################################
 ###    PREPROCESSING TEXTS   ###
@@ -36,11 +36,14 @@
 
 ### segmenter
 ### Split entire texts into smaller segments.
+### target: The desired length of each text segment in words. 
+### sizetolerancefactor: 1=exact target; >1 = some tolerance, e.g. 1.1= +/-10%.
+### preserveparagraphs: True|False, whether \n from input are kept in output.
 inpath = wdir + "1_txt/*.txt"
 outfolder = wdir + "2_segs/"
 target = 600
-sizetolerancefactor = 1.1 # 1 = exact target; >1 = with some tolerance (1.1 = +/- 10%).
-preserveparagraphs = True # True|False
+sizetolerancefactor = 1.1
+preserveparagraphs = True
 #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
 
 ### segments_to_bins: inpath, outfile
@@ -66,7 +69,7 @@
 outfolder = wdir + "5_lemmata/"
 mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs 
 stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # in tmw folder
-tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors)
+#tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors)
 
 
 
@@ -81,20 +84,25 @@
 outfolder = wdir + "6_mallet/" 
 outfile = outfolder + "corpus.mallet"
 stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder
-#tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project)
+tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project)
 
 ### call_mallet_model
 ### Performs the actual topic modeling. 
+### num_topics: Number of different topics the model should find.
+### optimize_interval: interval between hypermarameter optimization.
+### num_iterations: How many times the model is improved. 
+### num_top_words: Number of words to save and display for each topic.
+### num_threads: Number of parallel processing threads to use. 
 mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
 num_topics = "250"
 optimize_interval = "100"
-num_iterations = "5000"
+num_iterations = "1000"
 num_top_words = "200"
 doc_topics_max = num_topics
 num_threads = "4"
-#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
+tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
 
 
 
@@ -110,23 +118,23 @@
 metadatafile = wdir+"/metadata.csv"
 topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
 number_of_topics = 250
-#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics)
+tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics)
 
 ### calculate_averageTopicScores
 ### Based on the mastermatrix, calculates various average topic score datasets.
+### targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
-# targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration
 targets = ["author-name", "author-gender", "title", "decade", "subgenre", 
            "idno", "segmentID", "narration", "protagonist-policier"] 
-#tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
+tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
 
 ### save_firstWords
 ### Saves the first words of each topic to a separate file.
 topicWordFile = wdir+"6_mallet/topics-with-words.csv"
 outfolder = wdir+"7_aggregates/"
 filename = "firstWords.csv"
-#tmw.save_firstWords(topicWordFile, outfolder, filename)
+tmw.save_firstWords(topicWordFile, outfolder, filename)
 
 
 
@@ -156,27 +164,28 @@
 
 ### plot_topTopics
 ### For each item from a category, creates a barchart of the top topics.
+### targetCategories: one or several: "author-name", "author-gender", "decade", "subgenre", "title"
+### numberOfTopics: Must be the actual number of topics modeled before.
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-numberOfTopics = 250 # must be actual number of topics modeled.
 targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] 
-# one or several: "author-name", "author-gender", "decade", "subgenre", "title"
 topTopicsShown = 30 
+numberOfTopics = 250 
 fontscale = 1.0
 height = 0 # 0=automatic and variable
 dpi = 300
 outfolder = wdir+"/8_visuals/topTopics/"
 #tmw.plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder)
 
-### plot_topItems
+### plot_topItems ###
 ### For each topic, creates a barchart with top items from a category. 
+### targetCategories: one or several from the following list:
+### "author-name", "decade", "subgenre", "gender", "idno", "title", "segmentID"
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 outfolder = wdir+"/8_visuals/topItems/"
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
 numberOfTopics = 250 # must be actual number of topics modeled. 
-targetCategories = ["segmentID"] 
-#targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender"] 
-# choose one or several from: author-name, decade, subgenre, gender, idno, title, segmentID
+targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender", "segmentID"] 
 topItemsShown = 30 
 fontscale = 0.8
 height = 0 # 0=automatic and flexible
@@ -196,7 +205,7 @@
 dpi = 300
 #tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
 
-### plot_topicsOverTime
+### plot_topicsOverTime ###
 ### Creates lineplots or areaplots for topic development over time.
 averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"

From 6901aa0316e66a44415b2d63c269092b971efa87 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 16:30:26 +0200
Subject: [PATCH 22/56] Deactivate all

---
 tmw.py        |  3 +--
 tmw_config.py | 12 ++++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/tmw.py b/tmw.py
index 3de72f1..e3253a8 100644
--- a/tmw.py
+++ b/tmw.py
@@ -728,8 +728,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item,
     plt.xlabel("Topics", fontsize=13)
     if height != 0:
         plt.ylim((0.000,height))
-    plt.tight_layout() 
-
+<   
     ## Saving the plot to disk.
     outfolder = outfolder+targetCategory+"/"
     if not os.path.exists(outfolder):
diff --git a/tmw_config.py b/tmw_config.py
index 2802c7c..425b2d2 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -84,7 +84,7 @@
 outfolder = wdir + "6_mallet/" 
 outfile = outfolder + "corpus.mallet"
 stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder
-tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project)
+#tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project)
 
 ### call_mallet_model
 ### Performs the actual topic modeling. 
@@ -102,7 +102,7 @@
 num_top_words = "200"
 doc_topics_max = num_topics
 num_threads = "4"
-tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
+#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
 
 
 
@@ -127,14 +127,14 @@
 outfolder = wdir+"7_aggregates/"
 targets = ["author-name", "author-gender", "title", "decade", "subgenre", 
            "idno", "segmentID", "narration", "protagonist-policier"] 
-tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
+#tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
 
 ### save_firstWords
 ### Saves the first words of each topic to a separate file.
 topicWordFile = wdir+"6_mallet/topics-with-words.csv"
 outfolder = wdir+"7_aggregates/"
 filename = "firstWords.csv"
-tmw.save_firstWords(topicWordFile, outfolder, filename)
+#tmw.save_firstWords(topicWordFile, outfolder, filename)
 
 
 
@@ -216,7 +216,7 @@
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
 topics = ["25","60"] # list of one or several topics
-tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 
 
@@ -228,7 +228,7 @@
 ## To read a specific segment, better than looking in the folder.
 segmentID = "rf0166§0118"
 outfolder = wdir+"/9_sel-segs/"
-tmw.show_segment(wdir,segmentID, outfolder)
+#tmw.show_segment(wdir,segmentID, outfolder)
 
 ### 6b - create_topicscores_lineplot
 inpath = wdir + "7_aggregates/*-lp.csv"  # narrow down as needed

From 962448b3f0352ece730b385c03e0c08c41d282a3 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 28 Aug 2015 16:58:56 +0200
Subject: [PATCH 23/56] NEW FEATURE: Added topicClustering

---
 __pycache__/tmw.cpython-34.pyc | Bin 27692 -> 29733 bytes
 tmw.py                         |  65 ++++++++++++++++++++++++++++++++-
 tmw_config.py                  |  22 ++++++++++-
 3 files changed, 84 insertions(+), 3 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index aa754bd23bfce6f7330967bc0ad99dd9b9cfb517..94aed7693b6192515159a5b14338f75dd9772c3e 100644
GIT binary patch
delta 2500
zcmZWreQZ-z6hG&+>(;Jaw_(>IAO+;3D9zLu=NAq)fdSiq7RLB+w!N?2>$-N__a<)X
z(ik*oOhmax11A2Ze-MpIYD|nKe#O5`h<_L(Ph(6Z@e}{?BT8cQoZGRYzQ_A<dha>+
z-1|EpcdvYoFMWq=eh!8L-^;J}U3>%J8@TH6klaU~{mZ#EFB+-BopA6j5c@#vherSo
z-VI_D!~rdD1~CTWpqB3eaR|g=E%$){5D3LBaBvHVBOvbA<M;AlGCp4W(zta1{i<`@
z*|ilPR1l(~6Qc7zaQsjRK%o_A_&9?7nH~wMAGWQ!CIppc5D};}(G;R%K%q^o+<qRz
z>gVmZtKGLn)Q-W))yw%To;ITk*I*Qv&>O@qPaAe(JAF$~j?^v;ZblqY-8=hzLkJ*;
zReI-rD=r#z=uWow2Cxf5WEUok;0-8~wGVf$$LJrd64VYlu^WR%2m>fnH5@7<deqV3
zD2~=1AKqc8^@D5FrQK_BNSz#8uinkHbUNkfLC2i3$1`b}ok&Zkqn6t94ECgW2cryQ
z1o5!k&2o&K$OKD!81^#kXBbi=hq_fy@+RC+%OsB>?yb40b_4gQANL=?!?l6YF2j3(
zG&LdiSd6yNy=S$0b8IEf);=2>$CcWFUR!(7YilpPbqA?K19+--`p~n8=W0Di{z81G
zmKcB4z^_i_ma9)PE$aGQM0I7HcFo?~ywwPs4y!Yn&3L|cA=75yx3%v?6!F(nx%X%b
z@8|mPo0^fo@IUw3vtDRnjmr~#_@}zlx;aX0>&HvjLSH|9_9UnW)~^~qj>fzJ6-ojR
z2p<SajTXooApBYmfM^8~<dNgxS<v+I6s1+6iF1yGc@I=Pl#wCzV(*S+bJuQ)<xR)2
zO8FQ?YqTsyGFz6Wok*xly=$xTSMs0Q8uv*~0ePC?H-_ICz94YD6J}b(y>9pz8IPJ)
ze!{W0T1C?h@oM&NQ>Mz(R@M#he0^~@^e<m>BQK`m241m)b~LUvs#h$!i4QwKlVAjJ
z8E(XGLq1OYXRq+(rzszOny<?wIJz9^z*qq&AP8cGP7ja=AZ{M|7-)Rp$cJ#)U{2<}
zObZp5L);R{#1$D-5hi^gKZ1%E3PdH%dEilE+zUAamJ+GQX*HtuKhsTDPg9B0zh*9C
zF7f(4n!#y-f?wA+wF3{=_dMxPa05KZ8!thx)}q!xE-Bhxt<?e*G7WO@`9v*nbRCTH
zP>6q!kc|6P<&K`}+=@-H%#2kO9NnBWkItB->}<?QXNqPbk&wh649gxuSz`E>;R3<;
z%yhi19&-+|?BQHL%hFE_KQO$^@Ct)Yi%Trs$M8DCIT#^)jl{TLK1SwlD}~g`+NttB
z+jQHDmhFt>8VM)gB~{le(oS0L;04}cIH3(~PRSv)YE$I)4t=2U`bgT5)<df}HFO80
z)Ci-G`rR^INpFPScKVhWev~KG-<hTOsj9A9UZvnt7>QFZs3vfzI*4{k0PeDYP7@R;
zV1yI~$|5f;6-|wTBO|Rs>;ow;c5e<2*eB}Un=_&wLat8T?E?@cmr)U@Le!zg&LO9t
zt@H)!7Xf)}V?)Z4s0e{*BMoZ$G#u8x=!Qwg!^2#&)27Uo<rD`tTb!{SQ<6wAb$wH&
zGf@`$e`#gw(e2~tZDHsjnAFjBTN{SslFLTZKIcu?59W1mvu-5ic)QL5*RZp$f5I{)
zck)@cxoDLp(|OYk2(u()IWN;wvYS0@XBcF-sv~#Y{3MS&#qczN8{%fL*c)`~ciRM|
zM7oqUhjd)k!-J7(=fXk98=Jj_<9!Y77w96o&S<AQFW}{yZUfzY@-(fU<wn2{H_*kV
zu#0{)ngRu#>c3v*d<j#@QyEeMP&c63r1YR-^i*{?NR@J>iGoeppvxC2f0A+UT<?Ho
zPZ!g(u?7pVa*jKxm5tfflvPYi%bAVM*#C*t+|pg?nNoJb6m{kPMiA$k2FfKfF&y{G
zV3qc)`<y(<)bZisE%H&8pJRye{35IJ2_EBGlbY_0JgY&s)v(@H!_&ZGPxS<hFuh$U
z&yaD}Lw5XlKpnF_=(?ZvG^P3v3i$%*#QW7q;bQv;`${s@uXNQvxxLp7*;#9PHn9*g
ww|PO3iWRwQH%_A+%IA6P1$An29nPt@Cih}Q1&b>}E5b(5AM6Zv1iORAKRRweiU0rr

delta 705
zcmZXRUr3Wt7{=e{+njE0b8c;GSWTH|BhV<MKb9FSat-I^T=P$6tYvInIKR&bVqK(?
zbXD+1mt7T9giy!32)e5asoT0rXu)o(tDrw9o%1OXbUx0*%X`jwe$P4I!y2BxN93#1
z<+#$mmR)~B^ojl%7FqYpX{<bpJkb2(hpCXI!Wmj58t<h7C=<<z7E4bIK2D!{+&!mJ
z$xaJIsMx3i3KqN+yl$g82bCJB<RsZ%02ym$$Hn1+wry^z)KkGvl{&cuD2pZ@8F&j1
zN8*S0M=HXz@yqSIR4j{I3m}3ZeDZT50Efj5FFbPkpr^z}ybnn6(ZmT`5=8nC-%Omu
zXI@RTU_{gsoxo4_4Em839fLCnS>*){x9RNFw!^N4l$)lHisj^Ktxg}5CVcyVpUcOr
z98R~{H(gctemu-y@(o;0H^VFD(|6I6Rmf2#V^X3`dRo=3(q8XU)v%I`lAMyQgo#WH
zQE?}8708H&tVhEzhjOEs6nZY8S^q{jKhxaUtIi!%vY(HqJGnX*#x-FbpF-GVnHUpj
zVoad9J<0PEF|c?&aSwPd&P}cZFU8rZWep#>YkCK7$g6bcZ+Xm6FH~f|!VK}7d>`J4
z`U`FiH4(TN1is4yuaM+#)BUK5y_Y`x2RFH<)0WBZVn2RyvAB=-%&rDnt@71vl0oLb
t)0ZSEdW2ni)ss@t5ha_C$uI`$^+nZK;u}T>%KXR}Mu2}CJ1y-_?GK9zrIP>v

diff --git a/tmw.py b/tmw.py
index e3253a8..a764673 100644
--- a/tmw.py
+++ b/tmw.py
@@ -728,7 +728,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item,
     plt.xlabel("Topics", fontsize=13)
     if height != 0:
         plt.ylim((0.000,height))
-<   
+   
     ## Saving the plot to disk.
     outfolder = outfolder+targetCategory+"/"
     if not os.path.exists(outfolder):
@@ -1039,6 +1039,69 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder,
 
 
 
+###########################
+## topic_clustering     ###
+###########################
+
+import scipy.cluster as sc
+
+def get_topWordScores(wordWeightsFile, WordsPerTopic):
+    """Reads Mallet output (topics with words and word weights) into dataframe.""" 
+    print("- getting topWordScores...")
+    wordScores = pd.read_table(wordWeightsFile, header=None, sep="\t")
+    wordScores = wordScores.sort(columns=[0,2], axis=0, ascending=[True, False])
+    topWordScores = wordScores.groupby(0).head(WordsPerTopic)
+    #print(topWordScores)
+    return topWordScores
+
+def build_scoreMatrix(topWordScores, topicsToUse):
+    """Transform Mallet output for wordle generation."""
+    print("- building frequency table...")
+    topWordScores = topWordScores.groupby(0)
+    listOfWordScores = []
+    for topic,data in topWordScores:
+        if topic in list(range(0,topicsToUse)):
+            words = data.loc[:,1].tolist()
+            scores = data.loc[:,2].tolist()
+            wordScores = dict(zip(words, scores))
+            wordScores = pd.Series(wordScores, name=topic)
+            listOfWordScores.append(wordScores)
+        scoreMatrix = pd.concat(listOfWordScores, axis=1)
+        scoreMatrix = scoreMatrix.fillna(10)
+    #print(scoreMatrix.head)
+    scoreMatrix = scoreMatrix.T
+    return scoreMatrix
+
+def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): 
+    print("- performing clustering...")
+    distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric)
+    #print(distanceMatrix)
+    sc.hierarchy.dendrogram(distanceMatrix)
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 2)   
+    plt.tight_layout() 
+    #plt.show()
+
+    ## Saving the image file.
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    figure_filename = "clustering_"+method+"-"+metric+"-"+str(wordsPerTopic)+"words"+".svg"
+    plt.savefig(outfolder + figure_filename, dpi=300)
+    plt.close()
+    
+
+def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, 
+                    method, metric, topicsToUse):
+    """Display dendrogram of topic similarity using clustering."""
+    print("Launched topicClustering.")
+    ## Gets the necessary data: the word scores for each topic
+    topWordScores = get_topWordScores(wordWeightsFile, wordsPerTopic)
+    ## Turn the data into a dataframe for further processing
+    scoreMatrix = build_scoreMatrix(topWordScores, topicsToUse)
+    ## Do clustering on the dataframe
+    perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder)
+    print("Done.")
+
+
 
 ##################################################################
 ###    OTHER / OBSOLETE                                        ###
diff --git a/tmw_config.py b/tmw_config.py
index 425b2d2..f406646 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -22,7 +22,7 @@
 #print(help(topmod))
 
 ### Set the general working directory.
-wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740d/" # end with slash.
+wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash.
 
 ################################
 ###    PREPROCESSING TEXTS   ###
@@ -118,7 +118,7 @@
 metadatafile = wdir+"/metadata.csv"
 topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
 number_of_topics = 250
-tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics)
+#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics)
 
 ### calculate_averageTopicScores
 ### Based on the mastermatrix, calculates various average topic score datasets.
@@ -218,6 +218,24 @@
 topics = ["25","60"] # list of one or several topics
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
+### topic_clustering ###
+### This function will create a dendrogram grouping topics based on their word weight similarity.
+### wordsPerTopic: Number of top words for each topic to take into account for similarity measure.
+### method: The clustering method used to build the dendrogram. 
+###   Options: ward|single|complete|average|weighted|centroid|median
+###   See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html 
+### metric: The distance measure used to build the distance matrix.
+###   Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
+###   See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt"
+outfolder = wdir + "8_visuals/clustering/"
+topicsToUse = 250 # should be identical to all topics modeled.
+wordsPerTopic = 10
+method="complete" 
+metric="cosine"
+tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, method, metric, topicsToUse)
+
+
 
 
 ################################

From 9ce79ae1167875d3e27dca450f64b839b6490203 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sun, 30 Aug 2015 11:03:59 +0200
Subject: [PATCH 24/56] More comments

---
 __pycache__/tmw.cpython-34.pyc | Bin 29733 -> 30060 bytes
 tmw.py                         |  21 +++++++++++++--------
 tmw_config.py                  |  11 ++++++-----
 3 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 94aed7693b6192515159a5b14338f75dd9772c3e..98f88eaf98a98ecdc9a250f1165a94e428d2809d 100644
GIT binary patch
delta 947
zcmYjPOK1~O6uo!y?c~=qsgtB12b5}%_(4R`3Q8kt3nHkr3Ps|)F-}Zp(l=8qVM382
z{=ixWL=b6(x^p81x9(i)HY=AZ(xq$f4bnKnJM-SX=bm@o-1q(m{{DsPPBNwbdHwxH
z;nPR;akEgW=y#T^rm*a~?i!9X^@W-zt>x_x%_1siF!($%vf?9%5cmqLqSA?g7luXz
zL>NQ~RsqpNXeiK#f*&CUAm?K}R-E!6Vo(%u5Cb4o5D6CPQD~j)g;EqE6{@0qx);jq
zh7urF27w%CA`2RM(8LgEUK0EeZ>EVH4XW!`kQ|VSG&C~Mpwck&+#TRZ_TLh1@|EW!
zX=0dUS%|*>$?l<u23jWy-UK-t;zB=4!2*cluDe`sF9C3^JQ8f!m6dMnu2pxfw)wLN
z0P-0U9Oc<Lo}FjNGJK&}d-S@~vW&`9bIJ8g$t|dwwvbL;))p3|MZ9jTYPnv`Zp#L(
z+KH<*N|{7Ht+&&<(J!p%?Yu6mD@-=1iDu1RqGnP!sy}d~aJz{<`?j27OVjKje7I%T
z<uIvRa|%FNkpcT-<K=vicE%=Uf#%&Por@H^s@SzCFER5n!wADP!wkb1%g@m)SUWi!
zn9lvlQHlVnIIaxhAnwB)y~j`u4bkc#YAD|l+lVrY9+gI%2HSM)Aly21YWW0rygFZj
zP8^o^b33Z@qtF=ukFI76{5bdnd{6kAMgSKlVRrEd%mx=)GfGJf)?3HV@*ZVXK4Ob+
z7<AI_#^{IMbLbE8L|TR`pL6aB!wZI&4C}$So4Inv<Bs(kb}>u_)1(3+Rmsqs!W539
v+zbvZe$5)3?VSnvoN{YBE^p!9SFHDxVU2-Z$xViL3|qlNcO#NdmXgX}gQ(Hf

delta 579
zcmXYs&uSA<6voe)%p~c}<X>Yu4N?i{!dP2`y6Cc2-KgL~T-1f7%^gFA>BO9wv{<IN
z2$G6mTkb-v(47mHr9OdApdc=~>cWMJ_y&UKq|PwEd*+_+yWc&(f8qNdsQ$P`^^;B?
ze*bu+hT-|?*~VJG6Y5UXZag8s*Y~5=vqrMI7W&5@!aCwtbJVGsZ|n8v2!REG4XFbI
z1WX(VvLMXC06^PN*AD0`=p5)gORSNED`4>$1PX!zq_&vjA;^yV{}&_nk3ovyX$4+t
zovld`(u~MGT>x)RyWnk~C6>gPX1P%WU1AB2vdW_@4e+2@Hkt115?uoTT13VTK~=zE
za8hW6{W$T7=U$<kqPoM_vghKBHchasbCuDon7`fW(uvZ3q^aAc)peROTgwY(^Xba<
zbL;?Ac)_aRMO5}UR`4=<NN?HN%1W1?JbzM4NPN$X_hu%O45T@kO{PHLc?S-l>G$qp
z-F)iJ&&UDIJiXwB-Uu3Orah_l1+N4<g4gEDa?RYP%ET?3&sVI2jIzr7P9hzch(1>z
sOJ{T)lq=HV@%{J_UNYi;DcBWACLIXg2@VBU3@+5nX)<)G?v!i&1DpkV_y7O^

diff --git a/tmw.py b/tmw.py
index a764673..7748f66 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1056,7 +1056,7 @@ def get_topWordScores(wordWeightsFile, WordsPerTopic):
 
 def build_scoreMatrix(topWordScores, topicsToUse):
     """Transform Mallet output for wordle generation."""
-    print("- building frequency table...")
+    print("- building score matrix...")
     topWordScores = topWordScores.groupby(0)
     listOfWordScores = []
     for topic,data in topWordScores:
@@ -1076,29 +1076,34 @@ def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder):
     print("- performing clustering...")
     distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric)
     #print(distanceMatrix)
+    plt.figure(figsize=(25,10))
     sc.hierarchy.dendrogram(distanceMatrix)
-    plt.setp(plt.xticks()[1], rotation=90, fontsize = 2)   
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 6)   
+    plt.title("Topic-Clustering Dendrogramm", fontsize=20)
+    plt.ylabel("Distanz", fontsize=16)
+    plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance measure - "+str(wordsPerTopic)+" words", fontsize=16)
     plt.tight_layout() 
-    #plt.show()
 
     ## Saving the image file.
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = "clustering_"+method+"-"+metric+"-"+str(wordsPerTopic)+"words"+".svg"
-    plt.savefig(outfolder + figure_filename, dpi=300)
+    figure_filename = "clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png"
+    plt.savefig(outfolder + figure_filename, dpi=600)
     plt.close()
     
 
 def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, 
-                    method, metric, topicsToUse):
+                    methods, metrics, topicsToUse):
     """Display dendrogram of topic similarity using clustering."""
-    print("Launched topicClustering.")
+    print("\nLaunched topicClustering.")
     ## Gets the necessary data: the word scores for each topic
     topWordScores = get_topWordScores(wordWeightsFile, wordsPerTopic)
     ## Turn the data into a dataframe for further processing
     scoreMatrix = build_scoreMatrix(topWordScores, topicsToUse)
     ## Do clustering on the dataframe
-    perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder)
+    for method in methods: 
+        for metric in metrics: 
+            perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder)
     print("Done.")
 
 
diff --git a/tmw_config.py b/tmw_config.py
index f406646..26847d8 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -218,7 +218,7 @@
 topics = ["25","60"] # list of one or several topics
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
-### topic_clustering ###
+### topicClustering ###
 ### This function will create a dendrogram grouping topics based on their word weight similarity.
 ### wordsPerTopic: Number of top words for each topic to take into account for similarity measure.
 ### method: The clustering method used to build the dendrogram. 
@@ -227,13 +227,14 @@
 ### metric: The distance measure used to build the distance matrix.
 ###   Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
 ###   See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+###   Interesting combination: *weighted+cosine  
 wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt"
 outfolder = wdir + "8_visuals/clustering/"
 topicsToUse = 250 # should be identical to all topics modeled.
-wordsPerTopic = 10
-method="complete" 
-metric="cosine"
-tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, method, metric, topicsToUse)
+wordsPerTopic = 50
+methods=["weighted"] # list
+metrics=["cosine"] # list
+tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse)
 
 
 

From c6f2e21b9243ac2583204d5c73a128c380b5412d Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sun, 30 Aug 2015 14:56:08 +0200
Subject: [PATCH 25/56] Added topic-based clustering / dendrogram for items

---
 __pycache__/tmw.cpython-34.pyc | Bin 30060 -> 32130 bytes
 tmw.py                         |  80 ++++++++++++++++++++++++++++++---
 tmw_config.py                  |  65 ++++++++++++++++++++-------
 3 files changed, 124 insertions(+), 21 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 98f88eaf98a98ecdc9a250f1165a94e428d2809d..50b060cd8883e588396d79447445916300105ef3 100644
GIT binary patch
delta 2072
zcmZWqU2IfE6h3oz+5c^~+ikboQoP7dSd*<njS?gV=t{%@5-VZ|rEGU@%Wn6tb8oS%
zyV-<FqVd6>8w3n08ZprbhV;<~6CZsdnwYo`hNw@*=nEPXV&ZpZ`vZD6^WB+q&&)Y<
z&UeoK_NzE|S8V=05)GZd`_s|yzb5*b?s+_z@4>LQdTwsMZ9KD!iciq--6SVSo}^cZ
zj_)NoN%EAQ?<1KbIi=@gB#ES;QkITqN#;pPEq{uI<7sPY$df!RQcI(=&l=*vrN#0#
zV>n{Ug6-rgdCSb0b)WfCRY|+v<5X*fe8&BzdMA~vS!%kf%9C0)R+_gROBE_JkGkL1
zwume4-?e|DR|HWyq|W1|-gON%-g?@L8+7rtJRzeSzo>|c4MKeYl{~)b#}5o*jA7V+
z0vzrCq9C#G3ps*{f*M3vXD7a+rk5I6dui53YLFa*iXL7{gQSNfCct$heb~_QG}lpM
z9TmOA6MXQ~tU>B!Ere2lWRMm~Ku3mXcV&xK3TvgSTFLi+CDI@KBqO|OMraM39VOX?
z!&YqK?1$l*LFYT3#dtDpP^#&phEMyXC*yO2bWC<5H9hXe;%rMHHl(gHTx7V!aDrp{
z1J51-wEcEfIf%ackwrHc?f~{5Ry*)Y2UUbcafZtb998uNgFZ;_`iqyJ0n&bTgpVHu
zw4+&_VEc%r4mei19b2|ErV5<!`Eom)J7cNbjJ3Z|vf3d?%j5Z~vg~&E>im#<D%1Bu
z-_(4eBq!ITIY$*1PBJA1VHT7S{URbZiXbqh$42ph2pZ}Wc#;pW4xTWE4@1*qPPkuf
z9cm$}NHY?;D2!%+94{4-K`;YZCn3O=A|mXkW>E7790^bZ2>}U$Obze=I|c891~7zu
zoW~USfTYSWXLm&fizt&W$zr5fw1nP@{C8{B7-St0IMBm6Ps%tY;a^hrQqt*Sf`Y$E
zndls{wxOF13ub&DYX~Ps#U3igslhIiozRlr<)x@Vv7cJnndGvC0le8F-uGBabtGVR
zQVvqG8PfMm@Ij3bWJ4s^ueuG}@B)2fJUx84KA16=s~r?G^Q*N!GBTpxWTp(f#rQV!
z#aGHrStVNqShCICQythbi)nrM0El6(^$9a;RivuUs9d?c%Txv48^|s<9rxAhrq=yE
z$ZvLdSWje3J9ma1DU`uFa~iZ8smGXAnKjoY+hS5NL)4dmx_6|8;^>2s!ixqN-rp&R
z3+&`O3>O&&7%nk<%rsNmd3qV(j@I@a2&-$n=Bnu2T3`A@UVg;D*H<R2wd3r}%3Tg=
zmUjx{ppK0zX8KzFEB%vo*TPccYT!f22zfSmVyJ@ko=qYtdc}4TLWRUdOzgy3eGYT0
zHNoZ0gKuoN0Qa;^$z;n;oFFa?-8sSh7P!caLL&^)!js&=vJ;OHSh0}ds>x1lp=Ox7
zDGCJophzRk1;%v2AqFi5NX+e_V{@YzJ#>sMLJG=C4`}19k0AoJQtqrNS2Jx^r#orP
zJU*4PEot)QO}kJolyc}NXU%#0KMbp<LI-p6mHg>tW%t%xVy~x@J?;uR4CzRxVXbCB
z>8w1*-``_+pFt-{XYZQ3O%0E9-5>eaIr0gFj{8v5Ko8P?PC;~|lu*~*in`VRDeG|W
zA)N<x6X&M)FFkJ`6{DI<ud$&w7_Knz-<ob-+D&zxmp2%Aqk6~{JA2*RPM0|6{^8X6
WHgp@2U?d(%L=u?ABSVq(5#wK3m&w=w

delta 298
zcmZqr&G_aOqsluTUaoI59)wD4V_<m9fCQL<YzH7N{>-#d<pGnpBSVxULrN?|N*qHA
z14EP(Lkc59ux9*bKW6a^Mv2X7+5Sw7ikr9PYB0%*D5PW-mn7yTrz+&8CKi_#r7Gwu
zRIw?P=NF|E-{PM9JO7J_P(f-@T7FS(d~!}{aY<@XW?s7SWR5~r#<`RA3jgxXWn^H`
z<SJ?fQbn^TTNDLLy0ZY?!Uu97GZzy-5b`qeGRiR)Ets5JbwMl{B+CYpWfWo*U@Te!
z((k-ExH^<kWg5tkP7u)#B0xqLEdUV<K?F!;k=EouCE>~MYWNtdCjYG|;FaZP;^E{G
L;t}DI;$Z>+mC#I*

diff --git a/tmw.py b/tmw.py
index 7748f66..c04190d 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1039,8 +1039,9 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder,
 
 
 
+
 ###########################
-## topic_clustering     ###
+## topicClustering     ###
 ###########################
 
 import scipy.cluster as sc
@@ -1072,7 +1073,7 @@ def build_scoreMatrix(topWordScores, topicsToUse):
     scoreMatrix = scoreMatrix.T
     return scoreMatrix
 
-def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): 
+def perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): 
     print("- performing clustering...")
     distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric)
     #print(distanceMatrix)
@@ -1081,13 +1082,13 @@ def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder):
     plt.setp(plt.xticks()[1], rotation=90, fontsize = 6)   
     plt.title("Topic-Clustering Dendrogramm", fontsize=20)
     plt.ylabel("Distanz", fontsize=16)
-    plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance measure - "+str(wordsPerTopic)+" words", fontsize=16)
+    plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(wordsPerTopic)+" words", fontsize=16)
     plt.tight_layout() 
 
     ## Saving the image file.
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = "clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png"
+    figure_filename = "topic-clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png"
     plt.savefig(outfolder + figure_filename, dpi=600)
     plt.close()
     
@@ -1103,7 +1104,76 @@ def topicClustering(wordWeightsFile, wordsPerTopic, outfolder,
     ## Do clustering on the dataframe
     for method in methods: 
         for metric in metrics: 
-            perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder)
+            perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder)
+    print("Done.")
+
+
+
+###########################
+## itemClustering     ###
+###########################
+
+import scipy.cluster as sc
+
+def build_itemScoreMatrix(averageDatasets, targetCategory, 
+                          topicsPerItem, sortingCriterium):
+    """Reads the average topic scores for the target category and returns a score matrix of the top topics per item."""
+    print("- getting topWordScores...")
+    for averageFile in glob.glob(averageDatasets): 
+        if targetCategory in averageFile:
+            itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",")
+            itemScores = itemScores.T 
+            if sortingCriterium == "std": 
+                itemScores["sorting"] = itemScores.std(axis=1)
+            elif sortingCriterium == "mean": 
+                itemScores["sorting"] = itemScores.mean(axis=1)
+            itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False)
+            itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1]
+            itemScoreMatrix = itemScoreMatrix.T
+            #print(itemScoreMatrix)
+            return itemScoreMatrix
+
+def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, 
+                           topicsPerItem, sortingCriterium, figsize, outfolder): 
+    print("- performing clustering...")
+
+    ## Perform the actual clustering
+    itemDistanceMatrix = sc.hierarchy.linkage(itemScoreMatrix, method=method, metric=metric)
+        
+    ## Plot the distance matrix as a dendrogram
+    plt.figure(figsize=figsize) # figsize (width, height in inches) is passed in by the caller.
+    itemLabels = itemScoreMatrix.index.values
+    sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="right")
+
+    ## Format items labels to x-axis tick labels
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 12)
+    plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20)
+    plt.ylabel("Distance", fontsize=16)
+    plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16)
+    plt.tight_layout() 
+
+    ## Save the image file.
+    print("- saving image file.")
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".png"
+    plt.savefig(outfolder + figure_filename, dpi=600)
+    plt.close()
+    
+def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, 
+                   targetCategories, methods, metrics, sortingCriterium):
+    """Display dendrogram of topic-based item similarity using clustering."""
+    print("\nLaunched itemClustering.")
+    for targetCategory in targetCategories: 
+        ## Load topic scores per item and turn into score matrix
+        itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, 
+                                                topicsPerItem, sortingCriterium)
+        ## Do clustering on the dataframe
+        for method in methods: 
+            for metric in metrics: 
+                perform_itemClustering(itemScoreMatrix, targetCategory, 
+                                       method, metric, topicsPerItem, 
+                                       sortingCriterium, figsize, outfolder)
     print("Done.")
 
 
diff --git a/tmw_config.py b/tmw_config.py
index 26847d8..f44e880 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -15,8 +15,9 @@
 # 1. Preprocessing Texts
 # 2. Topic Modeling
 # 3. Posprocessing Data
-# 4. Visualization
-# 5. Other / Obsolete
+# 4. Basic Visualizations
+# 5. Advanced Visualizations
+# 6. Other / Obsolete
 
 import tmw
 #print(help(topmod))
@@ -139,7 +140,7 @@
 
 
 ################################
-###    VISUALIZATION         ###
+###  BASIC VISUALIZATION     ###
 ################################
 
 ### make_wordle_from_mallet
@@ -192,7 +193,13 @@
 dpi = 300
 #tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numberOfTopics, targetCategories, topItemsShown, fontscale, height, dpi)
 
-### plot_distinctiveness_heatmap
+
+
+################################
+###  ADVANCED VISUALIZATION  ###
+################################
+
+### plot_distinctiveness_heatmap ###
 ### For each category, make a heatmap of most distinctive topics. 
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
@@ -219,23 +226,49 @@
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 ### topicClustering ###
-### This function will create a dendrogram grouping topics based on their word weight similarity.
-### wordsPerTopic: Number of top words for each topic to take into account for similarity measure.
-### method: The clustering method used to build the dendrogram. 
-###   Options: ward|single|complete|average|weighted|centroid|median
-###   See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html 
-### metric: The distance measure used to build the distance matrix.
-###   Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
-###   See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
-###   Interesting combination: *weighted+cosine  
+# This function will create a dendrogram grouping topics based on their word weight similarity.
+# Parameters 
+# wordsPerTopic: Number of top words for each topic to take into account for similarity measure.
+# method: The clustering method used to build the dendrogram. 
+#  Options: ward|single|complete|average|weighted|centroid|median
+#  See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html 
+# metric: The distance measure used to build the distance matrix.
+#  Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
+#  See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+#  Interesting combination: *weighted+cosine  
 wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt"
 outfolder = wdir + "8_visuals/clustering/"
-topicsToUse = 250 # should be identical to all topics modeled.
+topicsToUse = 250 # = all topics modeled
 wordsPerTopic = 50
 methods=["weighted"] # list
 metrics=["cosine"] # list
-tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse)
-
+#tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse)
+
+### itemClustering ###
+# This function creates a dendrogram of items in a category (authors, titles).
+# The clustering is based on the topic scores of the items. 
+# Input: the average topic score file for the category of interest. 
+# Parameters
+# figsize: The size of the resulting figure in inches, width x height.
+# sortingCriterium: Topics to be used are sorted by this criterium (descending)
+# topicsPerItem: Number of top topics to be used as the basis for clustering.
+# targetCategories: Things like author, title, year, depending on available data.
+# method: The clustering method used to build the dendrogram. 
+#  Options: ward|single|complete|average|weighted|centroid|median
+#  See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html 
+# metric: The distance measure used to build the distance matrix.
+#  Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
+#  See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
+#  Interesting combination: *weighted+cosine  
+averageDatasets = wdir+"/7_aggregates/avg*.csv" 
+figsize = (10,20) # width,height
+outfolder = wdir + "8_visuals/clustering/"
+topicsPerItem = 250 
+sortingCriterium = "std" # std|mean
+targetCategories = ["author-name", "title", "decade"] # list
+methods=["weighted"] # list
+metrics=["cosine"] # list
+tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
 
 
 

From 4ef7019dc66006d469d2781052c9f563686fc2c3 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sun, 30 Aug 2015 15:07:00 +0200
Subject: [PATCH 26/56] Added some minor TODOs

---
 tmw.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index c04190d..e811975 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1044,6 +1044,8 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder,
 ## topicClustering     ###
 ###########################
 
+# TOOD: Add figsize and orientation parameters.
+
 import scipy.cluster as sc
 
 def get_topWordScores(wordWeightsFile, WordsPerTopic):
@@ -1110,9 +1112,11 @@ def topicClustering(wordWeightsFile, wordsPerTopic, outfolder,
 
 
 ###########################
-## itemClustering     ###
+## itemClustering       ###
 ###########################
 
+# TOOD: Add orientation to parameters.
+
 import scipy.cluster as sc
 
 def build_itemScoreMatrix(averageDatasets, targetCategory, 

From ac05b364b23f66934d618b91b55ee73a8ac910b0 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sun, 30 Aug 2015 15:07:50 +0200
Subject: [PATCH 27/56] Added another TODO

---
 tmw.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tmw.py b/tmw.py
index e811975..70661dc 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1045,6 +1045,7 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder,
 ###########################
 
 # TOOD: Add figsize and orientation parameters.
+# TODO: Add "firstwords" as leaf labels instead of topic numbers.
 
 import scipy.cluster as sc
 

From 8b0ea137897e024c6805e99b7b2d6b59200918a4 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sun, 30 Aug 2015 20:19:28 +0200
Subject: [PATCH 28/56] First attempt at PCA, not ready yet

---
 __pycache__/tmw.cpython-34.pyc | Bin 32130 -> 32973 bytes
 tmw.py                         |  64 +++++++++++++++++++++++++++++++--
 tmw_config.py                  |  20 ++++++++---
 3 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 50b060cd8883e588396d79447445916300105ef3..fc565f93c63db31e0caa65d056bfe6f6a2d929f0 100644
GIT binary patch
delta 1004
zcmYjPT}Tvh5TDugK3VfTO-+l|q~HP%Ng;?HDnBw4KRQA%LXO=%J-ypI|NFm`Yeno5
zB6^7SJy0;}r4No23VQCLpyzul-+B;wj-Z+C(DwMx&HQHmvopWnKKPAaH<X?Otugvz
zbpQ6o7l3{E?@`IWLdThS*VC+CGaxqv`C*tH1~g4DEqDkpI|60~%z59x4kmz!kQ;^B
zQ7{+4%=-8Z22+W}>V(?9fG4Y&rAY<PR(JE$DqgIHmn(=jt53wHf<`r73Mu&5>s;AZ
zoO|A<m0wt@zFmEe_|%&$cdsX~rcMoBHG=gyBlI7lGsttkBqRL^DhTKTEF!E?tiW0j
zqynZ2ZV)PfYawt$ut*^QmDa<Cqj2>O$N<P7m<=HPVk$EOu-;Ko_{PxPORyFO8HU9O
zw{pY&>HwPxWE412hXT;fj2yyr8J9CKLr5wSoG8b8b+eGQ=XJSY>9kfdnG`lHQPb=>
zqmVBaY|EC8K!QXcnZ!s`XdJv@^P2FQn#syqM2fWSWC|iL`Un)SI6c6ogKI*Lk(SaD
ziHO+YbiX&g+R+h{X)$lfp={o=oovB&YHHC)i#>1pOv`#x(GuKmE-S4(P4=;Y_>ECV
zP@)*YAU5$&Ifvpsbv%?KOa**=IcCT?l4JEUxJ=xDF9tH#B!;Vit3suLH6VqUHwI&j
zx+F*dDna6`+50}$vOYXgvJFY2>aw8M2V~lGLMZIzSt(V_8oFZ?gyraI+tiD>g7gU{
zpp=uza`Ska`Yc$cUSBR*4h#|1$%%yekKPx|5-*kriz)VRvFY4myUpeq8=dwO{p{^{
zTd9`Qp?bT|%oWQ7Y7`sQ2x`hH6uVx&{H^UL=lqWs;}}EnfvzXUs;T=fUZ@>+mU32F
z*h$m!)pxSItgo{_jo9Uv?Xlr|;yg>`9k|Evk@v?fc6B!?T11O$ty&v-aji>hkF{!z
Inyx8-0c(5d6951J

delta 225
zcmX@x$kg<kQTZJYFIVgDN1@jaF)%!4KmyD_wgV6st8G*kWR_xN$Z=taa$!hGW=Kh4
zXklQ8a%D(iWC+$w-CV{jp1~-&xi8zFiBW0uz1%=%Mw873h0TnN9-FrmwK6frZMG_9
zXX4IgWMHV0^(;xvRhYa#Uu<$xc@tAX-sETH9~lca->8_!$k;u(yh?F$f9+YZl`KG;
z*+9-=6k-%$EZPXvtLe76sIG>Qy=WbfQ?zlibb}gW(`2WH0#P}BCLT^6As!JP5g<#5
KN18{5hY0`#3Ohmo

diff --git a/tmw.py b/tmw.py
index 70661dc..8502a5a 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1151,7 +1151,7 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric,
     sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="right")
 
     ## Format items labels to x-axis tick labels
-    plt.setp(plt.xticks()[1], rotation=90, fontsize = 12)
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 10)
     plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20)
     plt.ylabel("Distance", fontsize=16)
     plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16)
@@ -1161,7 +1161,7 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric,
     print("- saving image file.")
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".png"
+    figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".svg"
     plt.savefig(outfolder + figure_filename, dpi=600)
     plt.close()
     
@@ -1183,6 +1183,66 @@ def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem,
 
 
 
+
+###########################
+## itemPCA              ###
+###########################
+
+from sklearn.decomposition import PCA
+
+#def build_itemScoreMatrix(averageDatasets, targetCategory, 
+#                          topicsPerItem, sortingCriterium):
+#    """Reads Mallet output (topics with words and word weights) into dataframe.""" 
+#    print("- building item score matrix...")
+#    for averageFile in glob.glob(averageDatasets): 
+#        if targetCategory in averageFile:
+#            itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",")
+#            itemScores = itemScores.T 
+#            if sortingCriterium == "std": 
+#                itemScores["sorting"] = itemScores.std(axis=1)
+#            elif sortingCriterium == "mean": 
+#                itemScores["sorting"] = itemScores.mean(axis=1)
+#            itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False)
+#            itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1]
+#            itemScoreMatrix = itemScoreMatrix.T
+#            #print(itemScoreMatrix)
+#            return itemScoreMatrix
+
+def perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, 
+                    sortingCriterium, figsize, outfolder):
+    print("- doing the PCA...")
+    itemScoreMatrix = itemScoreMatrix.T
+    targetDimensions = 2
+    pca = PCA(n_components=targetDimensions)
+    pca = pca.fit(itemScoreMatrix)
+    pca = pca.transform(itemScoreMatrix)
+#   plt.scatter(pca[0,0:20], pca[1,0:20])
+    for i in list(range(0,len(pca)-1)):
+        plt.scatter(pca[i,:], pca[i+1,:])
+
+
+def itemPCA(averageDatasets, targetCategories, 
+            topicsPerItem, sortingCriterium, figsize, outfolder): 
+    """Function to perform PCA on per-item topic scores and plot the result."""
+    print("Launched itemPCA.")
+    for targetCategory in targetCategories: 
+        ## Load topic scores per item and turn into score matrix
+        ## (Using the function from itemClustering above!)
+        itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, 
+                                            topicsPerItem, sortingCriterium)
+        ## Do the PCA on the dataframe
+        perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, sortingCriterium, figsize, outfolder)
+    print("Done.")
+
+    
+    
+
+    
+
+
+
+
+
 ##################################################################
 ###    OTHER / OBSOLETE                                        ###
 ##################################################################
diff --git a/tmw_config.py b/tmw_config.py
index f44e880..b73a14b 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -260,15 +260,27 @@
 #  Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
 #  See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
 #  Interesting combination: *weighted+cosine  
+averageDatasets = wdir+"/7_aggregates/avg*title.csv" 
+figsize = (10,80) # width,height
+outfolder = wdir + "8_visuals/clustering/"
+topicsPerItem = 250
+sortingCriterium = "std" # std|mean
+targetCategories = ["title"] # list
+methods=["weighted"] # list
+metrics=["cosine"] # list
+#tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
+
+
+### itemPCA ###
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
-figsize = (10,20) # width,height
+figsize = (10,10) # width,height
 outfolder = wdir + "8_visuals/clustering/"
-topicsPerItem = 250 
+topicsPerItem = 250
 sortingCriterium = "std" # std|mean
-targetCategories = ["author-name", "title", "decade"] # list
+targetCategories = ["subgenre"] # list
 methods=["weighted"] # list
 metrics=["cosine"] # list
-tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
+tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)
 
 
 

From 5af5e9096b98dcc2b8d7af51398f7955db437bb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= <daniel.schloer@gmail.com>
Date: Mon, 31 Aug 2015 16:05:35 +0200
Subject: [PATCH 29/56] generalized segments to bins with variable number of
 bins

---
 tmw.py | 53 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

diff --git a/tmw.py b/tmw.py
index dabf8f4..c12940b 100644
--- a/tmw.py
+++ b/tmw.py
@@ -242,10 +242,11 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = 1, preserveparagr
 
     print("Done.")
 
-def segments_to_bins(inpath, outfile):
+def segments_to_bins(inpath, outfile, binsnb = 5):
     """Script for sorting text segments into bins."""
     print("\nLaunched segments_to_bins.")
 
+    import math, sys
     import os
     import glob
     from collections import Counter
@@ -254,10 +255,11 @@ def segments_to_bins(inpath, outfile):
     ### Define various objects for later use.
     txtids = []
     segids = []
-    #binsnb = 5
+
     filenames = []
     binids = []
 
+    offset = sys.maxsize # used to track wrong segmenting (i.e. with segment numbering not starting with 0)
 
     ### Get filenames, text identifiers, segment identifiers.
     for file in glob.glob(inpath):
@@ -267,9 +269,13 @@ def segments_to_bins(inpath, outfile):
         segid = filename[-4:]
         #print(filename, txtid, segid)
         segids.append(segid)
+        offset = min(offset, int(segid))
     #txtids_sr = pd.Series(txtids)
     #segids_sr = pd.Series(segids)
 
+    if offset > 0:
+        print("Warning! Segment numbering should start at 0. Using offset: " + str(offset))
+
     ### For each text identifier, get number of segments.
     txtids_ct = Counter(txtids)
     sum_segnbs = 0
@@ -280,14 +286,16 @@ def segments_to_bins(inpath, outfile):
         #print(txtid, segnb)
     print("Total number of segments: ", sum_segnbs)
 
+    for txtid in txtids_ct:
+        countsegs = txtids_ct[txtid]
+        if binsnb > int(countsegs):
+            print("Warning! You are expecting more bins than segments available! Bins will not be filled continuously!")
 
     ### Match each filename to the number of segments of the text.
 
-    bcount0 = 0
-    bcount1 = 0
-    bcount2 = 0
-    bcount3 = 0
-    bcount4 = 0
+    bcount = dict()
+    for i in range(0, binsnb):
+        bcount[i] = 0
 
     for file in glob.glob(inpath):
         filename = os.path.basename(file)[:-4]
@@ -303,32 +311,27 @@ def segments_to_bins(inpath, outfile):
         #print(txtid,segid,segnb)
         binid = ""
 
-        segprop = int(segid) / int(segnb)
+        segprop = (int(segid) - offset) / int(segnb)
         #print(txtid, segid, segnb, segprop)
-        if segprop > 0 and segprop <= 0.21:
-            binid = 1
-            bcount0 += 1
-        if segprop > 0.21 and segprop <= 0.41:
-            binid = 2
-            bcount1 += 1
-        if segprop > 0.41 and segprop <= 0.61:
-            binid = 3
-            bcount2 += 1
-        if segprop > 0.61 and segprop <= 0.81:
-            binid = 4
-            bcount3 += 1
-        if segprop > 0.81 and segprop <= 1:
-            binid = 5
-            bcount4 += 1
+
+
+        binid = math.floor(segprop * binsnb)
+
+        if binid == binsnb: # avoid 1.0 being in a separate bin (should never happen due to offset!)
+            print("Error: Segment numbering is wrong! Continuing anyway...")
+            binid -= 1
+
+        bcount[binid] += 1
+
         #print(segprop, binid)
 
-        filenames.append(filename[:10])
+        filenames.append(filename[:11])
         binids.append(binid)
     filenames_sr = pd.Series(filenames, name="filenames")
     binids_sr = pd.Series(binids, name="binids")
     files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1)
 
-    print("chunks per bin: ", bcount0,bcount1,bcount2,bcount3,bcount4)
+    print("chunks per bin: ", bcount)
     with open(outfile, "w") as outfile:
         files_and_bins.to_csv(outfile, index=False)
 

From b62adc94d88173560a2b2dab1c4e1baf225ab7f8 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 31 Aug 2015 17:06:51 +0200
Subject: [PATCH 30/56] renamed my_tmw.py to tmw_config.py

---
 my_tmw.py => tmw_config.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename my_tmw.py => tmw_config.py (100%)

diff --git a/my_tmw.py b/tmw_config.py
similarity index 100%
rename from my_tmw.py
rename to tmw_config.py

From 3607c7db2693790d92ffe67d7fee04ef5c9d08ca Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 31 Aug 2015 17:19:46 +0200
Subject: [PATCH 31/56] Remove traces of merge conflict

---
 __pycache__/tmw.cpython-34.pyc | Bin 32973 -> 35635 bytes
 tmw.py                         |   4 ----
 tmw_config.py                  |   8 ++++----
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index fc565f93c63db31e0caa65d056bfe6f6a2d929f0..ccb18ae9e6026ee38f88806d334f8bbd82232363 100644
GIT binary patch
delta 14900
zcmb7r32+?OdFFdP2QY&HFt`B{AP2lhBq35H#X}TDksu{r5Gg_g*&;O>OgDf5=fdj-
z2|_@|w&Hk8<*Hcf*ohN6vE{_MT-jA2m5Lo#tgOrHwY|31PVAN2$vW3M#r4LrQoEZi
zc^&Wf|8Foe8bWrKB!0Yp$N&EKzyI#$SKn={e8|}RgIH_yRO#=Z+VpK9{!Y~W37~x(
zKTh_=%}<nr!fFy$v-o)_Of-Z@H#3X~3;)96=Y=rC!ioy3h1+Pvgw-mnHeS*~lPxK7
zF=3*=U089Rh6x?Q>g0Btu)2iR&23ER5mqm^<HG727FIt$VD^Bp)^WRY6=|0+yI=h0
zZRPdC8srhI9TL_CZubgnqp%X(?h_U;ZQ^#nur>>83%7xCtFX3ld!4Yh3u_0rfl!k@
zC}xL*xq%;(BDYcG6T(agSs5M{Ch(T;6V^^%w@FyLgteR7*hC6zL|FGTC$<P{kFfUg
z5cYgPSo^rWO<4Pdb%5L0|3P6L<n|6>9TL`IZYPCxL|6}T8+#oU*2CQ1DXe3{dZf$`
z*!Z}x9_99KVVw}xNp6FJQDL3p_Wi5p+9S-p!a6Oi$G9KVJT9y=+}<awv%)&ZZBTYz
zSQof`Kv-kKdV<@a@}jUVar>YsTTcq>GCzRiabZ2h?Zd*lBCH8+gM7`<hs5krVLmLZ
ztHQd*LpXsctf#sCh_J2;>jt-Rm}i8Q;`XEWEPqy5X&%BkZ%hg+!w)BgWeLmXHV%y~
zr$p|Q$e$MGX<XX0ux5DdF=1teHOp-rT$6T2m}i7_Q&>6f2NR^YoXoG>3gnEK*`(}D
z@s_Q|7uS0o;q}t=qTz1<{#E$s{buCqh4UBF%CVInD$#E<>bNRfO1bH2$86JWJ7?$Z
zg6o*x<4xE4Lr^iZWucUsELwM5d%^XC1WoVLO@ngS`&LuAyctteGeCwW(;s?>;^!oO
zoE!j&m?6;0OR*dhaw#O1q*yY96%ezASPF>erRcMQqVIV_EQf^~5KCdP6cl*;#UZiW
zB$k@QY*1JsF^hGArsJQS08h%7uF_ygJTF+_FDJueN@CNbR9*NrPt^ST-+u1Mp8eMH
zVsOM=a8(lxhyA9aGg3;sGd#}^PZd=@?W#BiRUd#K%H)cUoox1-9DACWq6TPeJ&iSG
zit_~*oID%M7cE<(pm~LK-c|{kH(UjqXhrO{%DQ$1(`EEH#3Lc&G9f$Vpj11(r<-q-
znYLh!3dXFqVP-O@w&RytM<7WcLE!D%Fy$%qJ7E9;%)OHPGW?7#+Ld_1w{tPra68_y
z$UfQXsYp_`c^`>fG7cHu8<9;<yeeJX2&`(e=yRKd3x3m;MfhncqMxFxW9aCkak?b8
znQjasfX??CVgR>gwRk(C`Eo?eM#XFkT`nf71BQ#9Smja99Wdq&89!xL7m~9o3VpEc
z6}qb34HWRc;`sn53JIFDkrxBSEbKgsVE@5%i1h4oL5PL_?Y$B0Il3qCQz%#!-0I%w
zsN2<RZgD0X4!JS#wnJj03jUbp*4#tJMD+w<+D${K#{xuCg=!|c4;w)}x^D)B`i}Qn
z%b7-Ae)(r?Q`>i~`>YbccU(PPwYzpk%}Hn>X5Pa|ee*TG(<zeeZ_`diQhArum6dW0
zWnJuvB+jQDH&M}?gp*ygk0cgjkwm&wvI|zWFr8Su(h*5Oe-r71TXfU8L~$zN%k<*9
zNMg*sU0r%R>(21{L}bt1pWu%g!OqDxwS@>EYvBiSc0p~W#~lFP*JC|Bq%f+N09ggq
zLLlb785`<jb*$Nhu$xxXHjLxe)~#j2rtSmqVUu%Y70kBVMP)g;Y{t$OEPJ6*b9Z9$
zEPk9x0N91lunY$J<T}{_3(*&d8_~czc?i!@*&B$;F!e&ey8@#|Tn-sc!2yi+RmOJd
z!MN-UbQ;0XI-|BAZgd!<a!6L~b&vN8fz4$YW*BKGLXK@PwSyujMLs|VSkiMy<bom}
z63fl{IV^Hvk#7>q5&ev{&0;w!VBcq1?qTI27|y%62a2xEHfh%dxdh#3(|{SHH;$I&
zGA3}xQVSXW<(P0`3=ogOJirh1%H`Ho%xJ_2GpI#`Mpk7R@tGmg8n!bcNF*LXM_jmV
zRRcwsF@DcTS@bQ(SMhg<rFOPShsamWc2*uGD4!N$xkD_xS1h%H>}9cx!A`N<MILJw
z5h7wqP~Am_rq&4qb8ja|sOx+Z=8=}cLw1X$ZZX?UfhZoO+e6RfFx$vyYSn)}EacoN
zIWYkiT)H~<0Wsmi$i{gAx-KU>7pJ|?w4ZD%&btX^J0&~gLQ}ks_}=7LQu<A~tm9f)
zRq>MOX(j1Qxy74y0UD+PkU{U+_y{|sx8&}{=BE=kp4+?Y@G|S{VraBjut!t`(~`Z4
zb%Zse8y)H(fvQrOt%aO>bEJy;5RJAIh!JQd&_;k=ZwIwV`P3c)Wmd+$)O>(I4}c%m
zs;@|8l0B-Q23QaG(`Xl&2)CHCRl1O|r_vd>sMG<P(?B8XXq6fYo^n%JEF%x=hqDEG
zD6_w3O;!&Q4pMqQn9CMywSk%;*6Idj_9GQ@Wvf5MB<Bc#fDXp+H_m@!MyHGidyRH>
z{^6=R$Bl5HRfdfoiNArE+}0|$OZ5nr-=+8}L-#^|hVVnjVK+V(@LoCI<6Z1qU#{qI
zkm9I{4u?cOT-9OdC=|At<v7AH%CLnY1Tsi!2~}Ho4~ulL;$%l}%n2x2O|@C2NC5}5
z0z#>f7nRWw)*a9$C?AwsOC^mWdkN@4Xqfi>9a_e*ajS5l*ZfRM+A8MWCnnH~Q;?tA
zC`rN*dVq*lT0~xmG!hO58<FB5<tk%kmsr?DN;!ICogkfthQ?tbphM8oZs>WJm^&m{
zVPJ2dbB2%$2la~k>sLVW3?{0F&=V1}Oa$*H=Vs~OgbOW&P3RR1!@`Z1iCnA#&e0oj
zv5cG!&%I))PaK{5BRR3KS(2fsakW#f&DHg*TwOSzDcjAI;W+K$W>~0zK@26|41VYn
z5>Lu7ZgT+x|0ZZ0M!R+!cxLOw_L5B;rNEU7fw^fD%l)-G0F(RaLb~bf>z4<}@XTTr
z8iwd5cK3@V{KM@aM){0Ex6{snfJ8S;(IHHr>%rmsMQ8v$2Lnf8+esuq3Irv_X9Ow<
zb57m;w3nRo<pewdnO)G!8Tj*3F0Ui2PMvTG=p<Vym)CPA?iHi!#k2wb<0K?;a(Pe?
zO}NBfzG9b+DAIP^%`ZYSU{H}bV;Ft}*F<|uq!eUN0WK#87k6IHAU$$9vT%xu3}}MV
zpVffENn{IdF)^7fI3tVeqvzA}g>q)bwyNVWrzsxgjD+p(Nu4*SAawUGML&y2C(^0_
z)WeC(I?YHF=JS)bqTQUC;(QMH5Lvj1w42y7l6cCYf#TGZW4nlh){S$(qn9L#<*DlC
z7|c{rUn|Zh(#lTQ3l&e0$QSWQ#3kIBbfLN`k-nAA=F*cndpL1|MsH_xxkRDpCMN9!
z3}Oz)%M=UX#e8wz$=w-#OK!Q_M28`3bhk_<-4}}oPbpPYN7gt}7ARGbj(bw0q2r_r
zcW$TejEsz^U9>((@sb}Q`%5jy4};LG<@mw$Le^P~XJ+OLH=RTYnHC0)Bsl-{LkO$N
z_QGj)N;Ob%Pks>D-Hab_?l{^HkO%TxGDRE-zOh(v)T7iBE~N`r+VPw6=>>8m+DWlj
zA}j6(^4WqPz{PV+=m)KA#`QyRHPf~qnkwgtMdgPt<NAnfa4J*DDg7{nA>-Ui4rzad
z^aDjHx`4u@x_}2YMu1%uJ4dF*C|PD0IV%othT&82Tb-eKX94`M76Y9UQeJA7Lp<3G
z@mw>am!>i<a{c*y3cZC%her#OW`vVi^a1xIw|$v4L(FTFlG&0fmdsf78d8ojBl;$&
z*+v6S3hZJbXBSx3j*v6nI52Q6VO0%&2nwAN09>KbUvY-90GyveKaoH$^5D2^m2sn0
z4g_O}dSh^mtukt~8LjAx!dZ3%Bgkpn44T^t92<k|P>ry$F4$%C$1thYXbbisY$hMN
z9?LqVy0P-Zj@|MgNf+xPlbJMpV`PsXRK?p)g9CpndXz3UQ_Po+JuSWQu21xEqRj_u
zKAKDCC$03cXQdbG-ZxBfiyy$<`e9|KQ4LVt)Pov9zC`gvm-lRU-{Taypz6@!%uhOQ
z)}3dWJ5A#hN0`mjPmg|(T8ff*$dx@^$rKQ&O`%I|@;=u+&^L~p$st`6!A{u@M)U<n
z3^lj%Mz<@=l-^eYv}8=jW-L)mXOKu#bA*azjgTl7?^+<nfe~dp$!0CAu;Y~oi>nVr
zFt0INGdVlWff6aIO3=U%bE?L!mXbm^rsK*<>HVZPA)S>(pC#ok@4fwpAK(}=QWrzA
zW<p*kA%7X$aT?>q4)2Hky^|F1L8;Qy(_nZ_ZRUvtiU`dn9l<#|nmkJVq;z~Qc}Q);
zmHX|PbS{^2m2GRJx4oHxq+DEiY2bn2RhAsG*Kf(lT?)eP8iXq_rN&emEi*9o#D&Gk
zQ5dE|@pj>u8N|pHGjQe9lV+Hjk3MP!9vwYP0Nu7TW*TEBRg*Wk;rQVy!;*%2hR_Bn
zs6<fS?6}1eh3Yl6f@D$$OXe574{jKh&7Qwu5ETGiSEuG8iC-N{_=;<{OJblDZ4+{U
z+*S8e2OGCgu6P^V<ohRi)q=r!FJ-{V?z&hzNf_E!2`s0uuBaRq{kpxG?HtZmCC7~L
zwp>VTLb(xkIjtTBw#op-)@FzY{6MMb_#u`av!!6)UbRJ9(ui-+i+AByuVU3~%cpPJ
zDUB4K`a@nck(KXTaT71dGNJ)7z8L1jC7Nu~tYUGgDMF1)w+MH{Z^=U_Y&XS)!rSQ6
zi3ST&eD&}t2<j}nh^k^UTbSmQ*55m{ZgcgPHL0B}ZWZ*|d<g-@cfF5oS}$K%`Rt~D
zFix*YE>p_8G+Zi^8(04ZJNc1pp)~IjtJHvZck7dztE<euN|sm3W}K8Z|0$Yide3a@
zKWBCzUBprtY)nYqwC_08E~}W)-HzG$wpj>HPzdEC)}7G%vuy+Ne|q29wpqUD{bbv7
z5l#_JkmtQ;`<~r(Gl}9D>L#f?^v;4T3_`Oi<;$qC!B8Jw`R?|uM)@U79D_#O%!1o6
z*b>jmBk&xdz@br8!I^R#L)>MHM^TnKMPM1g3>V>+;knQAf{J@OGwQ=qlyd2eJ<bK8
z?3+{?B7Ndk_J83q8Gl;K3!T7kCL2o5+-yco7jh4BNPpB*G#bq2iWxOd&A=jv{UBOq
zWJ(qDDGnCk7jpCYg5%3^b(L0~_r7=Eo)cy(a~<IwX@+BVk(w6~=4mIILA(Z4$XbP>
zqpmYb?3?0K`vEl7cJIp0-K}O6VNqp~>3v}5;BW)4+lkkDE5^Cm@V>h9n0#d=wri(6
ztxga{zf0g{0v{#t>j2})sOA7Owx%)cI^ILGm;=Ng^+9iWcjDx4Q*#{vf-Za758zac
z*pEP#sMMZqtbzHhL|b+AoZ<cT?u7jGiW~{a!%P#AlMHJ@K1}#OP2d#*w6h<A6PmV@
z5%q`E^~(f)%X{Sh-ic3A^H&J`F@b*v;K%YPEZSW4zn7FKV@i=gUMGPht0@9egHrhR
z5Ck&}(R_8~Q}>U_FiYVty|njKxmE~i(9c(~T3rKx(Va<`Y}g+rae(^&lz=9dT1@Nb
zsMT<FT^R_TYp3ezMpd>-ob<l9w^e?A<(qpwIZXOerzF3G$1!yd&=20R(@O2~Uf8!m
z{+ai?`wk}Gk!<!A`c4VmhrG!CO=UU;q=BgRn{?6B(PojYJyX6PgJ#pTD$bW|%WO~I
zny!~Qvkk&LpJN+EfkXW>TiS+$55iVv=5uuR^eylA_Q&Nfcz?2gtNhB!-|RnSl&e-i
zv5{Z{z(-}Z=hCv~#9>giXe4&LC6nrLB0oBnRgODBg;`U#bvdU#gsPOcYOMGTtW;kj
z@G%1CCjcqDXGeXR9s*<mSSiSDYRwp@*2f9Zd^4tW88}5LO<gEf^FY^LI@EKM<%2?d
z#k~D0Ese7FGSTYyXy_P$KOpcsG#;-VN+%*5_cRJqyOX|U!}zbB|Kr}f4jq<X_davz
zwR4({HMYJ?xFrFyqgsk0X(yxW@8hTOZi>|}%7)RU^buh!JevA~_ZNq^o-@1H?>KC5
zQWdFaB&d@YR~({tG5?u_o9;pw>*am2-B7Quj2?-|@~$c^k5P*f&}2xxj$dYkYzS|m
zuje(I@HLvyq8arq>L6~J9oo;;(Ps9btK$CZyW!5sBT|>6{)>17i(BX+XX~y{5zaD!
ze@mc2eA(pIO@G~MK00iCTX;K<A};a%=;;1NhsZ17CyFZNK=$o_AV%N^CP@9QSm}HC
zEApBdpG;?N(n*nl@mv%mXW)?ha2{_WvITV<^A{nQAU$D5HM*(!LPq@!2GkOO-#mpZ
zobuee{qz_>MrnEvJ+ebvGV*8Y+XTMBlc#NzJ)wnWGm=@@kn|m$1qVtw7ZuYy>J_*3
zOK4CILV+=aTnB*&8J;zzoa(!H!wOlP%1-MSoiX(-aTC*PvdvoXcZA|D0T%TQv@k+k
z#e3R$%WShy>xE6xtZ*oAot{^Aiud>1D^Jw`vhIq!3&#iL|L|sy?=;>NE5CgFxR$TQ
z9u5@~XY%Q3+eu`Q48xWk;mDxk4E?~xGw0Pm5luC>$cBQ=Klw&KNDHA2IAOBA7C)ng
zY)_$bFb|}7A+NI2Gp<4<Q4qR|_4syBC;QQ6#4otlbWXp74onso%%%!mjWX9l{i!U|
z9u9b)Jh936k?_8J;!6ds+^o%fE38Yuj!nkd<@?&auv|l@)-ro5n4ApwC;+S)kJA)v
zRxMOLEuI|h>fL~hNx$z7bRsDq!22roFT%TT^lQ7>$gp!lvdMm^wj)JO>O%lja7Pk!
z-rIU=dx*9%y-TNt_t)JaP12DKzl1yE+IeS3byuSeMSy;z(@?>cPoC<QncA+}tdU*S
z0neYKbsE>Z1{LZPH0xgoe2jnq57jr(qH@2hLopmeQumLAH}u%Rb)wa8LJBcTNe=De
zM=7KjFJ8<QD_I?%ViOIuarjdmet_uj#(=;}ynY!$?2*8`zJAn5U<souFFm#-576x)
zAVn3B3JQ1GSAP$E9QMwW1dbO=yYYjFKQZ<B&!Q-&KH{A`liX(p-0>7Xi6|AOS*;+Q
z#D70nBGSIkO`NObhf#mZ+;r4G3hy`043t|4JIi;?SN(wcwgKS$5jwv?pa}qFJhCuk
z#QgA#&0@|<%sTx;TE`4vz+4!pL^+dbm6<`|lgzQEf0P*F9XvZM2Uo70y-$v?ys+j4
zX}>6eL4ahNkB|eVYT72o;}j0v$Sdczmcuk>9f9=(YNYVBG1&ncAlmg&hO0E{OCt8v
zXu3u;bz+JL;>9eNq^KU#BX>%b0FI17WB5b4NQ>~EI{!PDv<9-wVFm@XYf0P77-B(#
zcl|N7+3c;YWMdZw`|zf=QWJubp}fWgGd1kJ`@&!92#K{!v4FGGBL-b--%qUW>xW=m
zZG~wSk;ZLOlETV$8K%BaM^f`U@v9F)-9!B1WgJjz>0j{P9P5)?Rw7UQifn12<uL-S
z-lr}O9b^&y2vHKJ&JF@>sM@I2PJr_|H0aHh{-e5Hy0pGbSGrmW;qa@QW|KTaf&u#I
zv7bN>fnEXw0DjCR=}hI)cMyHNqq9;qS&wJ|;frL+p&Qz|rgSw5y7#$DLkMF3>e6<(
zZ6);NRjC~=%koc%N_M@&MB5eu+W?@TL)2!?VTss4j~fYWCeR?(HHl_V%HqScvW5=R
zE(}<#+gn%Vn(yhO-uEvbklVfC@h|JlTieOC9ESS8!1Pl+g%;m6B^OKmUNU<#$ZYcI
zQe8m$MUrP|cq`ezRj;vQ<@QrHVkRP<<XHz6EE%jXV1uZLco#MQga8pZ2JXF+v?D=v
zEuD*#IoGlMd9?7R*>Uf{L4T1R@Pfg8aKG9i#TB7K*a>c;su_w63ttEw)Q`aEU*>oh
zg?rbsZ#jO{Dduh6$FATW^WwcSu^tilS0^^hk(D<mx@3<TMga^tDv}deQ5_4K-oUko
zjPD8W%C%t~2-kGtG`3!hTX2@UR~b5aXLr4k@4#UTj@d#Ui(cd)D>s`ebL*6Z^@BB~
zrX*W&<Oa1krQh7Mn$`58NuP{4!g|4iSu?fwD8obMX88jzWe&>&EAKYjr2MG&fv5Yg
z*3A0|ZGC`P-lBDY#h(>{#h>|JGwr-pP5c|>{iW-}_+G}_e0`gI5E4G%Wv+keUnk{}
zIvJ{^fh*uT7nRg~L?;Q0+DXI)eum_eZA|e8n_2JoZ?yJpgOku@{Z)xa)S?cqymn)_
zPUdwQug&4xg*@3>M_)C1(s9dc??9wFNHkxiQI_dQO{V=ARm<=ui;Vrwf)2=8jd#(S
z-2~P&DBiE8293Qz?~hZ1^6<(xQ@<wL*}y-HU3?>VTP=Cz^zKV+<R1W*#a?#s37sG&
zcBCgA)X3bV+J`|udNw_y7VR1O;IifeFe2`@0o2U@VeiN3W4lNgD}{9JpN7#<vvhAo
zj#3)4P*TS(2fY`j)_d=n+|5p7EE(4k=)cloH0Xj>1-_7)DpbNDmQ>aPHpMKT6o2?S
zv2+fcPfyx8)u<RC-0!c1Lt0GosBWV^PPY1F(92}DgdYW(OvroX{_d5JXFe<Sdn!IM
z*>y$SxgPYCy}zYqyj#3i>>Xjve($^Xj*Y};^)Lba2z7#5Y*mg?i}||kXYf@$f~i^J
z=;IY`8Z~6e4jo&D@a0bKj(SH1lOL{g%WH+24GgZ>nzu1{uoBlDCv1&8_43n0#?_$r
z{^>39v6WY*mt>j6GHlrydU<yWiz;u=#^BXzHs~;6BW|n50PtpwopH?&u??zOIu0+B
z#7(|(wnm3&b)z*Bpxy6U^}YzxpP(<evv_qH^uCbYCC_;OlzlU$V@8kgQN3FyD?nFQ
zl3~AxVoaqz%BjF#&<u8JUqH(b!qQvGpx;jE-2_MD{L<0%PTaij!C{)rfJ1@~vQ7xx
zCNNjKtry8Y5l84_CaTE5I*qNod^1;Hk$If3kgVOMnfSbtSxDQfT4cl4=xfwBPT=DN
zm|!Z7>4F)Xq~D}Ribzq@y@2DcD$IECDSW^lr?l{iDG+$6LLqBB-pW(D#bvSICj{qt
zC0myZYZjh6Cg-y`3;q>Vc0J=)z5klu8orE$73CT#EX#FI;JMBQaU9BObi(_?;-+@8
zWoFYTh)Ne0)jsdP70)IQ69Gg@ofpntHP}W?5IVw%H)~PvYH6_ieN4o+=v=x$s)z!c
zDo(3(zD_Crkr4LLN_Nuhd)IKy+8os^QImR-){GN)3P6AH=UjxWQF6#prTK6h#SZk{
zT!l}K+C${^At0i!<w-WOnUlGBics(w$EUnKb6e#z-j%r@=xBC~_97OlX8;f_oul>*
z0;I3_#=>@IiukUB8VZuHF47p?a|5wjDu?OhESl{NTfa(zNzQ;u^zBA_1j$dU+#pqY
z<v*$CBs+I)jed=Auo`jrn54Z(Sov1*)r0=dNBu|YdYQnV1Dqbyp9)vrQgf7EQ!h@K
z7!j{xZTlEjA#q|8VMy;kxSQL|mP$%W#S+u|XLqQD52R1!ZJb{xU-yp8Z<hATGxL8S
zk5_L>T>+7P7+>YsIY<39FsuJVpymr}a!A6oI7A_Q!uwpYw+q=m3uUtw%8U)EoWGqZ
zvs98{f>b)UbOAt1Wu@GG6^PU|0ww{HOZ7ChC@$8ekL&bU3lIN)<kGIoqFNL_bgxtl
zYG<CLz6=36y_U|-EY7uh5G|l)l=|3vI`^9T&n~Q&|J{3iAt`58j60|svnjF(;S9j4
z4X^H?cvp@nQ#9#W0!##lu}o);#9`tD1D44)qNJt`p%6r}8&e>X_#`Dl3Ay*eVpL|m
z4=(<Fc^LzKtbiCCuYL>kUv9K~(xq0ai#l$@e|bS&b}H-o5f`5sIiz`YhV*{S*TtG9
z6j@woA}nQ%p=+@*4%>JbwtbX*j6<J3nSoL!&2JU&QM6TEr^8)5`RJaIw|A*u7QM4e
z-<hs)vj`$l`eAWr_N}q6t_d+|4V-JLAb*%x7Q=y<WdUN@kW?k_{mV=77CI-5RX!N@
z`hRXi`J?nqrlgVV7*(R_b0_)$DMe_*m_`2|CN7mr#GY&XQsE+@VWr+ot(^q=2yp#%
z549d4aD%`Mfw{*CIs_I8ypO;O1YQL2L-;7KbVp|cAEsV@P5EhRy-MKo1inJx>)wxk
zZX=t1^#(nEgTP-Bc%8tv3H&XAHwpZR09PS{I7qGN#pNb7OnqAjP{}}T2cU1c(vwA1
zQ2VJ1`KwUXgi1X~j|VxYeS3tX&d}OcCzCDR8OdlXk8-naD7Fu8H}EP=sH)nfvb5fN
zquiG!8CE1CszhLp0Np~}wGGp$SdW^I;=feQ=j>y2+s^v|=$$M6#$t!sW9?ylEf$E4
X#~x^(j2W@ESWj$Y?2MOr*X;iTjZPfR

delta 12378
zcmb7K3vgUlc|PavYNg$kw37UgAF{o&vE#KZ*-re39ovcHNBoFoJ9hk#?QB+iujG~Y
z<#VrWS+-{<<2WQ?S_o$df$(U;tAr4sI|KrOv<=e&B``yQ(o2Ezn&33hl9EC_ou>cy
zpOtpKR)8wm-{+p^fBy6RkMrMKe$;yDK5N0Rn_D73-SM-#@B4-jKNB^70V>}{Kez9Z
z1$#?Eq#A`yv`N@hXcG4e(H|Bz{T~r_RNOB`KNW<qrN~A_e^l7b!fxSiRA?1;jLR*;
zZWDGpm#MNt*i*P16ZTYLPvbImm@e#2F1MRGJ4Cia^iLUIARzjKkF1<knj!3&+>OT0
z683B^PZM^Ru;*}@=9w$(d0g%kc3jxqT&98Zg}s2wGll&YVK3w|O=7k%TMT!J{yF@(
zNMz@V+&s}gPsmDhT=dgir5l93n8$SsJ0a{PT&9Vou$KyZ8B1k>uzQ5v%T2V-a$&FF
z@<L&+6!wi=UL@?BguRN(H0Nqzui^4yVXqbT&0MDW*9m()mzN0p7GZBF@dK^2QP`Wf
zyiC})3i~!L)2f?=eLI(X$M>~d^sf;17GZDYdg5T4u(xyhMq%F}>>XStZgvWL7nfHF
zd$+Lf<T7!#N7#K_UL#8OUSaR!2V!%-u<zpX&B8t)?1Nk;mW@Eyi{V>D{{~^-E$l<w
zL`)wR_7N^`682GH_j8#xa7@_8xqRDo!%qnN9&Vz&ok$A%UVgY;*ePKTaG5qoOQuD3
ztH^B={oCl!Y+*axwq4kR!XDx>ZPu{1L-g+u_DNx9xSlpH#eu|d`H?`@lD_`mhDD8j
zz;iNVaa|0}(K|w)qT1hwHZ1qUNA}#gFQr^Z`3*(<>qlHqWr|5JHRSrOrtD;$oRjxl
zU#A<7u5Kii@}r~0<Uqkb=Q*REA7t8C81W_wgPE-3hp47F;(c7GCHgmwrO+}OsUrIG
z;S;4W6%#=<1vPU)W)bll(%FLRB!YgU>kMH@)s5<4u3$SVfo@AC$WoLr^a%aj5D_6e
z<aEh(X?~`v2>rnD7W%mx$RYsQme79|Suex-tC56^=+`3qt>>lQ7M<Vup!7oG0{6NQ
z7MCqC^Fkxp3jJ&}*WW0Ho5XNfP|YPN)N|6Knn>kQo_<cAe!eoA$B0V-A-x9SS(uPU
z5FI9?5<q#z+EfD$??7IN(NF8YX`U9$TV6<aw79p_t@A?NIxoo6FUZ3av(kP}Swda5
zFq_^Ronb`-dSlCuQr$Gsz_si~1GMxXWY0*Vx)&^vi1sN-qD)9WFx5omE2vO7&=EZT
z?Z?iqU$M&Wzc?1`^#~j?b>$8EQ#VpuqS<dOxV^=cchVeqAnWA)&|pF3Ql8(KE{x<o
zLSjdHL@5ByfWXa+IR+%HSShBTiOuQkMKKOCpNJpwQfkQYeErkd!V+T%5Xz@=j^C)9
zEEihO7L@H~GifK2x1G^CEcwl6RmO8FYo=*Z_Ye^w+k)LPXm!e2vR%eyr`2XftXbA9
z8IjYihzy~QfH$R~P0q0z12e78&DS(c4Rlz+z^vM?fjLrbrm4558?XZXwbQSSehky6
zq;B6mv(!LpZwWgnvH_6`ii;tJ({M-(H;M}qD01(_2@)#JqXd}tNH5GNWz20ou|Zy>
z?hQa6fNP11O~ND8k{4{i<J+gpi(#IICXS+>CbmQ58rm5D7s)4DB1ImMXbaCoP^L=K
zg;wD;qkAVg(N0WF<0omk*gQUeOk9X!fC@r0hTAHSV0=i3i!EaGvLL<Ndg7wESZWm)
zTE)ef@Y=;Nd0Cp6rjJ3bs+Gjn^_AqIYbr00LenTIF4A&sVwhZTD;}jc70)C(;`st8
zX+T^Eic2Kr)9v!`VS83c>_2k)xH#;QIp7H-iRD0|eJrOx+_7b9Je$c8UhKGX+@h2A
z;)G)TddJGS72QZkzXgCxdWBO?9uOw$Bl`5-DZOm{uF9UVrepCF=T|OUcaarotYK>*
z@ARrD^-D}s&Ge7D354<7lFxdldM75k9SuewPy%=qP^3CQ*se}RX${C)kkueJgOt{R
z&Iais;)hbjqLa57lZkdU4-IjUThKZM`sWq0j!Naz&R{C-6_nbDX7HuJ2<sS3zOmP#
zOr9U0nQ9ZrtsqcYKbVE`sYNI^utL`BVYs4#j(UnZxv(38RIW|7SskQ|)2$e(TNf$b
zxazf8A*)4(tZ9<|vzld#yg{nn<^HZ7xpoB&W{btEBrM<RUEv2+;jCM??A6EyJCpX(
zg<Nsd%R+y7&WEP4uYsHKn>J)qxdA)1=~bc6%)O}u@%n+Zd&UnbCq>&~VZwmrijV>y
zSRj`w`XR?nr;3grJms8oS=xz^-|mhKxL(E^@iK+HyB#A~9!;pnqaQ>`?ZGk)${DH{
zD?-*m>}Mr2HMQs{Sk7c_B<p30S?2(a`B!RpfgBO+kTDWPXQ0<o_m+>&^JEFmumZ9~
zv%%K{T1Y6g6Ll|jseu(ksERE<aQNIrlR?RGbdQ6;QS&Yo#`rDitdn9C!1Nm5J%<Ha
zm1?x82v<fqel2vSJ1*1Zhr4YlSLlDAzpli35H@@=(!ngnCnEzzRY*Iot72HR${w#A
zOsBG0ax_f@!a^2j6UQ)e5JN-sFfoyGFc?_(8x3Avqbxg6zgUQ+)IbNCOOa;OaUR1O
zNl!VBnKhuly&xe^mH%hKjlm-fIe5aWGJFL(f5pNgfkD-$_HwCj@1C*nhLQZKeBo?<
zlOLp(Lw?}Uw!6p`QgQQUKd^c0E)c4A+&<sx+oHnyfyKA2n_whisXH-Q5IRJ*rpfgR
z#cYNGwg~|fq*1|;KA_)Nyfql15jwVDrV(w25iQ$+s#xk|3aG0TR%&o*eow7J#Wt?U
z7S57H85FQ1eDcOo&!NY})EY~&90OwGJeHC)uAp3FgQ{z@=-FaTz{wBuvL>*kIFtf~
zl-h~eOaqw{$8X>UKTs^Vegi|wkLI1TFhW)MK`x|0=jfm5iZ!S;mpbJn&7>5^&ez_O
zjJ#a_(UON{iJ~2Ga4f_YIg)5JqGHG}yLD0;#N1&&nj>5|UXlYG@@N&^ATSBPZG7`d
zLhmp+3<E&S=0}<5q|x+Yj9b8`$b?z#V8m6X&*X{)<qhjd&usZnc~Q^H*7ix+Wlovr
z$=GdE4`A(ZCQmUs+v~VKu;Q-B#3)~XYQ+rO?<8Pl(n<DZXPwN@NzYB9yWeiYdvb7S
zhk=`@vI}Wb?fM<p^m4rvMR>f$njPw;D`(1*-nMdqe3#CwJl_Nh=37Nu|7zumo|=xR
zZl*XdnIp}!2?`&RWbZ;$Eo8)+YpD;Ecip(ql8@@*s;L{^P1P!kF|<jF_>$>DRzW?l
zusnh&!N{mQ`h*H9LqVtCz<u?XS4}U$_hf#9=oQ_#lzE*V6Kw_$@4>hx7|tY|K@-IT
zglqCGMnA}54v{U-WD99O2=C$t#xh0qUW^J4szNTwm1JeIBe}fm%Y%L>UC5_Xp87EQ
z=sl~KwfQY9UpGUc@hI+K53Qa(&6gQJOe-asS3f{S)vLd_x@VUkA-7QRpnglG;HC={
zr2uAhQs&`0lLo2Qv_B*DjcYc^uar-(St?6cAY%0ekY_+X0`gHJ2NMw^3U(w69hNOS
zj`QdRQ78z??3)}WR5H=zM~JE7hzngIf9DLj>RI%H=KBG9Cet_S@tXdUFae4*UuCX~
z+FQW)MC<qH8h3L%^b*aHZqV}bR9(9H)e`d!mJ=bv><6$AI;pRK@G&-!n;LQwVf9s1
zy$|HCi1=-}luO5IR%bOSqwW15-vD_Ly_#!#)yeh-Kf-e-0oa!TShy?^h)}H3EMpA>
z+z>(cJLUHEeKN$LeM~#IY%4(vYMgP^(R~L?hP(o!uE9M?5eAtkW;~8D-v*i3f?|gM
z5p{K{G$l>g^_*la^KVQrs26Q$lix3|-Js?CM=_dOgfje5sm~9lMl-G-Jm;j8TA}}K
z<6QY&9oe)t@uqyUGnv4GdRz~0nqT@J%|$2z&wiu1#<?m^Pm0|E^UZ;1<B%$h6dl`-
zrOpi1V%2XYbW;#CZtrWK*%a4pyNirzdL)aTr_Si;t!=Vhcip;B{-S*Qt=p{9gxOHo
zbst*{_0M1kc7kuZMijWMjal)<S7pKv@)a{Om{G2G823!ylud{mK~EFjC#D+=R%P)q
z%=%-Hr$9=q`*gZ+-hP1e-VcxkQLIgHtVX3ifzs1N{Llf1ESKvyD|4|;;#eld4U4+D
zVo|{5cEp}>%*3%e3;g&mF=>Qt0q;b85gm4cya4hX+S{r-HOnJpx(;A5H+9A#Z!s>E
zW%`G=uamFo&Mn{CZG>ADHX9a(7>gMK_robSO*fNFe(2zKN*4h-euV9<mm1(WnH-_|
zfnK?F;ckBlWSunTEm^@zJ>olfaK(=r&vcB!Gq;Im7S?RJM8+(2wfw}^uq<&P2VQ)0
z=!ph(6;EN99G>4C*sD~jULlf*8Y%q_6%du*&QdhKvtop&QB}od*6>@g&D_&fzsJ}O
zQM6|rP*;5kbB==iHOP_)!r?&-EY);R>yz8NEh+Wewm&RK_4GSd)q7DML5`=ONMb2M
z0i=LHRRmN-me1bt5AvwrHjql6!cOz#;Lw+@Y$wUd`Jo)8TQd184b*>(ph|8f4ug*A
z21oK~6{dq!IcoKr1`CwQp)2H-8F&m71_yop_|C<~x_zALltkzM<jzA*p6;2nLK9gx
zGApTb3XrGJF6;TBOwK)7IE&jRFGY+}Qv=<kh<{elR!mnH?BL!@2}J_pU}i}5&=XyL
zekV>*&nmFv7#IZ~(?A&Ki!d+@sZv^U#P(Zly8V)wF`^)SqxnXZljQaNSmkNl2-O~u
z{^YJMc~*aY*HWunmVdYFHUqA)X&he0cji(<jvG&_0^P&y^(Fye{Hh<=xARUlA3Rk(
zESruAH|qy6kX$J3FcGW9+;r7;1}U5wp-JiB6e2fu0sExs_SMd}29@2|Z4<>nbgrqa
z$*Khg3ZrxhGJMr;2z{fdPiC0+M=TxMGhZ&#_ucdLyisgoS(rm%J>p=2gNKS~VOWz&
zO=buGp2j8uerwfmu%Tk}1nX4Gica-Snf`I0H$p8U&_P*%nUsoC%cXv}@0&eUe?^y8
z_Dt0k5s9hCi6AXOSEmo_%X@EVz%su6%--%*lWCdpP&+**=awf{wOT1BF~jPx)S7Z=
z-&C2du4-HbbIlg+w>0~9@^_PPp*{tEmgBIWLTM#R{{?O+C*qk{k2Y~5s(YorxPRtx
z=CzS-9b0iP!YY0QVaCD2zHFg#7vx>AnP}x;X`=Z{;2)8ipd9548K!t7PI;CYvJF;n
zL;1D+7vyT39>ur?3fw7MaiyOgRXaF%l1d{;IUOt%d+0}zWqk0(=hgbSdP0BbK;kBP
z<!~@bS>R%Rh_#Bq3HkfMB3PS2b0F^XC`mHvWzwhUHYoKQ2WC>@k7ln^;x4Sny4$2~
zLHQ<-b(kZ}1_n0F51r)uiP7OWrir6l-7(IrL_Gs3l|GqL9QGRizXgHlcOUGQd&|!q
zTq1iJFRXZLFwq8(y+jhN2tvqG4LJ}z?ZU;0v>#enx*2`;fb@Z|Ibl}#*qH6rXaMhK
zD|PI(LOk#ghNqu{C$J!blyRnv4uy<Fr^%&K9iTbV)Xsmn&P0X&?4iHhZ#0lGM?Dbs
zPKGvyBY;MpwHMs5)dQj!NVQ_q&m8XR{5?2<L<A|>4B1Rcw+LmV^iK}|i;0;YqozK^
z0B6)wOgh=RjB>2&H;`?m<WS={q^SzG9agRbWa;xYEUkYtd36$~r$JskjDu{0{(#<p
zv{N1~XO2E5qZ=`N6UeQ)`PiJb4B;n;33WRvw}7yrx(%hxAWx!|3dl{HKebKYdu(<I
zXPT&FPdFgE1Kj~=f-sR?AiF_!fb0ahlZfB!!3Za_sdE&1Od6qalC}!=ut5l)Btr*B
zv~W^*)gxWcJ3fcv+KtC=kVnhT@gveWTn6(BaKx_nAWDZpjuIga-G_3VRxyy+kH@<}
z4uRC+x(aFbq>LZtl~uG3*RhsyVDmvO*1PXnE&JgyN`J&-s2*DT54sA&?4;18t@zVn
z_fFDPhf=aN(db>2orBkCsdXF2%KwyfWEIdgK8zKHSP>PB&qDEAAYibMr1xfUrwgrT
zt52gV?74~dO`K1+%X8$Qe@N9xJ$P$Ysbf+cqNF`G9jU5@V#~q@f{jvaoXs9E7yA&+
z?wp}3hg--wrp~Bv{W<9Nf!Xr7?ipAh2g>^grpRf2h>~V_=@x+*HL;+t&!um%+NFLr
z-EE@a38lCK{U2+y$y@e}3v}RIPpy@2r_E9_BMO&A*$k7ptMsi=64npal&X>(p)J>`
z#WwR+gVn667mY?QSR$+!49u#ny-pb(w-?BB`USgN7Ro=dTc!Mn{tsuyk*a+kz|sYX
zJZf}+En=K@Yv9jv=k=;fRpEb~obMg%w%!@gmj)NfB0+qnesS<?e;UlgH6W_q3=t7=
zQb?Tuqd<$g2iyk!2=rkclLc~B2QxZ)vZeDg!2Gxo=SYd-7InJ3{A70x==*9=e=W)r
zEj9d2#7`!2dG$@0RHwl+qn|+?u7cWc##?^6y~5lt&6|jv^*D(!DUeAmioTlZvYrp<
z8N*%DEw3JaT*lbMb86Vio>e*h<5NBR+c80g$k=ptr*U%&i!V+MxRiDC5^9(l{m8D=
zNj2u2#Ovp(7oZk#1<^>=0c_DHvYVCxjg<tv@le;Ax~t(0xW|kZ)utZ>^ee^L`ls0*
z_8NWTnZrNPW7H{w(FHz`sxDLlqB!wn>@(Y9)&)c$<5|DeccuoM?4$wFj7mUcV3MPh
zI3?9#tiKu1FXrAN&zFCh`>r(a!+56%Ce__UUZyu6g;i0$d$HU$>h|K|kdeB+wYYd5
z#H-GNK$_|TN^DI=QDSi}`Xl00=cwy%A;=vSAzIbR=zS&7>^IOm%<MV!rgDY<t#QYb
z#h7(+t=O12*k+w4xpq|ox-ralosoIz^c?GF0sZRfx5)d-?X=etqcLPVX}lyX(V$8a
zsE@o^)%LiU?L5eRMCf+PPPl4-*!EP-4o+1q2U~p7Y=t0@>!Zn;B$z4VUYFwYy=(_^
z&VL8=LT{P8r0?>6lQhov59VRm-%$ppl0HQmM!gQg&g=(N@`GgIZTfO2hI`#%^O^vc
zHeWwEvSiJ*`r)_3C{e_AnED!b)kV>(xxmqf5=g_4QF^FsoypduVbsf55?H%JPMczI
zBSI{%su2wvp;u7%36Q5jnBiZc#Mx-JM4!PULQzWjK1k9V*OP;VyXh;aHrxmI4iba=
zD;%<-QyL%ltX&xUJjh#lBpa1Iljf(}2Szg4N><ABSg1FRE|hOC+oO-lSr6mfsuqXi
zPS&2k(^JRh$6&eq#;wFlDnF)%^|rBH31l6agBqusIYO~HStO0=FeklyYu2A1>nhE_
z3?Y-7psOKX^B0CxDp#WmvoUBF2>WDqwMXzcnUYlPPNVvBbb1HKBSg&S4(>j3HMk6N
z5b1AhMW}!e2P)#JR~p)k-%0TgzsV*@WYSx*BM3++=eJhB<NQMTKK<<ZUz_0dA*}Uw
zkoOUx;OH{S?*)ltqDFcxbh1FXR}>ThUcC)%INmyDYcDq#8LL}*H}Hu4BTXN(Vqv=M
zw8*(qeXzXj;(5ug+!&$1pgz^4xH^utfKfhG$~#ellD49LgsNvi_<q!9zK5#3-R7XY
zs#SbJ;)!?^E7?;tlrBna8E%q#QE5S|AFW(Ljh-DY&516-0!)_bQfZb<>-Uxx$d8u4
zTzW>{HgQtwoy4Rc%EBSIstFt~2AK?p0MpnUMC@@rZ*2M$y4^D<n+zz^^fFI>_r7$A
zQ3-oQsC3Vm3%@~SCExuntnetvyFq|U^%w}kV3X>24<2`b>;(D$;L=f5U`EwPFr4GU
z>YkrK-IE~Ly}?UIhW0hS5d%;^f;#q<^cDg=Yh~JXX|^2EH(yG~r^^GEC`Zgj{G;gh
zG?8)BJ+Xq~RqsOk`$0Yc!b~uC%x9Iwnw3Z<ek(YsYQsAKNt3OK$QCP%3%Gve{)qgf
ze*ONRm+~0WoTreR-VW#S^_-%`%%PH|Fry5tXQ-OKM#y-6*rPX;E;O&key+kGwi=C;
z7^W{e6&c8?KqnJp8=1s)WYV{0=$lO!-?UJ^Hl0$R!&G&fijC_!Hn(rf<}U{IBM;1w
zpVd!2@PnbMG(SsBQmTvL&f=S7U30?BTHPXzmC3&fkzrEOV%ZXa$mr9%XZ5RZz0g)~
z-1O*!b4z$h$k3=~yBUe+WaV}0E0`8<MDfLGn^a!_v%Bg(W}u4|eL0eHjq_QCiSgx3
z%?DYV@wgl05Xilh7ZfUu;wg~JAP<4Oi-_NlAITNZnaH2tUw#EmFM)gq<oh7M0C`QH
zcxWCQel>?U<IFmJaYv=AXqG5NK)OMefpE_8+7uTjm(;cS0o6~P@o{cyprG<9i=oFr
zCT>rvh{w}>fxR}YQS{9n`3N_iDW2<18yio#Wm8aW9j-O+>VueSLbuc>F<$R~xbp-M
ztbh>pEXe0Ta0WGp#(+SWO5bkL*TW-OXA^MXK1c*_UFpB(=8dsX$O;9T?})8xwwha;
Pr!~)O-l4zt@bLcu<Dq%(

diff --git a/tmw.py b/tmw.py
index 6b221b1..d8212d0 100644
--- a/tmw.py
+++ b/tmw.py
@@ -399,15 +399,11 @@ def call_treetagger(infolder, outfolder, tagger):
 
 
 
-<<<<<<< HEAD
 #################################
 # make_lemmatext                #
 #################################
 
 def make_lemmatext(inpath, outfolder, mode, stoplist_errors):
-=======
-def make_lemmatext(inpath, outfolder, mode, stoplist):
->>>>>>> dhd2016
     """Function to extract lemmas from TreeTagger output."""
     print("\nLaunched make_lemmatext.")
 
diff --git a/tmw_config.py b/tmw_config.py
index 872758c..92fa45e 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -23,7 +23,7 @@
 #print(help(topmod))
 
 ### Set the general working directory.
-wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash.
+wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash.
 
 ################################
 ###    PREPROCESSING TEXTS   ###
@@ -45,12 +45,12 @@
 target = 600
 sizetolerancefactor = 1.1
 preserveparagraphs = True
-#tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
+tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
 
 ### segments_to_bins: inpath, outfile
 inpath = wdir + "2_segs/*.txt"
 outfile = wdir + "segs-and-bins.csv"
-#tmw.segments_to_bins(inpath,outfile)
+tmw.segments_to_bins(inpath,outfile)
 
 ### pretokenize
 ### Perform some preliminary tokenization.
@@ -282,7 +282,7 @@
 targetCategories = ["subgenre"] # list
 methods=["weighted"] # list
 metrics=["cosine"] # list
-tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)
+#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)
 
 
 

From 47914de4ac05f1223efc743dd4160c523e797535 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 31 Aug 2015 17:25:42 +0200
Subject: [PATCH 32/56] Added binsnb as parameter

---
 __pycache__/tmw.cpython-34.pyc | Bin 35635 -> 35627 bytes
 tmw.py                         |   7 ++++---
 tmw_config.py                  |   7 ++++---
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index ccb18ae9e6026ee38f88806d334f8bbd82232363..ad3e06396f8ae569c71c5f7cd5a65a0f546b29c1 100644
GIT binary patch
delta 11707
zcmb7K3v^sZdA_r&-PNw%l5P1R*((d#UfXgkM}Ee!omh78OMb~O+0J^k_exrOUzxkI
zB_-)LBs}7tK%AjTAgAR3<<&MxVM|JRls16^=GBl;$h4FgXp2LDCZ`<0<&gg0KP&Be
zqZI0M^!vXvb7%hf=lfr?xA3s%FCO-E{;akx^yD2sI<n{m%le^J_RmZFHu`giFLl2C
z@O_pwSYz3iWeY1=YYo;~cD-dcSZJ@a>_*FOVxAhCExU#J2Fsph*|V9a0dp+7mH8&i
zo@?3j9ugZy=3DjxZkuh{ZI<26JPllE*^8KOwd}BEcQ8*cEw=1V=I2@Vn=HGFd75O2
zWiMrZ!IX?`*4cJzkmezgm#O!Ox%r4?FXu5d$qLI}$$Td^WOrNkD(gOB4bq%9SoUh}
z>ay$}%U;7g&3~h1_cFiKve#PnI_8&I_Ik_Sz&y>r(Xux&zudBKvh2;w(^@xM_Le+9
zthDT{mc5O6TJ;vozLoh^mc8Aw`<N#QjGk9pgCxNY%ihWDYb^UV%YHNSB+V|%-pzck
zW$&@<z08wj`z*Vk`E{1P-?9%dzur1!AGGX4{IJ2Y4_o#C^Ca>S%Rb8drm2N*vd(U{
z21)8;mVKPNwpjKF%N}H&HgM9iPcgsEvQJy~?W)P!nuuEV8E(AQvSXG##60aPZrL{T
zeLTNoC3je<oz~z^OW4DfJ;Hsovr)@VF#qN$Id)ltw9B)WeU96A<FF1#l7&m&q(^A=
zTi=p^_U4?#WLRbW?dr|`C#m&U{>?Y&;PHKX55%PFNbSqwe;smjGLenuVk54uH@uT{
zQcgPOY84NhSQ{X)>zeUwbSPs_<ec%Gu3{R<jOC`8;Y8BWerm~%<vwP~<?59{-oKLG
zlp%G#`rXIo6FXK-<mr#xxoLH;efMP5s@!-^2B|6H*MW??DjUm<N{rL~;fzeha<YXU
z<U+Lg;>nEbM5=Ybbw(hY45M$ciUnI1@;m5A$5M`rpkw(oS)m>Zc8Ol~so?1&yjWFf
zT3$`wKw~3Say4G*09gZ4IStw%N|gRwKM_kbi#b9rESwFES2aQgiTzy_;DYMadbe#o
zv5|V&jTRQtn{?806&`!kJq>eb*Me;a!PK%Iq(ME=(B8_@gz>zXi1z1Va>U7L^@E14
z8+ol8(HxLY5}Vfb7c$azlZm*KNZZbM`Tj9SZ(&p8|9HfFwXWs1JhX64bWc{F44=NU
zXXVDbS?)<+UncFWk~P#DnIl;;mS+JbxD}+dkGga+cWzZKbIwU8CLOsQz0DvEAdMhR
zAbfzc!1aJ^1j+OMH-Wtgq>YI7$Ff-`ZOdMIh|HFY&=Cf?nTT$2X_Z_i>Bv|*?hMD`
zxr~%s(On@SYi3#+Ru6#5$Ey8_G#=KlMkVwow}C*_x+<ASJ8}tFAM3JQh3fHHhs553
zZ??xL)~f3HYx5x<$hzLiDl3tq@^lp;cvO4c3D-yo{pgx_CYf~NxkM)I%6(|@XJctQ
z=ITHyHjZr?6|&BDqTicJq_sCehzCw|m7Rza3_d%5VIImQU}U0k_k5(?=xiCc4JFcU
zdPtt22RR7BYT^UpH9c9IUL4OQY*+iqZs>n`1mlkK>zUzU*U1@MVD$NTtoHFx9W-y|
zqa;Yk9ZN-Fc|$IDr-yVfo*5(G;gCD=4>7MrLi<Rjgw5ugm6@!r#ZdsdZq$`J$i2+g
zqr;7oQli*K`7uDaMw*y?Q?#0PjuAgsFD`HcqR!JS<alAQty`>Hi@mTN!Vg3|+Oul4
zu9BGxmD61dZH7MMnN)VmDfPMbKbyslV+B^K-kgl3hV0mum>OHyYqaepRcpU=VzwhC
z3_}O-E+7Wz(P721+V8mWSk}>1=NvkFz?&RglRGx#<`TIvKKh+_gTYpfc0B4TaB>pU
z_@pyZG~$Sm_fV7UVVP!S9SPqUO^qdUiEPq2Oz#|_ow~4BtEyQvh&s{gUFDGv6izJ4
z2?lD>v?H~~su>3io+|7_K7jcdA#A-2y^;yDNx?fsstv^M$M7&n2MBf{@s6&GC!H9Z
zSji8!l{sRJsmV$*WaBpCYP6$6JXpBcVGA;=pD*5+XXg_vb3SY|1hOFlP26$aN(aLW
zSH)w=WTaMClYwWFIGQllBA+=jLJuYIKSa&E2^}!Lz%_np7uv~mR+#fl5%?-`*MK#m
zIq5iN)^YW%&WL!l@SDyxRmT}`aQ@ds@GA1ds~#fSJ1qO<UCj0G-!~cDJeEF}&Rj@u
z(N)xQRC|x^I7G3I*jsMV-dp;1gHW^M_G?f7b_ybITefZEv<4#{If2Ql;GZHwSG&1P
zHkqK1Jq=pIGa4ArUskUz>#GXU8>*pmo>6VHQ7yall3HeR+V8|lK9}at$qmFyc1FHr
z<^pMyF#%xn!JF6x?-ZF;=DF8khK4B_^HF*)BVD$dvLj*Db8whOlMeE-CbghUg#2hs
z?#67U1D;y@xI=rh8FD;~C|#3wE=+lqQurnV`Y^paN3sFcr()-vsF{>9nMYJ^c|v@m
z@Ndg6iu@%SHZbYuFc^vijH(ziCF3aBg`&}spsq;~E}UGHlb3f>n=uXc!;Mp$j}m&%
zlGm5hv*{X^Icg035xmzqeb7eKW=33b`a~+5k+~ODsC$8Us<5Q{KRr8V6qhCCRVogZ
z#ch-YT1f{JY0A=|Dj8OXS04&ZzopfatLNIfm4KCqM}f2G1t&2wnscKVuA5AP7d8Fx
zZ_`b7CT^Nt-F)3BHy6t}lDAm4L$#u3p7@2@(bFlOR*9ZF{hV-&sF!+f>@FJyace2@
ziKfWzYyv_%SynOq3V9ZK<mJNdHC-O@jLP=TTK@&;@jb;YQW_McXe(iyS6LfX=spPY
z7{>TBGJ#C;xcW@*ygYKX#4E48_8LcW<`Tc4$4H|A!U$fehQma>C0|3E;)nvnkTLXD
zC6bxAJOkD{naIk21Q#5ZnN*acHpPnMSSsymF(9duvE*?K@RMefX?1w*l15#}ighX4
z6F4dTzJG1o9LclLQc+Gfw3k@9T7711_bwfxcu)+8x^5~m#xv4!0Wdm4mmgl|%%IV-
z{x_+fbz8+tg*(=*6#0Hg_9DnXgM1U@pNI@ZLPitKXepTl$BU1mjWq#1nVnOdL?ED~
zJw#Gu$6VM5g*nO$<o7TN7EcM1GeRzpN`V*g7+gj`LhhfaOk#B%q@M2GtNJ!{_+O=W
z;yxwbI!om@yqw1rEOEqdWW9_A43aN_@FDuBNEmS<LHScOeHr8hBDygZbLmjc;%prn
zeP0221>`D5)s~N{Q0!72;<=+h>S-Vq{=)KxJhh@$H24Uye!}dp3r(B)g`d&-MdjSI
zBVWq4E6n!SSl7Y2K6jLo3UW&<br|ix1eum!vb4WKTZI9&#0j=JCt5D~SebM}MK;%o
z-xStwR-)q>Odvpve8lz3V?=e;gcFl;t@_E$3&k}R+Oj_KhGMti{b?k~FR8Oz7U!`I
z!Vsj^0aMSo##?d_V97b%4~)pnSk|$1L+t!WIZkywA)7KE9F8e_O3&<oD>mLnE;K%t
z#NK1)Rn68$@pjd=waZg23fs5#dGfqN_zQ_$_p__uy|RPg+SI6`kGdy4;bqrEL=Hgy
z&~QS!xnn50io7WrLKtW&si`8Wq>E>%Q+i0Z@;PwF^O%Vec#h$yE43F6gl!4o)ELwm
zaL<FpiRfBsO1x-nJW;GHDlo%(t}Xf!h`5tIJ!V)8!rBgz7{_&pod9n`UP0d;kbejH
zJ`vqm+U8-0yrqjQ>2>&OaAW5k3J_Bo`GES~ts5yFS97<2V~^2nNz?397+-M023f(F
z8>i|okscY?NtHNTX^35IE;hueF$F*QTh-IowO6-5(^2EtqD7=sBEFelF2+t1jh+SI
zt7%Egvp_5t4IVjP6h6>bE%Kbbu`q`rQIJT)N4>QFAY2ZwVUDbh8f7h!NR3fdD;l6L
z-NdRiG1CaAXH!!#RF=tH^vpccm2(BvvAPERNe9%GKZFz?0{IV+3QRXZq_W{JsL`Dr
zp0H4VwDY3)keYK_Z)N=CHz@REBxx;{bbbs?&>uM_84-mGw|!Nd(2YZ}_&My9>MkBj
zW$-A)IPFi-%|jx6)kote39J-U!f@!AZg?ymm&>4n_foH}9?nqVM>Xu#xp?$a9jVo0
zyOtUE7Na&9#^4qrx_ZQ+f}2cJqmU!V5}Pm;t~Z;^QLUbGM>7{t!ShliD81vO0*aCr
zc%!laRLaR#Vc7uCIGh+U_d2z*Rh*;YSMUy_I}C0)<md%qurI-jL1>lknZ|5gZ__OU
zIU1u17uD*>m~^7NzHTT!O}U}+BT|p=ZWBLMmv^u5Y!iiF?%rnbH94Ep%J8mKY{YTH
zaVmky!EG|PEhesN?}1%=<t>n@6k*wUuze%U>MFcQA(VC)X(;8|bbH|p=d^E(CZ&5l
ziHuEt4Ev<%_LtAsgT@X_RZZCsoog&<E||PSnQ<y+ja=m$!q^(LM-wc2K>7A{iaqL{
z+b^e$ZIi^tj*b81T!C|kqI+RflSR$s2w$dmBVJ8K|CIA!XEp7j$|X^;{+2n5yaZl&
zS<BljW{Y{E-6Q*jy4e4X?ozy>Qk6qfX+@-B@=HW;(L)u}arM^yOMO^ItB>#R=$*;P
zOo-Z*b5d?(dR0$7UHi@TG<)Qcg8#rQ5ihN3LIoVF$(&{px03?i3{uGNK%RaKeGc3K
zaC3-Ii9_bii5_j@Ae#3Hb@#z}r&!hi6&rmhhp>tcA<Y=b97tx0B_Qu&1vXXBxzcp^
zjgTJ*vb@cnxgtpUNZ8XT=88r*!jpwp58fr#!Vgl0o1uK2uBWaJ(xY4r2@VoB)<k3=
zlkKKIN-M*|&wN&H3d^sn4<3%(NO#QxQM%C0rbpPQ2%FHquF68R^^ld)z8qbR(4ABK
zoGUj8_1fWi`BD;Dvf-Ps;tljR?SBybA&}#kBgp;*E=>DJxhydz9H!B7D~44Z<Gf<x
zGZd5Y(U|0%*T}yE3Q>;?bcl4}ZwHo(Rg4$5J6xy8lOSm#k$Pky<fuj*D4upf_jW=x
z9qq~=!<ZyU3WU80i^9joVh^ALvYV|`h*uBwAV3(I1|btfu&P3Ivs{Oik?3r(LP%gM
z&ORI6^jBl4Pagf~!Doo=XEZSSDD9mIZ5(p}GzzZ!APqacnT_m;V{NTzNCJ&mWMp$i
zt*1_egml%FWB+P$rZ4ic3~;k<V8(SB<ycqy$hFedd|(RF)P!1xjq5Nu`gEB{C*DY3
ztpw`X&=<ePNwz`%ggSVlRh%y*PCP1VPT}>_Ah)a9lkMvn!e1vLWDJc%ARO|e;Ld;m
z;JgoH9{f+*>deUnd7Npacxl2J*)WC!&;(&336QfOBOs$7=ZNUq9DHyz8JnQ&;|*h#
zEP*{}5W?Q$5T?Nqb<GH`N~Ehrr`jp8z4_EqF<x*^9Tz6vGMM*csu)crU2x|?#)*)H
zX26Hh#gNG2@eYtNkP2LvAbkaIFn(B8Hqi=PH`Gl5n-A(w)t=keiU~xFyum9iFmuF^
zd#UUH3>8M$8KFy)>N)Uxqg3tCMQm9{`egt*WLb6PVPmrJov0&9fHrvrL!*?W8jR0}
z!2b;b0sBdNGnpCSPW5cgHq|}+xk>krqr*?t<pjlNi<-5MO8j-ba#C1F>DC>a4wXzp
zvS;A~VXq$~uYcH7^?sV&IZri*n@KsQ%~*fsIjE+g1>#$(d#F>~Q#d%(B4*Ps4H?w8
zRIgB)W>P_|CgL}FRtxo1yu;+dC8Ib*LQgi@6fL`_6go7~UGC+ZXtQ*iQG<{br#bP{
zV4yg*%t+Wi*iuSHasj%z-u>?|KN+!^mFyy5?1DAIcEP}eT^j7Z&KMrEJH^k{=j{&h
zK;epAFT^+1e>roHm;C!Zc=rLQyvEo7!#_;BHSlM>^LnMFO7MT3p3e+-h-&qg;icl8
zgz$Olnc=Veekc!?Vdz<`&XwWI;D8r-7vuz3<lPX~`#X>i=NQdUq#8)5n$fz}r4%Km
zeK;pdl(xus6>b>qD8sxd?TtUY9^{d_GWn*{CzH6;L8=*ql<z`^4e1%v!4jr*Ew0+A
zf`qqUkv91_oADpuoj(Mb(V?hoi8irDwVrJkmkMjoJ|-I2!SAL;v?qB%zE^$!T=zkC
z^6#KYCg*T44Vw}yyeu~4(xqE2BJZb89oiimm6OgWVYMs<P>*#LU@W=+K6N^|Wd*QU
zymdD&?|MgfEw~v)%y?FA8BkZVZR!WfZVnm!Q<sLnXSk@a1Y-z%93?|2W<y~T$vEdL
z;MfpS*jn+vuXk^3$Vtwa4b3fPLk1)%x(26<ba?7M^>nIByr=Nv)VGEC0gU&FL{h$m
z$Z7R#W^D~uUF;oCsiy2Qzfrl`nq9UCx|M$nLVr{Yf@4SWFgVs_*YD7!e1wLcB`xkM
zYS2@fE7D}rNgw@4oScwvs8mGWvOs*jdzn$GMeh=Z)NJinEUT@k)kiVg>)gxN?`Rh%
z)KBi{67MTCk&WgVh<@9N<2S)Sp?8YcKm8QON>1kzCVLOa`-splaU6Y1o`;=I$=~2a
z&1VQ#AR4=$2hmpPek`iAb<0$^rNn$cdw`UaQE$q1i}$NTxmThl=yIXLmtCdkHhl#n
z5$EXgrFd-$Ly)tvAP2GE(xa{-&u&MmbOY+SW9C-@6l+?2U~Kui>n%e!!5dMgc8tcF
zF!d1pQHUn8JRyM_81hCRDR|B&%P(LCur7dh6(i}>3DzK%m!(XGy-zKKJA;nrL0I58
zxSb&Eg1!q5F`2G<KTN}?Ok^N)gg$#|L>;()m;^jnl#orFeqcv^#Ww{%zyu%SH`$%+
zoAFw@X=p5wEM7>N5s#@Y<6YwMf<6AQnEz1-T9I$E>GZ2vlZzYRvvi=3WQnCG<^5{M
z<nBlx)`bLR!8UQ6vVZbLAIEf<lYVQfRiBz{%df`_^dkZ;D5xaFR}`5M8B3K}!X~^_
z%2heUUO?MSCQ@=a0r{60^aRM0M06dO@&_o!pa|putZ%MIih%Djit4E}8QO<#rM!oK
zw<bs=;@gvBNJQwm?{W3uon7M7>Zv<_W|G#AV6DeNK23y@q7Q=q6i65o1?YF6(F|R>
zf{}!G`7!k2cq@dhtOXk?mi`Ce5&OsWe1oSUNJUPaSSaM@3M=ovQ*aD7F6e7CrgRxD
z@5EZu&XBH~=x&m>A_dynv^Zz%m+R?0f?ml9l&lJ<G!yY#Y-G=2c)&P@LqaXdch>8g
zV(Dq@>{!0Nrp(S%K0jZ)px&GB6kjO(UH%ENZTh6-Uy(4{pF}Kh<x)s?8^}yJ1ehk<
zk*SB(qRBZeRMj&ms|_gQ4x#ef_r&vz$}1!!q0&8X3Vnmh;@3h2tnk+$p9BFe<x?P>
zDI5UzHz1`f@c)BLr>t12k<VZv&Ie0-{yf_L9t68Lcxg`1zNR)}0IEZ1<1p#I-oLlq
zyFmOzZMZigE*FOG4Hj#uFJQvUM5a9V^a_$!y#xK90r@Nl3&GN{oFx&<PLC9+>mf<W
z8vH(jq`9RD(f2Gt6yEBk`$FQY>KFI@D9=Ay)}|@drXPdT_&(3z6NgHc!Z>4KeT`(G
zj}H<#`c#L$m~vrxW&U#mjW?Gv0lE*SPxvIEw4~6P)YwQaaRa&Zz65=H<Kp8b`sF=N
zm$sOyVp9!MwvNpm*nUfodhl)Y#B=J;-}WyfEF&xPIg*iXcsLBQ_GVbt9572(u|}Xc
z_ZDajladjOIxjTVF68s-<+tC}Sn1o;_&XNnzX^fEAQdy#;<_1MGU4N*Gnm#-qwtMi
zqmbW)uzM;$T{wkl*rr#4>jmM@_<F!?1lbER4stKZ1Dx<a2=-BsPk=lH@-z|cOOK_p
z6DH@sjMkSxUIpRb0e=Io8UUl;R&Kw4A-jIL84WEUogni;!XOclZ6JF<`azC>oCHaO
z06-Nt19F1e@SRp{C?nJIB1TMtOxI2Fop^i~m#&w<PUki9{q#sbM8^}^iB)mq<>^w{
zFp~Yl^$uQs4)0G}m;5T;R|hY)Mu1@nkjUpjz72w7C_6Vk!4>M##TDM3T3dboV(xzd
DBgQtX

delta 11437
zcmbVS3vgUlc|K=XyVB}yTQA#^ELoPVwJpal`4PWu*^X>kc5KCu8{3=J-YaS2edXMh
zA9BnRk|B`JFcS`>kOu@x9}p(MER+JBHZ8-@KuHP|NN$-SJO;+lB&0*9orIA7-+$J-
z>y1)6wa35zd+vGs=Re>7I(MIX#Fu}>*Y`?&L+$>|FYZ|Sq7Xk9Wq<s{@1}ol{NcU_
zPuh**0V%9HVGChPk*F6|y|9~v-7Fpu!fFuqJYmmgzERi<gx$hCb!ZiK8}rS=ZWne3
z^VE5vuopf2=9baL!tUfY>ewahZsyyBy+qhO%+sJ=VJ~IAL)d-7UdB94&@b%e%r6r5
z3SkfE`{W`S){o1M+)81u;#W(By;|6V%+pvY>@~u^g?F<|*lUHoj$hGK>xI36`Q^gi
zDC{BTY1&P~-pu@fu(t?%EAwGt-zw~F%+p%ig}sCMRl?pW>|Hs2poMn}dk^!2!rm+F
zeaw>t!@}Or{4F<Bx>i{0gndBR2f3Z3xlPz3%x@6(Az>e8o+KL;_7UcXggqwgqs)_(
z$Ao=5^P5GNeO%Zl_<=+o7xo>@Zx!}QVV}~^``QOi3;PT=lDw9%?__?Lu+Iwn9P=dl
zd0|I%qrW|Vm$0MUNPCzNc8vLb!nTF&Fi-m-WK4?0evv#NtOG*YQ^KC+cL#+X7xvxE
z)1IgwA!tNcBf`EQ>;$*d{-ih&PUb)0Pxz#z-v}(Pw)|NqKHH;{!7jZ&_&I9*&*1i3
ztkCHrqsOAkb(9y#;D4*u&8m1Nl8sKeR-@sagp+hqS=Z8cRiEhy5Rk39%bCbT+P;!?
zE@!<eP)mQex>NS*SE_TtKI)=sv|IBbxj?_#*p^e()DjMQ)oFK4CYqgA3+UMkPNr2d
znpMr{-j0?)ERlAdaE({(I#XCvEyTOUcvl@u&!n;>XjL+8J7xxSOGT58>Or^OB3X(7
z^_NsU>tG`NM5tY^(q9UlJIR7nm6qgr)N<+{u2L&7!6J|_NDs)(ld_l*YIlP~gq$zi
zB(>?!UG2!fyEanQjEyP0yDq_1)ArrYHN=J+)iSj3&ix4|rTXzWpx@Qh(OL_(4FqaX
zbs+Wnsiv-W7OA9!U^c3z98&gAn))~JS}UoqSFM}`^k6k!N-Nt<#A8l8WjmKErC*ib
z-28@*4rRlFgE^?;bvZa&b7#-FtLs*8y~c9S28PoqXN?Nc+i;s=$ylBi2(=lcxDTr#
zmB?OLlTBZ6Qt??wZNb|nka~~?kVX(b;d$T&K-Plfc>n9bt_Nu);sv9bjFYm}DtZXF
zst&y1-EYL(1um_UO(z@`O~staXe^snY6yKQBxKDLrLmx%$v6$e*?Pfv3J>cURtox4
zn?RsyuPPBwIjS3MfOT1}Lj9SR<H23@?s_2qW{WQ%SATHPP_BkMv%U|o+DfFLK!PlR
z-t)UxTq7xT<kiK}iG&l&#?vWR?LkX06HVDs*Q-uOFJr4ljjXp@@!p?|r#ye0%mP^P
zs_b|y>*+5p>dryIUND=E#N3PFE~B?4)Mf=I;wd*Zp^ngl8UtZf@geb=zKm@JvzN1R
z+qHsZEcCw`LBB&pykL5A(si=NC}4(Gfcshj?rMe1$Xq0rCC@RFjKCl#Tz;LJutIFJ
z)JAhpFz?B@6(E`7wguzOsC34v$8i9@Zp2kqh~F~Xh!<{zloG)<%8vrVHPZy_m?AZ_
zbM*MFeqpg&?Q0;rROj-?I|t>a)z}N$VlQ!FpXXb%)~izKOO?~TUb`LojHQ#Aop<Ul
zb$zM@0~EWGn(c{ba>9=8yh|s$HyUmGN!4CZIZ<*(3dR8smCOKU#>SJ;j2Cp=STy5!
zRTmsOdqAAL;5>I`!p+9BGko+1Fam(HYS4~HuL_)kpYa09nJO4^SgJ{CQgW&8U9zx!
zoOb2H&_vaI*(4ieyMK*O-IKq4Nmeo_3#J^dH&)DN&OoVx)aoA0#b{t_Wtf#rmQ4uW
zC|qOUltOp5MfQ-8SYeFS5KA~wHmQ;;?quDS6j+y0WVpsnOz69Md!?IyPp>WIYW=IF
zTXXDeLS@c|4TJzSK>&$4u3PC)u&`GZizX7`das5II-S5VKs+*o=+qQFlz=`1S>a2p
zYGeDZ@k86uPG+*ooL!QDR*A7DtP#m7$1$^B(%<R}%h~+z`qo#SW^BRPzb-@9k>p+X
z5%K(!YD`6mv;1R6kIaU)&!jG-(w9;@ttxtR((<3&f7}X!-LuE??-@P>Ld}jlX8FeU
zQP6nT!0xR@4TgQ{Jf^LJKMG5)#?7WPi8#gPqJ9axXkhGqQ~%e%u&mMB!<|O2^Nn8F
zf0uN^XI?sP_RtE;1YDXtt8O7)vDXPC(w9h`3<w~YPu&D6xS?=MnVUx3^qOu6n2XT3
zv~t-z%FcuJ&XJ%9B`d_sawbtt`35=6sM<lZ6<)wCTLFII`7>$P3ox9lx|DP2hWk)G
z)Ss0?mhx`m+l=)#CZiXeh?$f!l}B`9Wn4a(|KZAqWG+kH#%F^Z^J>G@MpcZLl4X>%
zbdh?8Ltb5yK;dK~oXT9HHe(lXxa5=yrxPLQ-c9aZ6=$>RSmuba=qnhvt9Z&r)cFjz
z!u0WECatnRk!0rcsk89a%6j@cUpD)KzU96JQazOK9{iQ>z?_?4`FZV%qvlOCtM@_L
zP&}2H$wGl@p+3C!cwcdp)eg8tXfyGc8!>)7f{vDcW^IRK%}dW@X{hNCfqN@9+38qe
zoHhTZUT!u@2^?>{V&5(0`$m38=RNg1y<=UUd{{@<T@7*`V(Fi)+b~!$w4U;zNRrIc
zCdhP>0TxoTT3@$MeKfy!eZMdFLE7sW8P0_`9SV7VQmIfH<Z3|+lq}IE6AC7Mojqk0
zUhfCFM#KuHRU8TFC_Q+gLgX14_Q<1T64975u8`%$UwiAVw>bVY7ynOs3^(6tU}a%w
z4SX-cv{ggdDVYa}12a>1;B8erk&dZxu>M&R_QS+kp-GiaMmX40h)K*OQ?4h+)oF|x
z)q954Hd_tmoG3LQfK#G=-%w}USsqIZL^wC`{KTpiIzKenVAW<EO7=N=TKeUo&YlXT
zHbJRo&RAMGZl4Zs+9kh`k8fHn_wU8Rp9c9P$P*wRBQhSYH5+C9mF$Nj%X`p^HwrbV
z4{L98&)&ZTyNC$oM$VMyC*gTwFGLtZX%MeWVZ8=OR(yL{Z{N}@zmh+{B_MMw5~K+S
zjT|4v^j`t_0tgoN0#q$bIpL7{BAWgb<j>LGoQ%439%fNCh`+$QkAVCQ$mh|kzPwjO
ztZ49RdF}{+cvio<bz$%s8WRiZ=;fCD(p!gQkRkUWeQw)+tCUz*m?y@lGepP}PgCwd
zPKiZbi1xn&DGIGv#J@*d+4<?;Z|hi`g(1MH3xBPwh0kE$(IwT_^S5l*ayLw(%nm+8
zk7MdEQLpNX6IE;ov-6`Dr^-+enYc1ITeXJ3tNuazc5V*8<0d$qn1jYTkLZb=OLN#1
zK?Hm7s!bu|8qdiAf%mbIURu>tDm{~NY^#aZFNdMkNLZXnu){L-PU)GQahV#hN+-FM
z*i3@FP9%C!KeeM-zE`*G>i2zH<hSh__T`E$K(UA5O|uf%=DBJ$I@(IJ&H8!<UYB8Y
z8)mMZj4L;L3e}cncvCH)7`Tyl6{@F_E<R2#)w3W^f{f;fB5UVhegY4Em_M5eLaVWt
zad4jiIYq>(SEja$L@&n+<wOO7SkLvI@d*&|0Q+~$fD~S_w!cH%KC6~bm$#ulgLgYY
zo(B0VB35&0n<pG{fG*Odo6+{n-WK@@{e!(*ecu(jdEaw~jVep3WhWsq435-bEEIKP
zROgYCA3s2aH(O>cd&X>Zf)ikheCiuIJls!L6-S3#>&i}6r-$2H{sjPswuNu8Tq&D;
z>IIR1bht+5;KI#P2f?At4F~8gJSm(9gUg)2^XTv#I@Fn+{}2t>zBP|E6TzuQT=s!h
zD>W73U3oJ_b&8SP(^WqbyJ&D7-X|PTSA7<9;;7Z<K*}~kj~;v4vhFYFyAJgFUKINN
z1C*ZV#)BIx10Ii{IFME(STy1M3YwsQWa{c?BA-6^ZAm)YHqx_2?WV?A0vO5HW7V25
zlQXH9`YF9o_Y(1HCew7aL&fIx4m|p);Ip(I8OX71BPgjCLB7YGryQz|$($@yT;wI9
zS4=7D&m^)`SSQ`-^d*$ryc9`8GX|*Aq4WZ-rL5X0<y5!^p!=TaMQ1WTWo}~X)eprU
z>Rv+RLX5_ps=5xs0PV)W5L8FEMl-h6Xw#JdIS0gXN;)ll_0W=m3T?M@DrnRl;jn6Q
zt$)}5cxaXHHIaYy&~6kZYeyIF3(@?Ai01i^jf|@Qz|y7I#&*r7SExp)mZi##HWF?s
zq{R9IqaCO2$FPdA18C#GS3`NlR6G?;m|HIYMEbH-9iyqp|5ak84(*Y6GCJkBLH*`v
zpYIK!XU{*o&zPLVq^xH)fsfJSIEQG@m^s6dOkpmEl-Fy<+IAK_L*>eiFr1Pv=%!-F
z1we#<zHF5XWtUI=LFo64JvV4V9K9K%!jUmlS`OKadV~nBS*U&))z^-$2w+i5fBI<e
z#)`u-NeEwfjbxpqJ5XHJ*GLhq!#Cfjs`7t0+Cp}MnN191Ex@~(?Bx5z9iU^JLkjh0
z==BGXCqV=aRX-pO`CZmzO|%J#<~PyYNR~9t3GoTSd5W?%5)1Bh8zGoVf?*UbSlX*a
z4ltfRmPi+>G~U`$o`e&a;_LPLpKtGywfSG)ey?1FbE6C~O?fii3SH*_`b%n4`}Mm|
zERv7w$4-QEbd@_Ep)j0DO|dl)+Ms8zDudO2!z{^9mk)HS6T9H5m(diwl88<?32HR|
zS6xZOlXDv6D_C$j5!!nQd-s7<6Y)aiEW%m#I$m(viBC;ujX|<~{uTpyqfD0S#CW$H
z)_*eIFFW&J7+)#PJttdG73Qo3>D0fyquuwW(4mw4w+GR(2xKt`8#~X<hrojP@dDBq
z;d+aU8tp;}aMI?S&gk69#j-~~cJiyH2;k%4m`+LJT$uPI=6jL|g(V!i@ip&g80|Ps
zT8;r)W+6#n70yABg>=YzUqgV>Fx9JHJoV#pv@$%6Jk@i`!x(*%=J5jL=^RDrU<r^2
z@w_tQu}svOchV2!)f>;WEW8W>8;yXz4jJ+-<ib)d%SX?AMCR%+K|M$Vk+In-(r3>M
zymrsz*Z)@a(8BT9tv!m-`8YMJW^|qh!rrG5ToVY_P{fEStcE{o0X<S&8sUVpdhqJ4
zYd6@}$|0RATv~7z)Ph-nYeuqa$72UbD@Ys2LLy#$7J!T-qF1QuoD<rJHL8T~kbw|8
z3&s}56~M2j5`S;QcKg{bN+yTSu8_;~=gyv%CSEeO-@+;!IeQ`7GLYp&$WXh$GoTq;
z-FRFA(g#uj{1WCl5VCMAD_dnRR$;=xZ)&)Kd<Mi5df)jWxkC3I`PSJ#5*^g<X{L7y
z3wY%>hA^iM=#NJhw|)~?st5-I`R5}}nPV;sOl)GbU!e^87l8d52qYOJHO*z1Fa#<S
zv$iRG;ipY5J&G4Ws@|?pQTt6i1gH+)v{4O6F(_1kRwDmjG#bS|hBc2Jc_DJ#CrpB;
zUy8Q0&Z%J?npiBqtNSPV<eL1*!~)rB1?d)w3P#HQDS<MXn5FNDZS(z7=%-`7%@tN~
zK>sAxnS(<M&!cUmT&oU2CPstt^{jf@iDv1>xD4kEaRz${=l!&qIg!nVQbKmMK{jud
zk<HM$&R=fk^wrQrcZ@RdtfQlo9r6|ZWxH1n<zKWLW$p=jJKNNQXqKgvy@$Tr-P$uo
z4wB<LV}1<;8-l9q49FSZycFY`^<DDr?AS~Ao(s^OjAxtj1U)&qKvwJ7$rW-F0eYeS
z+T^nZZ%zx&o$?q^hsu!jaddqaBo6}AsFjc!kWs53`u_%W>g2JrPhWJ@c)T&`lzIE*
zdfs%a+?-!A-CG8EQ%@Urcr(Vs4P}BAvm%p^JpT^Tsm&1fG~P0#LnTOi^|+a$>I`Av
zwv@@6*<4p+%pk~|u&&q??WvTvtkX?*cgd~!!Mh)mO&kt(pt~<|N!_b|bYU>au7AC*
zO?0+zpcTB@L(yq9>rCS#tpUH@k!Y7&^}8}VR{@`ei)<67da)5Ea9^*7wxhJToKZVZ
z=$~dfwU-!VV;KuK8<hPYgTS1|!OAZgM<K0YaI#&ndu2!>>+noL=eCU^VN!4N{EsKQ
z<Uak?WWU^#e<}F`*>dw%6vEtD{cL)uu9S<{=}>0ihQ>B$29`i$Y6l4Y6KXFw4w*Z_
zvCjJch|bh5>KZ3)jTAKHE6o+2XL7;-eNRnXQSYcc#LcoWS_fI`K^j0BK`NA2@GLhe
zueXq??Z#}C`l30!4*hbbL!Q<@zo%aw%!kyyGRI&F+D;7L&-T;6!mY^|MWT{h*@_wW
zfE*-3U$QuWmV67l7313=D<_R%aD1ffV76fN+Z+rPyWWBu5tDAXkFr(M*F@c!9h4*b
zQ1-RGCT?m>hihU}Pta^-7a1JE=nA87;m8U5zoG|+r*G(MGo4h6KRUDWru&QMXWCjH
zru$_RGwE6%Hy9m0HI{#UCQ*KYaT}%rimpQ?^b-Qs%s0?lN;24CJcqW@O%V%%>mqZ7
z#9rf5ctjMX%g-aUjT<;0Pv1em`)Ec*c69O-xpv!xL$*`;$c&<fuZ5n%1fx8XZ$yr~
z?SfV>x!fin)tfH&%M<#g<ih;S<!f@$QJg?UIArIsSJ%%jZCX#St?FS~KAM_U8}yFZ
zL*cDh3X7H{h4Irq_93S*9p<DjPqq5<vz@-*2>sk_d#)5FU&ioKp2)%R9YbfyF;uJL
zm>VBf)g45<1}>S8k;6y2%ZVFZGq9vFr?CAZ-m44lOkcR#DL<><ceP)h*H2%4#bkzK
znDhk5c_NgM90q?51hz!q{GIG{+Q!F!`rb~6RLAfR2UQ`JS*90TG?^dg;`<UaNJl6<
zKLcdJbm{8X*W_yo(f9BM*)3HxzwFvo$&u4|ipMYq;~S;33S+IJ&!X@1bO%TCsei&t
zPTIat+<`Iv(m^09nQ004z?YebN3j_^No`cNu<81w?#}f!T6J_yqUat2SCe$Xm+M;2
z8!GLpbz&x-up_9j8IDHVFX#{E`lOTp>)g|FQ}LYCNs`<Pa$)PLmmtD_gUp4wqx#YN
z`kEPYH3oArhd}q8`(il;9UKRN&b?@gYXiE%P3~zZ=nRMj@*AviC%92?e0^{hkN-a)
z=E^FQIz>O>DpcO3ZBC#q27(P6IL?QLZfwLT$*M&gM=y6ycicvaGJbo-+ARXdH}z}1
z_WrP(&Rh3~WP(rE!8+4KZn)m!!it6A+%AGncY&~MoNu!DCCPiCASMjiMhIA3cq*Wu
zf1pdo^~(?ZB6p2?di5zvjOpuV3O_qDSjAZuZsSazbT=(XpW5SDFO;R<1G(5xnOD38
zE8>LcZZQ_kl4%fPOG=$fmCfYnj#GU+9H%(o;%7OQKNE|pvzV%42TkQFJhpGo+5j0H
z$`*X34ap$=AcOw;mP?h^pM3AX86UvfOq1Mnv%=QE>YGCpbLGO)R;*hM&Cz;0uwFgZ
zqwhNn{t(?T(6^lo%^jm+jQ`+&as^@^{nNjH&~C0HraZdq%=@}Iw>K4O#Y*+K%f;_`
z@DrN|mJ8B&{N$}!sw_5fxbnOHF)YBopby+CkaiHh?pzCQ7sxq~X^?wHKwXeokOx5?
z0(qE-7f8({GgnN8{!z5@cf7BFdj{m|Am0X|Zx1Km-<`vOt5-q32l8W(7eHPF`5DM-
zAa8*1&0iJmq;L@ht_P$SWElv`cC~^CeytRpNUN0Eh$bp6g(}uOY7-tQKhII>2DX^7
zs1bU!8r)bsb7f7;xM;fGHH>0^a&ur%QH(E|or+_8t9yZdVxv-PC|!bCxN%$$Dp-I5
iocP?;JyNs_KAAGG_F|(;_4^-M<zL@4p}+i4_J07iB!U+J

diff --git a/tmw.py b/tmw.py
index d8212d0..f89d46b 100644
--- a/tmw.py
+++ b/tmw.py
@@ -236,8 +236,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs
 # Binning                       #
 #################################
 
-
-def segments_to_bins(inpath, outfile, binsnb = 5):
+def segments_to_bins(inpath, outfile, binsnb):
     """Script for sorting text segments into bins."""
     print("\nLaunched segments_to_bins.")
 
@@ -325,12 +324,14 @@ def segments_to_bins(inpath, outfile, binsnb = 5):
     filenames_sr = pd.Series(filenames, name="filenames")
     binids_sr = pd.Series(binids, name="binids")
     files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1)
-
     print("chunks per bin: ", bcount)
+
     with open(outfile, "w") as outfile:
         files_and_bins.to_csv(outfile, index=False)
 
 
+
+
 #################################
 # pretokenize                   #
 #################################
diff --git a/tmw_config.py b/tmw_config.py
index 92fa45e..c0deaa9 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -42,15 +42,16 @@
 ### preserveparagraphs: True|False, whether \n from input are kept in output.
 inpath = wdir + "1_txt/*.txt"
 outfolder = wdir + "2_segs/"
-target = 600
+target = 2000
 sizetolerancefactor = 1.1
 preserveparagraphs = True
 tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
 
-### segments_to_bins: inpath, outfile
+### segments_to_bins
 inpath = wdir + "2_segs/*.txt"
 outfile = wdir + "segs-and-bins.csv"
-tmw.segments_to_bins(inpath,outfile)
+binsnb = 5 # number of bins
+tmw.segments_to_bins(inpath,outfile, binsnb)
 
 ### pretokenize
 ### Perform some preliminary tokenization.

From 245aefa55cfd3e7866421ea5b44d670b9c9c02cd Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 4 Sep 2015 14:12:16 +0200
Subject: [PATCH 33/56] mastermatrix: add bins

---
 __pycache__/tmw.cpython-34.pyc | Bin 35627 -> 35963 bytes
 tmw.py                         |  30 ++++++++++++++++-------
 tmw_config.py                  |  42 ++++++++++-----------------------
 3 files changed, 34 insertions(+), 38 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index ad3e06396f8ae569c71c5f7cd5a65a0f546b29c1..5abb7d2c3808db7a0455f18bed67d12402f8ebc4 100644
GIT binary patch
delta 5593
zcma)A3v^Z0ncn|Na+8}k<V{{Tgh(!s5Hx6bV+f&%7zhcNL?GVWbCcX~U+i;Y$PFPC
zuv%Ji<gZl$U2R>~`lwTSoZ4DfTdPwPiu5s4^t7Fkshw%Ns;e{FhYvdQ|NA7|K%KRk
ztn;nC|GoFW|M%YeKJ|I+!w0qMcQUim-@WU%+dr~}_?4LTmqhLL^cnJ0^&s@t3f(IX
z3(;35bRl#Qp>i=&A^Iv+caaErM7T=yRSB3Lo-g|53w^QBm#AgcLa!5gy=pJu_4-nw
zH;BW4{{fSE8-?Dgh8kwLFGHyX3U3j5kD{-dp|2JrYw&H&ThcG|+tg6=>><&&7MCW?
z>kJ5eP>m925_d?1T19x1=-Wi{4GTS}#y%nR5uxu<?Y0>%?V_(;=pmtp75yfbwaXi^
zjwVeO<{E*aa3Er~wb&^G!AMZI%H}0Op6wnCh5`{^IAGW=MK{n}+MObIkp1v*={Xp;
zwv~ND%b%dWogxE6fqiljwY?w@Vp(M)9JM+t9{@OtzxC9^5iG6>!hP1ks!ITfpR6|E
zA?v;Bj%1j`4YhYc9M9Ii0QchE-ZStxb}c#xmi4nmM*xnYZ%Mv}0;7A2ZvGteciS0Z
zUnmqX`(r+HSl&-G>0ziPuyYMF8Vdyt8kNzJfZvqJW7#~GVMg}^BK?l658|ojJbY)#
zQh3g)sQWL?*~!HdCE|_Ed1-35T=^`nY8WD!A8m+%vuNS674Q=JmUSf6d4}-o%kG7<
zR`a!)n(B5o7M8wDOgii|V^5HT7&9a=(|=|-jt3j73r|whc12~7>@ItlzMTMHY}^f}
z(X)IKOzV;5uY=+@e#2tej*s0?llufuY-Jc?IBnTCbVDJD5HQVPWXPlYOkaI{eSA)x
zCyXsC7GyC~uS;g~?6Ww!q6%Kc2Ud9C41ROP_G*pD{KrX=O5X<hjGG8dl?#VJ0RDKe
zV2Zl?HNjmI#7L6Zuljnj&{Kr&5_+mQq-m2x6ckfw9r64+k5AWGAPSoqp&adhK-}fK
z4DT`gkwFQW><Fp!`Y}`gMBq)5e-R>|{6>sgNV!&uNIR7~IOp2DRT;ER|Mbu(X=9|*
zA_3`I0euyk{D^w~G^Wv?n?4tPO3R0_dgYd`G_jw=PZE1HAw$A+iTy-jZNy}nh>${Q
zVkA}U2Ufr63VlHXHWE#zDQUa{bT?_Aj*qYW$Lj8cvUXY^;*YX1;whu%U|myuX-hN`
zs7G_v5^&=eS5>ZODP@4bPBDgkv4Gbt-)54B;m-`OGQ7b+^-S2Qqeh@P7%}7rO!^Cl
z@v+tUo+-9KK9NLHI;6u~$c0SxKNreD3a}bhAA?+u%vs_11N_OFrEm$gwSBJBJoE|<
ztUX1+p1-aJGOd;CHfp6#-149bYq^}8w=kT<<2Uw#3*WoZ15aQ<vkuu9Z|;L<@L!u(
zKq2OAIPavwbnNq*pE0O>IEn9XSgL7&i&}Q({gS0uzS2rz5#(sH!g{DB8=6k?{4NIe
zpf^QU&|=$7G2cHVqoXnTQy!Z`M43zA&5#_zvK0S)<9aAVPivv;%S`xBvQs{8ZOwsS
z<87@~umBIXeg?jb*_*nu-s8>bBh$$`E3zq7b1h+pCvc*z-L-(5MR>ODI;g|{ZCf#~
zlnED^soKiJt$L<4;kNcDypP{+UkokQJM9sGjks%bo9juQvJs!&ycP2B(&lQ|f>k%4
zftT_3H?Jo9c{@C0xvJfHxT_<lU@;}66Us?*p#VytT$9`Ij*j#5l_xldI=pF4!sIby
zry6kU6_i`X3YK4Hpk-4W6l-%e*=zlEOFj%FI5<AYX0eq^imBD=O_x`5WVRO=w((SD
z>~3z&Be2~Dxgi+Q1N%sz?M&`Ov$K|zdAPG4*5MC3%c0-;UFSalG~z^8CA@|wx|Uzl
zL^HiPN&`DN7K)m36?01_ZZX@n&kXwa7;-KCsjGg)I%c|*_NNVYb9)EFPM+=_j7Chu
z@1vUF8V&@9hE2(grH7^6pv)QTo(G-yneG~p)~W7B$mrz>{R|TSu)T-|({@}_e=8GW
z40{<8T$D;`#~vQwZRKvoIl3bU261XfDcp`Hc0Awgq=qslP2#HiafVcuW*d5wWhXQH
zHOtz@;P}6xH@j*uC=K(LsML)(JCdunphzf&Of8ENTaMuyxBMb=mM4qx#h${M^!;T|
zk!#u$AM}*Fb|pk^=q)a}lR5GcM;x_caBC%8C^Q+je7z6CrrUV7k70nIBc9wHjd@11
zXu4<cr5B`!j@F=mrAIdMl-GzX1Kb*9P-Vf-Egje2TGTmA&A7{K>mL}ei>KC2tZcYW
zpNuEh$09?@!rr+~2$wUP5njhZAlEX486pgO7(xtD{K~D1mnY)E?ZksJxpI-R<;Qu^
ziG?qO)3AEy9I9>0cP@ne)~=lcFwn)EeJsHkx8e*`@HsjSZhLr0d5X?HAx9a;8LrwV
z5hxdVg%U+c#?ev*PvVB%%ju>wlk^q!aw~D6YL<u+b?mvA_wR`XoqcNCS+S8dkcD6C
zPcQhF%jun{4+pIC{Q;P|928D{RxU{t++t$hD(A6)>iA_5I!M!m49*e26FBAVB*WCD
z&Rx!Ssf6R(itX|l=6Eb1lZkAn1S974jq(795+VDDLRG|xKwhMB%fui0YF(ENl80FX
zMeuJ}Gf)k8T5SV<E)QSzFLX_ZkL|C5Cj7`>RhbA06%8sb5+N~xk-fzYDu5Dkk?O>S
z+E>Mea#bRyooIMKuY?crIlYSRq;KfiKv%5a1m>^1lSRIdb;xi6Ksmv~CQ!atQJ`2S
zxF&q>l!l#y<&cl@!G(0Ath%`bzdiV>s`M4jj?>+<41T54ovYv{Sq8SOyn_Y#+dydO
zO~Fe;d2k)h87=^{77kZ|%W*w+1WTq=Xl2dX<x00Yho!o8Ub&S--c<QpxzU|)qn*h&
z)nLHr8jg-d<V7}!3eAJO<S>KdU25Ti-MQ$<=+59fX?Juh)%dlTGg1!sTXiERAV(FN
zb`B*iG)Cv_g*|mHRa8EU>7lZsdx(vl-tHTg@xXAz3``dnToc+wFNG7K`EVOvjFq7s
zs$0gM?eOM0#pLg*Bayx3O^JA!K39~Fl~XUWaw=Z-vKaJeprU7N_6-C=)5ppe!X>a3
zpAXl-!`7SOw;<&ZEeYbQ(RC!3J61cR$_=sFQdUJi#6X{rk8n$c`EkI`P+LWusj6P^
z<4L`s<YI1}pqaOmmTfJLhFXGwcaC#*rubs9(D?sfJfpmslP;7g<%<(aPbZ0oH@Zsc
zle}oAWX{1$u`~$aulCl!qn2BK8j`<A<M^ZP1#S29#D^IkCGcimoiHj_5^2K5l-Kfx
z7a3I6e2!^O)})&GFo9)e9Xy8Z=6`Q+4pbG@$9aCDjHw%;x_=ha^6u7|7mm6Hl`uzD
zt>|KTV05V9k>r`%l$H*BwJg@;Q`QHgdjU>jXl!1A#w!x%z#Q)8;)$_(s@iA9no^%2
zf}LTYZ)zS^@0&}n)9d%uz$tt)Tw;yxn}mX=dBv>LkGqYS8ef?6SzemjLXS~jBrfm4
zHSxZ=?95;fT{gX%{3o9JG(HiphE;ejUR;?dZC4$Gs>Z2uS2Wv+Ut<PeXRz?PiCS>n
z7M&=DZ{gh&HSisLX5!sVPw|9P4BsK3>ijsjUtsW%Y3<Z-z#NY1hH6NPb@^2udy--H
z4tzoVaDQ&eIX0<tiAdLS=sydxpaSHJ)`H0iSk0buV)Y!i5@*sBGgOXaYuTwrRPwu3
zrZC;PgyPQ+lwG5IsP3F<g&I~a{V6T;raE!91IwpOy2~G%s)kprZ%jP{*E@z_X3rAX
zt`MiJAr~{{CWfn>#I&dJ_#M@`%2R2Mr~CnO)ww&2o`kCko$B%&FZd?Iw;0sn^gOpz
zG41Bow;BEpH_pn%40)Otsk17f!b?nhnc-Rd`9bfrYl{yR!Jo0=kQZLF`VM77%PTzN
zHG-M^W7U;grQ$Hk-S0Adk3lh1JE=XVLx&yv-NQLB7tbFqhx7RM;kPI`esX8+$d71|
zoirFUZMP|X5o0ha!?Tj5i5<*lAZPl0CKYOWy(YAor2}v7VXx<!K1q+amLUD+WCWuT
z!%m9%ebQ9Vvwfe+DP5C7(UnV61tqK-T#&>6+BG0A;Fmu2W`olB0`a34fx4q96+dPx
z$D*hCx?Y(YGy~zS8*Xf8=FW{e9kP<#bf>1r@$2~3T~E>t;NH7y3PU`De<%>@2`zuY
zJ8i}@cQ^Hn6Hy*!ILdGz!$S-f!*dKT5!fk_(Qs_sIl|8}^>+;a!|*<XdhrXkNG3xe
zZvS-UlnSQx+%IBS$WXzsfT50|g<%WBHilk?eg?^KJHwT?SUE{Fa!tTD5S0;mlxG}Z
zxO~l#$GA)9{*;jSbMtaNmyc1`&NBSL*m%7^G-{A@?bPXpRGDyPU6C*H`pfYs&-3~M
zV^1jHlaYEo;E#r5Q6p%Q&n1VIWFN}c8QuVqCX}V+BK51>Jx8+Xcf*rMRwkuq=HrDU
G-Twz)XGf_3

delta 5461
zcma)A3v`sl6`s2xn`HAOA#Xx9gpe#GAz&geX%Lb?c!U5+ffWp!{C_q}_Qm`gNJ4l-
zC=>*RjOBS+5A}ekMa93O^>Nw?s6{D<LM#5-b401NMcQLuo@#4*@BEluQaq=dvtQ<)
zxpU{v+;8r^|9*MU^3;Ay;hz!`ZO^Q|-kkp~V;`}IPZYLi;#X@wRoD(I7*l|GN3u0}
z%sGlF_*lZc`OIf$PCEm7#^D0yEMSU*DdX5S#+;*>GM*`=vRz05N*Pni**0L#F-)1j
zlnNXYAHv2ob9x1HPGm|gQ|hRnTht7*e>z)p4|C37$}*-br=gn`NA{4R7N)G03(XP}
zqH=0+RAt^;=Bs1QI<&8yDQ-DDhbbLQ@yPaExev6vo;mB8vW6+0vVR`gy42y-4@Ygx
z7~byjs(zPG)eLK^$L~=zy*MWdY`hr{hkqME%*btuF{0(5*q?g=x^-pbY0HRS95bT1
z>Q=k?IBYwB_lxrUO4y^f<sSgpBfhkk!%i`-zyo{rM+*K8FdzmBH8`kS$21IsKG9sV
z1A4@7OJ0G8#9_yII4;z255RN!C*yWO<~~%})N8E?_|<ZoBZep7Gaq20p)V7JI9i$p
zkBH|>?O6lVpM+q<hXS3dzs0oZfVf#Y3r_15Wxu!N-cc9L;RB-Ikrpek=}6(aSXkjk
zBcHDb!T~XM;sSV0XcHTv?Dje0?8JxRyxue^!BUBjYAEFKyX~Ps(9@>11-Pn}mzVQ1
zsHVY))jB;v9$2q&pcT#%91@RI7LGiEO~V@C9@QV>%jqFbQ{6s%s;?0jD({EaMQPPW
z;QEtQS3$1&oyp_jqBu3VDCGpHsUvU`ysm#dxd}4DrYNqE3vK8fZnrNKjZ?=YUPLj6
z)jYQIqJL_^=r?I1-UDW&_*_~@<vv%4d%B4L+@P^vh_h3hGpyL8PYiykJsZE;W-)2{
zqADxvv$8da!1^pKQ;A~AFs4K^#mcsV1wu&67%39Nwpx&uu}HHxab@~5qs(JChpLQN
z)!!CS@c4S8yF%?{Q+mhS#qoPeAy)KP=al^uMYu|G(OQQqsG{d@QkR|JbArKba6^>M
zC>_~lLHkm0ghkoF2C0w&3HX->{9jD(nK1w<<d+C%oD;d#6W|LmzuIYiorc~JPgS3S
z9b(nYB8b(u&aAcMnqH$p=~P}t&1D2f#7DDMfJMxhZHE)${@DuH#F^Pn_@Nk8GZj+B
z@|uge=A5vO-_S(Kh9hED?F7qLOlY+$(=L(au{g#oxiAirEj&m6ur>*%5K;J2f;a?6
zG|$1wMx0v>wYYhpE66X?SR!`v1O!JspHCf`qGZm@tSpj^UmxI#S0&W=M>Kd$Y?)I6
z?}{hp6o6g4HRnM%CYH@zn)nGBfnPj6?$BSD8)LDSkmAGQ?7VqaJ2lh9i1~McL(H8&
zH7Aogu8~5yq@7yhsIO8SnIC|gB7VVmn5j=*;0L%@9A8*(eS$>p6@OpY2r;6rp%CVa
z8}4lJUdt&sBYxR2N-SM8YP5w^e2S8eC?l!e!?n;dGQEztSx*XVV)3dpaePs7dVUld
zY{Nbll?v&Q4S5#cAkHtkXq5+kPCVL}@wH!HZd{-47?igpAv>}xsTSU>&sjVITFnWl
zsloJ)jjWK)8iB1khmEJv+UAX;)*A%x5HygebVDPxG7#uChgv*-MeTOP^2OA>P`tXN
z1PSu-lJfk!$c_;>mqn#ovLMcq0Cr0XWFcSC^Y`g@E!_(R6*Oxx%5)7!XbJ?&@QX3V
z-hSp4KF!WAi0e&NV=Gb0ku2}FF)Zi}g!p7ye;BSGG_2hrPg|$Pr-|z3@~P8F;RMu$
zCON6SgkU*|$F&FiA+62jRSj#0>T!33I4S0Kl;+j)ynoW11GVBxa}oIUFPbZ%u$ctz
zBk&_Ql8j{6I+eTJY7NF8TwcR1FJ3mbd>M&&348=$z4BP34NWvahH$4`>&0csc}_Z`
zR9v48vq1(V;yq4AOo#EnxwyXUrCDcitbt2M;Z%xncXE;RA$i9!j4z}meuwQyB2{-r
zi%rhBf-u$mv@|6dhP29p1WO_Y6RwHTE8g#j;K>NoRwvnlk?BrLOLoxGMl>>5<tS!S
z0Vf)Xm@(=@s!v<m5m@h!uxbSvX`X{=T3U7w8AM464U&Axrr2iT>+~m`2O)kX%~(Zn
zzgW03Z?a_01(fD4>TD&DQLBYos|ko#(sUxU{BT7aU71@&?s24=xgrk-csq#`r}#7i
z55XD&H$ewMr}%8;_^L3y;!Ju;&q)8uBfk>`h+3r>Za1we##-X|s#a)SL>gVRvPP|S
z1l<To@&L8%G$b7vq{p=cT?9k)huL+FW=IpHMds0NY)OJ2h}|t!Saw9lp!~80s0`z5
z*rAtbeneC<SC0rqpRUeKf0JA^Boy`PSuPbe4^qa=8qyQtG+?qQht~KT0WIHvn+>mT
zCMFsqLK#t7NZ|@SQlv2rlNE#&GkH%?<qW%xXpcX1cO~xymW{O=ZOIlUAD4GBDAL<X
ztPyAO>FUIIDju2wjTlINCH!rL(60})wE<L!e<;OPlRu(REr3c<s}|&kc_Aqw$r0wo
zdeQICs*o%SlOx6?hqOSFL;8(qXS&vmHXo?@@aKW{0<0iMw<iHsJ`>%etNTf64{avi
zB!lFEoz6n?Kr%vV4l51wz;ygEcOKRkXWYeDY2I*uC=*Zqz0&D%5r=-e(`AXqvDRVo
zbooZIA?kmKP~Dj%7I|!$#ls>3h|EivYxu)@&{GgWhlU6~-0puzVuU5PyWdRp<nJNo
z@Q2Ad$qVW3xUjp81gbJUsth$Er86OA6UlBNh;V;(XBJEn%RBR6pT4g12qeq6G7pzB
zEZ+541=xLMg)+8$7iacn$aG{kYhHVat5pklyFw1WQv`kK&?2_`R^i=B#8Tu-UpzF4
zaeSmG^p{N}S2tjZEvxwdE4uXi^QMFebSrsDuS+*yqr;E{@}X*~{V7c@aJ8ylnY(gR
z1DKeaJU+F>>+!3UBx^)epa>4>j=*(@egNmM5~*veZM0!yn2W0}QSQsPntK&n)|TY%
z#1Z~I0{mh{L@jw6_JcU=tCt(iiavndYjC6WHRdKQVQI&3Ge<|ef<bT3H%HY@(rjKK
zyghm2!~4E<aqhJ5gEWaQPOiw&hQkU`sujVb`Vws?!1u-bp{)9SByot~Q3OZA(CZ^_
zPWaN$LGnqo(`y9s4(+8r^A0t3jevkqyUO6G5M7_mHZ!q|bB9Pi9NlF3DvRY2IK8Rv
z>(>xjN2QZS8N{#(eq)_GeSq$b45AhrP#ui7=5c-Y`n6CY6*ZA^()I@uawM4?`igqP
zX(N%eCW!0Z<@gqv)-wf*+XX$7;E34VQv@gVV?FyJ{RGJjNr@GraYJ!3QQL^A@n|8J
zzZbc@eS<TVoatGPg|WlJPtnX1;x8Ktr%>=StnD6m7gr6NqWTpcaC4V0A~Oz2nKBg0
zsF)e)*B_D2rwN`BeSIZhhQJs5vf!k+)K>&AiO>7~n)x^hoFsS&0mJ!Y)P7OGrrh)=
zsCk?qQrE;yh1ox+BQ;A0n<W|l60wLH1^iXrw`l`xCeN89eVba+h3BZXfs{$d(J>h@
zT7c7gE(hul*ToHdG7dZ9Oixg`W|qj(weJinlCz|zFOztTNxvl^+*>q!5qWP*A)M9!
zv*l@+X&wVD@+^X3_0l!gcro?NAsFg4>LYjd=210e7ppg9q+xX|{TOTdu}wusW^C1#
zhaH(=8m!ao9}_%JKpy3%31on6q}B@r=7przH+nSUrpEI#Br3C8ct1a-zMm01E5>Yd
z-14$(TPEBPJ@|ZHe{5S4)SM-W^9WyGQaT4SHA)R@>G5TPR|up+xsqJ-7K`WwDRz4@
zSVh+MJa|joz5O~~fZQD=^>5=OBdXmKGU7tq<=5H++!t|qrjW}dv19OcAcVgRpl`#w
ziQL2P9d_X2<c>m^C@$^DhRfo{jt?rN`Im7Ke94gYnzT1!9vN+?RZOF0y`Qu*HuGE2
zgE`X%iBWM_cH?{d6;ZbHNzDFNcNS%=CJFi(g2^{~ew9|4FVc2Rsp=+$TM70M>?Jrz
z@EpNug4Ym?Xn&V4*kk7K^J2@ctj)1xPbz{j9Df6$+~D=JmIr&NWmwJsCNJ(G$uxpO
zf*b-nfrDTs!F+-Sf@XsI2>b+`;P!f!_h6r~IYv`mtpV=myGW#$V6e*J57DE{>-(rR
z7{B>Zd^8fZHczmpyv^H1zu*`#w;EiM`u1qSU#0nj)Z%Z^{B*6;TcWzSzg$t<0={5C
l^MpJBKPQTCav{G$@N0sr;>?3%VYm3>gVXSbjpE%+{{=}&A~FB~

diff --git a/tmw.py b/tmw.py
index f89d46b..bba2d0b 100644
--- a/tmw.py
+++ b/tmw.py
@@ -321,8 +321,8 @@ def segments_to_bins(inpath, outfile, binsnb):
 
         filenames.append(filename[:11])
         binids.append(binid)
-    filenames_sr = pd.Series(filenames, name="filenames")
-    binids_sr = pd.Series(binids, name="binids")
+    filenames_sr = pd.Series(filenames, name="segmentID")
+    binids_sr = pd.Series(binids, name="binid")
     files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1)
     print("chunks per bin: ", bcount)
 
@@ -539,7 +539,7 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in
 import glob
 
 def get_metadata(metadatafile):
-    print("  Getting metadata...")
+    print("- getting metadata...")
     """Read metadata file and create DataFrame."""
     metadata = pd.DataFrame.from_csv(metadatafile, header=0, sep=",")
     #print("metadata\n", metadata)
@@ -547,7 +547,7 @@ def get_metadata(metadatafile):
 
 def get_topicscores(topics_in_texts, number_of_topics): 
     """Create a matrix of segments x topics, with topic score values, from Mallet output.""" 
-    print("  Getting topicscores...")   
+    print("- getting topicscores...")   
     ## Load Mallet output (strange format)
     topicsintexts = pd.read_csv(topics_in_texts, header=None, skiprows=[0], sep="\t", index_col=0)
     #topicsintexts = topicsintexts.iloc[0:100,]  ### For testing only!!
@@ -587,7 +587,7 @@ def get_topicscores(topics_in_texts, number_of_topics):
         
 def get_docmatrix(corpuspath):
     """Create a matrix containing segments with their idnos."""
-    print("  Getting docmatrix...")
+    print("- getting docmatrix...")
     ## Create dataframe with filenames of segments and corresponding idnos.
     segs = []
     idnos = []
@@ -605,7 +605,7 @@ def get_docmatrix(corpuspath):
 def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, 
                number_of_topics):
     """Merges the three dataframes into one mastermatrix."""
-    print("  Getting data...")
+    print("- getting data...")
     ## Get all necessary data.
     metadata = get_metadata(metadatafile)
     docmatrix = get_docmatrix(corpuspath)
@@ -614,7 +614,7 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile,
     #print("Metadata\n", metadata.head())
     #print("Docmatrix\n", docmatrix.head())
     #print("topicscores\n", topicscores.head())
-    print("  Merging data...")    
+    print("- merging data...")    
     ## Merge metadata and docmatrix, matching each segment to its metadata.
     mastermatrix = pd.merge(docmatrix, metadata, how="inner", on="idno")  
     #print("mastermatrix: metadata and docmatrix\n", mastermatrix)
@@ -626,8 +626,18 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile,
     #print("mastermatrix: all three\n", mastermatrix.head())
     return mastermatrix
 
+def add_binData(mastermatrix, binDataFile): 
+    print("- adding bin data...")
+    ## Read the information about bins
+    binData = pd.read_csv(binDataFile, sep=",")
+    print(binData)
+    ## Merge existing mastermatrix and binData.
+    mastermatrix = pd.merge(mastermatrix, binData, how="inner", on="segmentID")  
+    #print(mastermatrix)
+    return mastermatrix
+
 def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, 
-                        topics_in_texts, number_of_topics):
+                        topics_in_texts, number_of_topics, useBins, binDataFile):
     """Builds the mastermatrix uniting all information about texts and topic scores."""
     print("\nLaunched create_mastermatrix.")
     print("(Warning: This is very memory-intensive and may take a while.)")
@@ -635,8 +645,10 @@ def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile,
         os.makedirs(outfolder)
     mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, 
                               mastermatrixfile, number_of_topics)
+    if useBins == True: 
+        mastermatrix = add_binData(mastermatrix, binDataFile)
     mastermatrix.to_csv(outfolder+mastermatrixfile, sep=",", encoding="utf-8")
-    print("  Saved mastermatrix. Segments and columns:", mastermatrix.shape)    
+    print("Done. Saved mastermatrix. Segments and columns:", mastermatrix.shape)    
 
 
 
diff --git a/tmw_config.py b/tmw_config.py
index c0deaa9..f245b10 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -45,13 +45,13 @@
 target = 2000
 sizetolerancefactor = 1.1
 preserveparagraphs = True
-tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
+#tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
 
 ### segments_to_bins
 inpath = wdir + "2_segs/*.txt"
-outfile = wdir + "segs-and-bins.csv"
+outfile = wdir + "7_aggregates/segs-and-bins.csv"
 binsnb = 5 # number of bins
-tmw.segments_to_bins(inpath,outfile, binsnb)
+#tmw.segments_to_bins(inpath,outfile, binsnb)
 
 ### pretokenize
 ### Perform some preliminary tokenization.
@@ -100,12 +100,12 @@
 mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
-num_topics = "250"
-optimize_interval = "100"
-num_iterations = "1000"
-num_top_words = "200"
+num_topics = "250" # string
+optimize_interval = "100" # string
+num_iterations = "1000" # string
+num_top_words = "200" # string
 doc_topics_max = num_topics
-num_threads = "4"
+num_threads = "4" # string
 #tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
 
 
@@ -115,18 +115,20 @@
 ################################
 
 ### create_mastermatrix
-### Creates the mastermatrix with all information in one place.
+### Creates a matrix with all information (metadata and topic scores for 
+### each segment) in one place.
 corpuspath = wdir+"/2_segs/*.txt"
 outfolder = wdir+"7_aggregates/"
 mastermatrixfile = "mastermatrix.csv"
 metadatafile = wdir+"/metadata.csv"
 topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
 number_of_topics = 250
-#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics)
+useBins = True # True|False
+binDataFile = wdir+"7_aggregates/segs-and-bins.csv"
+tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile)
 
 ### calculate_averageTopicScores
 ### Based on the mastermatrix, calculates various average topic score datasets.
-### targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
 targets = ["author-name", "author-gender", "title", "decade", "subgenre", 
@@ -168,8 +170,6 @@
 
 ### plot_topTopics
 ### For each item from a category, creates a barchart of the top topics.
-### targetCategories: one or several: "author-name", "author-gender", "decade", "subgenre", "title"
-### numberOfTopics: Must be the actual number of topics modeled before.
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
 targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] 
@@ -183,8 +183,6 @@
 
 ### plot_topItems ###
 ### For each topic, creates a barchart with top items from a category. 
-### targetCategories: one or several from the following list:
-### "author-name", "decade", "subgenre", "gender", "idno", "title", "segmentID"
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 outfolder = wdir+"/8_visuals/topItems/"
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
@@ -249,20 +247,6 @@
 
 ### itemClustering ###
 # This function creates a dendrogram of items in a category (authors, titles).
-# The clustering is based on the topic scores of the items. 
-# Input: the average topic score file for the category of interest. 
-# Parameters
-# figsize: The size of the resulting figure in inches, width x height.
-# sortingCriterium: Topics to be used are sorted by this criterium (descending)
-# topicsPerItem: Number of top topics to be used as the basis for clustering.
-# targetCategories: Things like author, title, year, depending on available data.
-# method: The clustering method used to build the dendrogram. 
-#  Options: ward|single|complete|average|weighted|centroid|median
-#  See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html 
-# metric: The distance measure used to build the distance matrix.
-#  Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
-#  See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
-#  Interesting combination: *weighted+cosine  
 averageDatasets = wdir+"/7_aggregates/avg*title.csv" 
 figsize = (10,80) # width,height
 outfolder = wdir + "8_visuals/clustering/"

From efd5fef23acc77babfb502a3cca8996601a2ba66 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 4 Sep 2015 15:56:41 +0200
Subject: [PATCH 34/56] Bugfixes in heatmap, avgtopicscores, related to
 numerical targets and std

---
 __pycache__/tmw.cpython-34.pyc | Bin 35963 -> 36091 bytes
 tmw.py                         |  26 ++++++++-------
 tmw_config.py                  |  58 +++++++++++++++++----------------
 3 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 5abb7d2c3808db7a0455f18bed67d12402f8ebc4..4e549fd7bbf2da47311a88ea43b474dd225528bf 100644
GIT binary patch
delta 6372
zcma)A4Rn;%nf~rKnVHOFGLs*YOcFvSh6n>m1Q+DTYWN931VSLeAZr>XGhc>D=8yY*
z0RjU-QCNSr^l*_@EbZ2{v`XzccErV2j{UJMSS!-AJ?f|K`orS7sON0m+EtG0zR&$a
zW@PD}PR{q_e)s3S@4fH)KKH%frEjX29#Y*Od5Ya{Jo5hfk6wS$U0b8BXwOco8DdV;
zJ`hXBBI0oE@7RuPWb)JMq>Q2{rk!h1Z9AVq<sjpGMQi;Wb|4$B{{~|R#HBzhyI;&`
zh_Qp&Lk*uZc1Rp-)Y*}&V`^ss+bz~l-_Lf6H>W?#zAlagUtm8F+Kju{Q`vu?aUZJ^
zB(i6Rqdk?>THQe>_s}~(q=?Q;6?;&8x4E1h7AKkm{zItu5|~9sYFJD5TN*tiK5SmX
zPG)Ddyr<S)RTjqZL*mY0dEtXp43_W{V)d*jDS2X6nmr`0oxO@ZFZ9`Ou@|#F*EgxP
zzoK59W}!YDOY_t=oihmdHQ-@!Xig(LDo)PP+39Ta+*=sCm>s!cHY<4>&94B~1I~%3
z=LJe0N3jDC1)LTa<^?ML1;qve)23@t)23}V_}ij(ekc1lJ2c<U`lTpH2|BFJco8E$
z23!O{mT6CDJgNm<{3BGo05DNsk_hRB#uFif$F>i~;u?P)ZNCNl5%3@4r3D@AGf}bd
zu6c4zzd`MP0saK|1R!TGux6L4{~on}CJ5U2r{blBE7<2*|BVUeutbRazcR4wGxYf!
zpb?mMeRC*1p<w{^{|%VXRcdBBM|5pTEUEKLX#7-Mf77yBMmrw1(K~)_dWz{;N{>EA
zJU3b;9=mBx;m?%}Q`-BUIe2sPZFVK%Qig2`Kc!@x%1|+0qfChuDiOD`m$6+=C8;PG
z>Q*GXoxj|z=qlYJ9`vk=6pK|i*U#`O5g&<AnV~3=614196}Ba&^rE7aniLjsD&4^{
z@!-u{uZ8R6`3v^8llKyu1*2Ms^SNlhO*FSnVgDmmwrzBrM#UL%ye(Lb-7%d;i0o?U
zJb;E3fKSDTZN`#osHmuJR>n%`nW`4E3i|J1wRmh^=2oj&DdWZYTZd38p+|1rW8&zd
z8g^gy<f0Z<QmPD<DMRJhLt248D85*Hrq*(C>d$2aSjs|qepI}>q~}KI1WuZO|1&B#
z5SR`l6_4o#@5hTRo(gm6^mXVUJ%1BQe-M0WSA&m+nr<VNj)i+uJfib=(6$nA0q~Z1
zcj-dMlPcM{RW!C=TWrH(vE*{btHg%(vsP%1$0CNvh$FF3amljlm`(6yedYfO5rtH%
zsI_bc^QpW(`_3{iyXh!;b^}}lK|8O9N-o;){wPn4q^*UOVTq*#!6Hk$$*8y>T9)5f
zSBC*b+erPm2Gsd^i2FX^J#o+SM)sa~Y<UA~62D!3Kl{Guy`{VO6R1OvqS$zA_MdKX
zs*V;&{FZqB)>|D-DEh^um4Cxp#Vsor)XUl4fk3%60hF#orv-ou;vZHHu}h+8)l9Z5
zJ8xBzu?}%;^$N#v^y?6RTHVDeL`P>M>k=2E)#CO3CnsCxq|u7>Ma!_~=(UHxBD&U0
zo|d!hhg2K1ncl$|*9~c<4v>g+M4wpOS1yjN@l~`?aOwz5g-lezs#%@NyTuD@&R*t&
z?{rmOcI}^bZL63!(O>nFIsIyx$~R`0ubspOECL}(omSkyNi4HgQ>i29=H=8lPM|2A
z0lW(6Mpx;I>rtvDFkL#?GM0>J+k=JtHdL<@r`AnpK5=1P>+;3WV-k&3Y-OsSxl|7e
zs3pu#ZbnYBiMofKXr&+A$3c&b9oPu*&J=?w8CCp#W0T#mDMNNK)?M941qY}_wjrLX
zh<}Jih<O-#IhBZ{q=<K-ikQg<mddMgDSH)#WePwu;Id~GjCG5D>kh0fQieR>wTwfC
z>j#QKHQw=825ut-%9W}-4JIj7M0*h71KUL^BzKK#R3f^1^oF_Ep0%{bky3)5RJw(p
zSXd7X{_<JAFu>1>^F4Fd!Rvyh)>(kCHVe}6l)=A3)n)<hcG`4oH)7#oowuRNF&YmI
zXmP5v{(FpAbhFVP4~?cq3}V4}EFrow6$=(&^6N+**{L7pUO*p2xCT>6Ll1}In(5fA
zVYub`0qV`$r4v50z9!fWV@{zu#WqQ;WR0wjc%q#6!ow<w5WyXOFo%)-Z2cV8xDoPU
zVotlH1ihv&v{mDwsMa1bLbMK@JY{X~wXJuc3w)9502y?0X6a2*%iGZaRe8VE=47w0
zE(UGNpiP0BRkC8z!be!aW-`8AobUa~5-SR&f74K$?pO0{$!KuuU;$r;DI_)K#b4X7
zUF_cIYRE<CE=&!sS120&%tL2`oAFU`&Fyb&&I>OYPcq0T%evasGN>@MX(wjWICZlk
zh}1BPiMLl6T0-yMoZ6Pm)9S0aeXvw3ul_qIgtUtkl3`X2Yp6VveSG6LSy2ch1^{8P
z`i{DJ(htv4Z$5}hI=K~#qA1=dnE=2`rRvB=<d4V1ckifOD7~{R=bd~Q-Qk?l9eEO^
z6d(>r0JZ?q;?AUhVJpevIahR9t|)CJEiE_yYR&*RY^Ka~ci%L&D|@VOfDQCOYzP8J
zQOW>z5d?jF1myslq)E5pbsJzO;0pOUKa+=mR6%NFIoqy&FMC2f&_9<F>v4i#{r?d>
zW?f6u&}o-5`jv!l>6PbYA8e{x^a{*)1<Q|-q;LVr#1CyS^HO?rFSQUy(rtmvMoH3v
zPsQrnXNqLV<Bd2#Q2h=t<d9R0g=)r#0Ev`{wTSPiOiW0bnqu2!2TcvptJOs57Nu{p
zlDcJ7<sMb3CSF6D=T?S@b0sel=Tc3-TQP_fWlFQF(3%YOnL;J1%9&&a2xgwj)Lkwm
z;{rPpIn#LSwY!%XdF=**?x0Vcx_$B3L;&TEPD<R3NM7n;1y&7hf|WvJw>#;Yk=t%7
z%R>hlV|%Cr@4;&XfP;(6yvj<cD1#qOYurxNW}iKlG;Wx~vG4qE3Ftta2ud!wN_rDa
zXSzc1cy~C(HQg-JjZiWY;*oZ3Yb<2MQc2x(>#2lhweviO`e0M>><DpSPpXTE{cRY1
zGD%Yktk^AME=n?|5R;Xte!GV?FrS_8%N`7e8JjCUiA-}?VJaH62DU&f(;Dh?;U^<c
z#$Yb|WaQ0~8DcK-oL1z?1<A;hL4oMD;>M!=x3qfp#lgV_x@TT9IGeTYL;FGON|6;~
zGQ0w$ycJ$@YoyGaxLkNy@$|iD9rKG{MyD}B@$^pUuZksCo^ih%OEN9ycS123klzLE
zZ2u>|+J?PiP0U?2k78+pyJaRt*%3dKO~)Eo&iAZ4ghh&9_OoDdo+ekgSsCVsOI@K3
zKZFM9deWm@Igc_uT9R&#nq>LONh?E_v_K)E@OuDxuG2Q`XG_F};W~CWyLI?s=9AQZ
zD@|#t@okjg1`;ielHUKGM((JT`9(mNE*h8~8qibm5hKXICen!t77_aree|7R&+;m9
zDN#i4Gx%iDm~5F1H$Xt)jQgtyp18eke$KhZGmHR+OGl8zDxC{wJU%EO?^%V!MJ5=v
zWhu%HdSVH!KOReJNO*dLE!D)noeieWGyB~nDIv<Xw7IchvmnH^kZ4WRJFK;eJzJ*N
zevMlA5db|3{|-trQXXdFNMePxPzV1O)$gE{u4uPbsph(mXI-W}lupM-|MJ4+d1l2I
z7Ctk#@bRd++QN@QA4FBHsL@MVOf>6F?9uEx{a(f%5pNj&6^GH|+ki(2f}SfQOa_<S
z&WWAm*JG)2cV(D8h|yM<b&X77V?rBgVaJ3R`OooeNV4n07?gLD!>hb@R@3mFj?14x
zaP;#UNR)(W6TjZNx#AJTk36I3Zj8$wMb8hjOSf%d3*}``vMhwXk4{0a<wG_|$(`3_
zcr68N!W@4PA8q#+zlI{{nm)1|7v-akl{ksb0#K!OGkX_39v7=duV=@_*k}{`arS$o
zhgroFkR)&V7Bk9M(Y14$4^dz`+v&m_O70-H-?wvP88$DrL8W<A{xrHiA^y0taXvWI
zbPUF#BV04x5iJ?vsVEO6^75EA^v)$Xl2IkOR^@xZljxNNJS8%j>C9r%lNmodCC+7<
z*mL5~nUAY}hz>snJV!u>%nwjLCD`uTiXWkP9FQ+-_U^{&bJ%?=(Q~Ul`YWcCcM9Xb
z$R>90WMiOG%bj0BDR-3afiP)eY?JBKQ{0Fpquhad=?O4>M<<_0&9whA@R3|w_dw|c
zxsz0ml0Jdtm9kcuZuK$|-J`Sj#p`<-*_rGYdw#?gS=)d~P7s(5c}3BAGip`=uCyBJ
zV9p(N$bAE1)6U9rx*1737Fu=;Yh=jEy_>^u`yB<X`qrqtG*~vRv~)iSLC*kw0)TP(
zPXUsi*Q4|-z=~p&{-Sa7!n=r{Mpt=W=9cz6>V5?{A*S9P95;CA?kf6fFiP)dv;TOv
zm$jckkFx}qBO6;{9kx<}%uj!Yx}O82Ksl3~bKIfuL7;G-k5vl)zB=}@xMAOU`aD{@
zzp+!A=NyeRZG$nxbQwI9)CW^Mkr#LKVNvPmPP*<H^xcwdom*1=LLS)P&K8Sv`>WZj
z;^O|dW=ZW|CGm89l;)P|=8b^P0m!#>miIL8y@ges_K5@6&5CU<x-Qi*{+ejH_vO+w
z+8i`Uk+(<wrtsc()7TD3xd(6na1d|=@D$)A;CTYmo*YS}N3B!;Mbusdd<^&u@Hv1%
zuVO$Ifmuo)YH_;6V*{<}QIt%F_22T5#Y`EauS4iR9Z(Nw0<;2_0lEO)fQ^7n07|qK
zz8x?M$N+E`;kyYa-n7s_iYNI2RE+^9GAS<bDl-J@0Fcd(qVgDlS*(X+>Cx73d<5T_
zP3L%lOAr4ti|}7y)P$e$modIVACA+P&}3^w3#St4lpaHJ%Hb&-_T;YtOu(Cf8RGQ)
wn`Tb&O!YZS$X1?0kI!fK)Oh?JH~p1)Dm{}tMINuG$}`PlqnatA`+@2I3uWfc`Tzg`

delta 6282
zcma)A4Rlo1wZ8i%Gs#RQlRqYt9|DsGgb9#>4HEG|C<(})fsjBz#W2a-B$G^LhI1!`
zgg`_<(JC%E>O;l8=cBDlw65B*rCL|53tRsll2={T-X$%xWwqKr|6Z#Peedn>+%ywe
zy4tYj<J^7D*=L`#_x{el_u_r}#iP3W&z@ZO2e-Vv=@aV-cX^o}38!Y2^^!wu)-+8X
zEk8{|sjjKt*Qe~}XUidsZey3I=5~<8O>+M9S~{FErvEF^VfksGhHjH{D<gDQ>fXvv
ziKM){%B1^J*)y6lX^(81eG3iCU(J4o?v%%ZFVn-ayZR<drQWH&jf#)3%GP0LV?1Wm
zxPw{3!_Pv>AJ>=o?#5~^E2lpcjT*`JL@3!K?vW?w21+Cbc^RynWW3LawcF+$m4BJL
zoKB`LpZ7cciqBPyIYQ0OLn|Lp08ne?izns!`Q2>)$@vL7DrYZPN2lb#f^+mrYV+kY
z_44O=Xp@y~_C*pRK4=QU)Sm$Em-p6I(J}dStw}GWY8I{{dM7n_#RAGdhwh&PHUY-u
z4;BUTA3<{^pd0Xl{QaVU?<|^?3|59|bXyt5uB138E9#o){ZzEhLG9--09t}h`<z~Z
z;C;Y50O+zD{f6i^f-dnVh+YEx0`mO+keM_@e<&#;ySgG#Ls;m08Soz9Z8>)3O8Qv(
z7vHo<?dhkGegOCo@ZSKndZxX)GX1|G{VPK-LwqF17O$pHQe{{5lhalq@A#LF6(3{F
zCjf)Na+p1##Ds+b$o~eIuvJ-RWeu3d@<_}S|AWqtWbKj_<r<%OX9hpx&&|JF{x$H=
z{9n0Xagls@NlW(Ev=P!8x9yku(z%f=tv6Q-yR>f7!r5Bbt?eh;lcj}ov=N8q4|_EK
zo@{MI*G96r<<drYyjL4>X<>&Jc4}ds>*XmrB{eOahdKLojdnz|HVE>$OQUd>))Fj`
zr<d-mM0`vN3LIdQXk)Z8hm4RA3uUB!8ht7c)nDs;0b^g3zpoFLV)s^7GQ@sMnj(OX
z)qsy=^|B;QmH)QPcUeh>m)v?O6_D^uJ_O_o!dM+bN}gL*Mt7$!E}KVXnl^&%YLqO{
zdJDDQBAg39n+aLB{H1a`Y_W`rYOxSa*eZ_6sST|;%CSyfO1vg_H<bHS$X1|V#qD;q
zUYCzHY^e0|Aj_SMCnB9~aS=AfdGxIXTmbw^PHkN5d|YQQ&6n3VUY5rjDWrUIQy~?R
zD3*6Np0QJFJeM#@Weh8ZIYTSHOuBqz#kQieP?F73O)sZv%G1TPROw1DEjfmfEdUoo
z&>^P5C>I}Sd$))WB*bgzD`26>X9(ul_LW0$L58oss$?oA<P7o#q6W|uzrmDm$vcE!
zj;(TfCUsqq=dP}#8B$;MHTsr3ys9PlLm0+CP0JATQh#2Rr90<B_XG0cnl;WDX!_;a
zwO^rma&+yL)784?p<f+S0IeDfssmh*Z>;U*OkBQh4mGA?>taMJ<k@RhJ5ONT3OToF
z0~N_VO;xmBKGgJbgY7_`IqpE^$}xUyRmV8)8?`AunIeo!=dN<`ALZlg%gHOB>nxII
z*XQ}>aE_VCFD;{xd{jymy4Wab!<jGHY0Vu?!EBob()z|j`uz=qzJ(JCbT7NPL@(6E
z*3{nSDb$hHV5bVg$98uKw^jz-lPt6~!r(Um8!=RQvK6f|2Fqo#k0Y_Lv5O7b1aY(c
zaN}&=Ohro#)ys7)71y-0xHF42;1A*B<e$nI7xy5Q2_)c0wv1?6FO!9?g)_8pCWv6a
z#`&*ny%`!XQ$EvDI?4@Z2xbwU#q`2N!o;FR*{s1L5Qi;dzR3d#jf+NDYrfW(!NH|8
zGA|WqEbP)%y2dzhoyb*xxR6Q7sU=-nuUqTo#hfaCf3793WRYk5nd}MSBF(R?nZlE@
zl;T|QS=gs&YupsCJl<+jt@Ll2POr*En-<n1;(~d~l~!gV8c&L=SYl=Ju_i3%u4JUM
z&lF1`at=j99Y&Oe_Wzkf(MZ31uEBR@J$H*Q^O>>>Za{l8;Cf7Qb;V;zvojPmEN71q
z>F!C|F&^O2qCo}R^vz|nnh`HkS*B%p^<t_bKT|^yd8n9q4`iXJLZmu2*HX?_XopXQ
zkPo%_saZbNc3I7J5F`LQ04kHzDk><HMO)ATv&0Q*<-nFa>XP%fl+#YRZp+il?L<-m
z$%AmsUK-piwPO`g7|aySP^MT;F}9sS&-OXw_N_(!VQdNnt#KBWkVnsD&J%<3wykgV
zOv(Zk_9_>+{`tHGRqSIy%fZBC2p(n!IxlYJGV!iX8vSNVPkb;o$(C*DecA@O<X^6J
zmmY*ssDs!jy@XtRr||jeVmMWJ-BG%FJ7$Fd9SlvwnXU0eU;x)_pzG8#BA{q~aX{2!
z$g3<C23lPJ#XFs7h2?wK`>98|ww13`0aj>pg}OgDy)+eMDy&359%Fz$Kok&XuslhG
zOM5gl6dy>+(6+LL={R%Qai(0QT&sNWdEU2U?9{eWF6+*2n?)lj#|<6S(Sl_{uzwJ(
zVF2eGQo%$!fG*|TBpwF<Lx4-xNC($>%ur@1tLzAStKCZ{<R97>avjW6ea%0hYO#Tr
z_#%#G{aj=zn<j!Su%;^_%w!ubtk>b$OGtY!>+8(qnfOB=Jm0`S)4@H=d`x#teq7bu
z`K~PQog+_&JfpnhUS=Q%2&S8f&}Km{oLJ(}dYMz0<QQ4b$n@o6j?A!oS;8!IP}fSC
zgiua#w>^}JRMlP(Dn?7@fW(+{S<>#(l1x{sKxyETY}^D^k)1Yy%QvQ6F2_)*u?nxv
zyPEyj(#GXl^A?7dpjS3;{OagLj@fmM3eR+1VW%CJN#pe`j=qWA|3w+dRe3Tj!WfT>
zN2rT)3qM*;uB?X=hR9^G<%q<RSJVnV_^UN>Elc=9o5)t7NJ>Q%TxhxY)uyF0E)3Ih
zvUX#qX}QgKzhTS7L?Bhq{3V<VW3@~>Z40TKrZF-4^b&`MDk;w)4yFpjoy6?E)0pKP
z&%@(JCDqBF7?smO>vqPeWK(IE&bZx@;wAG{ex|c5%g!=ow8}CS{77g!KkVGv-Zh;r
z${V{X`Tg%dx)xCVK@7MHThFnRNyTh{ZMXAC?LSS|N?$sU?8KSbQ$bT?L(eQa!iiHV
z5A?hg`lph|HOYbL8upJ^9L^Fq!Z>(J?1P1uCRCApw$CfiMBM(baN10;x2l1-j1e;R
zZlsdZp=H<iHuZfrmV&vHEV-nRDZu)lV~&vMP%)-L$CVBp%VWg&)y+^mKQ$)KV>5~e
zZo-@cfQfj)E=p*re6z2D?n(W%?*YnF#J+|nS$cGk>(YPkpI23jNw+f$7c<RAIs>MP
z#lY-PhZ&C!B!l7(`EI|DI;AtV%{hz}FUUx&g0{;OVyZkIo3{YYYYOJu^#0E<JI7f?
zT{^_Z3x)u6tLQq9mI{d-tQs%)Y?!$YmB)C!5Q&%42Dv{zlO9Yx68|kZ_Vbj8JhG$S
zjZIpaAz_5%<Nec}_Wq8D+2wa|hqw>GKTUicEtMhnlPrs^o@Vb2UDXpX2dxKq<WAo0
z>PCB~dRjrSz^<Acp+q7&B))L-xXi}yHf(-Qdh_G?6~IE5Z2lM~+4&WT7s$tUWRoEu
zGiTEA)X&UYDf1!jm;2g$tM9?U2LZ<!f}Tq=P9<A9Z*i#Nax8crpwjMc$n3Nm8<;}7
z<%NNH^i5f`^S$wkND=A%7@ww3^~R;%7fX10>&m}>ljz2<QfO2xsmqmvJ-)9am(<my
zyP3DDkx7rHP7Us$#Y#^rCPCTT7!>r{;iN-L`l41zlMmT;41Zn5hDvhZL=y!PfAKgW
z<xo{Iu3#%O9OCzB^>X!VjCe%;=g{T+<>~*1X3`U>nZx&z?=fhaRI=c)R*vnSm4}S5
zvKk|1G87x;6mX7QTL|w*w(uKXP#53B(8r`0sj361TF$OW_kb`gci4!9MZ8;t`X|*g
zOEEfK<S2?&Z0nyK2;ae|?*dZtlabkEyS{Eu2|Xp7_spap$-zAz_#ef9CjdWU;G5-P
zw4agt_Llp;jphl!WL;0}ttvf@)3-}Lx1PuUaw(VZ4)I*-PkVRM=<_%ro~Mb^fb^|7
z3T4X0I3_F0jEiI>)-6yvOnPH$5{tN-&tHAJD};`85WqD;DN?gkOrVtI8dlX=n@Rre
zMw2ec>-JUAi>c9lkJB>y7_i8b43<;9ZkS>YBx?bedJQtTbGQPPa6q2hU0lR9s`6vD
z?Z-}o9eMBmp3d{U_;6;8xwBibjP286+xC-C^aH?C0614X4Nz3wiq;PS>1*=~ojWPZ
zbHodnq%P3()=ok8Jm5+BjRQgD@G0`w2a4%$@?(B}HC1)cON}pL#H$Q{&ukotefug6
z19*HE@MC~ds8&*Ijt3N8>kr+WM}_j}%@uS;e*fm*^0&$dZmDWg?iu5mRz_DOX}OXj
z6f?WxqJL7})xm`-(6jhOF3BGe+1KeUP2Py-P$Mmo&4)_q4cT|-+<ayI8?2sR43)c;
zxsztV<^c5DHmkR?!?$Mp*El9FUpM7uxcKF%g2bEh+qaGtMA3&gcul<-inrw9+m?(D
zLCXQaVZdF0`v57xNx&%v%MlysPYl@=z$=ix15h7P)88pI*yRBfGgt-u)fBJB(MU&4
zVhAnEY5#A>$THGK`2!Ee&jL&X%mB;-Gy>KGHUhQ++5rNv6Y$xW5V414oNq>`BQ9d%
zFh=YJOjK6l2p(0na1UA!suVkp<~JFvT(dKh7^>-v4&bA&l{MZFD#Aa@BH}sBnus%T
z2J?MpUzERC#%jVwXS_cVHzUbNJSGq+0{#?l0)7E_3s5ar-M)QJxo1XRRz7($JlUSS
iJcp;uQ{r*+ze11CGsTnR@p}B8S={UPRLIwEpZ)*!*1?Ma

diff --git a/tmw.py b/tmw.py
index bba2d0b..a9dc2db 100644
--- a/tmw.py
+++ b/tmw.py
@@ -322,7 +322,7 @@ def segments_to_bins(inpath, outfile, binsnb):
         filenames.append(filename[:11])
         binids.append(binid)
     filenames_sr = pd.Series(filenames, name="segmentID")
-    binids_sr = pd.Series(binids, name="binid")
+    binids_sr = pd.Series(binids, name="binID")
     files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1)
     print("chunks per bin: ", bcount)
 
@@ -667,7 +667,10 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder):
     for target in targets:
         grouped = mastermatrix.groupby(target, axis=0)
         avg_topicscores = grouped.agg(np.mean)
-        avg_topicscores = avg_topicscores.drop(["year"], axis=1)
+        if target != "year":
+            avg_topicscores = avg_topicscores.drop(["year"], axis=1)
+        if target != "binID":
+            avg_topicscores = avg_topicscores.drop(["binID"], axis=1)
         #avg_topicscores = avg_topicscores.drop(["tei"], axis=1)
         ## Save grouped averages to CSV file for visualization.
         resultfilename = "avgtopicscores_by-"+target+".csv"
@@ -704,7 +707,7 @@ def save_firstWords(topicWordFile, outfolder, filename):
         #firstWordsSeries.index.name = "topic"
         #firstWordsSeries = firstWordsSeries.rename(columns = {'two':'new_name'})
         firstWordsSeries.reindex_axis(["firstwords"])
-        print(firstWordsSeries)
+        #print(firstWordsSeries)
         ## Saving the file.
         if not os.path.exists(outfolder):
             os.makedirs(outfolder)
@@ -729,7 +732,6 @@ def save_firstWords(topicWordFile, outfolder, filename):
 from wordcloud import WordCloud
 import random
 
-
 def read_mallet_output(word_weights_file):
     """Reads Mallet output (topics with words and word weights) into dataframe.""" 
     word_scores = pd.read_table(word_weights_file, header=None, sep="\t")
@@ -849,13 +851,14 @@ def get_dataToPlot(average, firstWordsFile, topTopicsShown, item):
 def create_barchart_topTopics(dataToPlot, targetCategory, item, 
                               fontscale, height, dpi, outfolder):
     """Function to make a topTopics barchart."""
-    print("  Creating plot for: "+item)
+    print("  Creating plot for: "+str(item))
     ## Doing the plotting.
     dataToPlot.plot(kind="bar", legend=None) 
     plt.setp(plt.xticks()[1], rotation=90, fontsize = 11)   
-    plt.title("Top-Topics für: "+item, fontsize=15)
+    plt.title("Top-Topics für: "+str(item), fontsize=15)
     plt.ylabel("Scores", fontsize=13)
     plt.xlabel("Topics", fontsize=13)
+    plt.tight_layout() 
     if height != 0:
         plt.ylim((0.000,height))
    
@@ -863,7 +866,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item,
     outfolder = outfolder+targetCategory+"/"
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = outfolder+"tT_"+item+".png"
+    figure_filename = outfolder+"tT_"+str(item)+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
@@ -996,11 +999,12 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown,
         allScores = pd.DataFrame.from_csv(infile, sep=",")
         allScores = allScores.T
         ## Create subset of data based on target.
-        stdevs = allScores.std(axis=1)
-        allScores = pd.concat([allScores, stdevs], axis=1)
-        allScores = allScores.sort(columns=0, axis=0, ascending=False)
+        standardDeviations = allScores.std(axis=1)
+        standardDeviations.name = "std"
+        allScores = pd.concat([allScores, standardDeviations], axis=1)
+        allScores = allScores.sort(columns="std", axis=0, ascending=False)
+        allScores = allScores.drop("std", axis=1)
         someScores = allScores[0:topTopicsShown]
-        someScores = someScores.drop(0, axis=1)
         ## Necessary step to align dtypes of indexes for concat.
         someScores.index = someScores.index.astype(np.int64)        
         #print("dtype firstWords: ", type(firstWords.index))
diff --git a/tmw_config.py b/tmw_config.py
index f245b10..9ef6935 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -100,10 +100,10 @@
 mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
-num_topics = "250" # string
+num_topics = "50" # string
 optimize_interval = "100" # string
 num_iterations = "1000" # string
-num_top_words = "200" # string
+num_top_words = "100" # string
 doc_topics_max = num_topics
 num_threads = "4" # string
 #tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
@@ -122,17 +122,18 @@
 mastermatrixfile = "mastermatrix.csv"
 metadatafile = wdir+"/metadata.csv"
 topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
-number_of_topics = 250
+number_of_topics = 50
 useBins = True # True|False
 binDataFile = wdir+"7_aggregates/segs-and-bins.csv"
-tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile)
+#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile)
 
 ### calculate_averageTopicScores
 ### Based on the mastermatrix, calculates various average topic score datasets.
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
-targets = ["author-name", "author-gender", "title", "decade", "subgenre", 
-           "idno", "segmentID", "narration", "protagonist-policier"] 
+targets = ["author", "subgenre", "binID"] 
+#targets = ["author", "author-gender", "title", "decade", "subgenre", 
+#           "idno", "segmentID", "narration", "protagonist-policier", "binID"] 
 #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
 
 ### save_firstWords
@@ -151,7 +152,7 @@
 ### make_wordle_from_mallet
 ### Creates a wordle for each topic.
 word_weights_file = wdir + "6_mallet/" + "word-weights.txt"
-topics = 250
+topics = 50
 words = 40
 outfolder = wdir + "8_visuals/wordles/"
 font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
@@ -172,9 +173,9 @@
 ### For each item from a category, creates a barchart of the top topics.
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] 
+targetCategories = ["author", "subgenre", "binID"] 
 topTopicsShown = 30 
-numberOfTopics = 250 
+numberOfTopics = 50 
 fontscale = 1.0
 height = 0 # 0=automatic and variable
 dpi = 300
@@ -186,8 +187,8 @@
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 outfolder = wdir+"/8_visuals/topItems/"
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-numberOfTopics = 250 # must be actual number of topics modeled. 
-targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender", "segmentID"] 
+numberOfTopics = 50 # must be actual number of topics modeled. 
+targetCategories = ["author", "subgenre", "binID"] 
 topItemsShown = 30 
 fontscale = 0.8
 height = 0 # 0=automatic and flexible
@@ -205,20 +206,20 @@
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
 outfolder = wdir+"/8_visuals/distinctiveness/"
-targetCategories = ["author-name", "decade", "subgenre", "gender"] 
+targetCategories = ["author", "subgenre", "binID"] 
 # one or several: "author-name", "decade", "subgenre", "gender", "idno", "title"
-numberOfTopics = 250 # must be actual number of topics modeled.
+numberOfTopics = 50 # must be actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0
 dpi = 300
-#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
+tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
 
 ### plot_topicsOverTime ###
 ### Creates lineplots or areaplots for topic development over time.
 averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
 outfolder = wdir+"/8_visuals/overTime/"
-numberOfTopics = 250 # must be actual number of topics modeled.
+numberOfTopics = 50 # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
@@ -239,7 +240,7 @@
 #  Interesting combination: *weighted+cosine  
 wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt"
 outfolder = wdir + "8_visuals/clustering/"
-topicsToUse = 250 # = all topics modeled
+topicsToUse = 50 # = all topics modeled
 wordsPerTopic = 50
 methods=["weighted"] # list
 metrics=["cosine"] # list
@@ -250,7 +251,7 @@
 averageDatasets = wdir+"/7_aggregates/avg*title.csv" 
 figsize = (10,80) # width,height
 outfolder = wdir + "8_visuals/clustering/"
-topicsPerItem = 250
+topicsPerItem = 50
 sortingCriterium = "std" # std|mean
 targetCategories = ["title"] # list
 methods=["weighted"] # list
@@ -258,11 +259,21 @@
 #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
 
 
-### itemPCA ###
+################################
+###    OTHER/OBSOLETE        ###
+################################
+
+### 5c show segment
+## To read a specific segment, better than looking in the folder.
+segmentID = "rf0166§0118"
+outfolder = wdir+"/9_sel-segs/"
+#tmw.show_segment(wdir,segmentID, outfolder)
+
+### itemPCA ### CURRENTLY NOT WORKING
 averageDatasets = wdir+"/7_aggregates/avg*.csv" 
 figsize = (10,10) # width,height
 outfolder = wdir + "8_visuals/clustering/"
-topicsPerItem = 250
+topicsPerItem = 50
 sortingCriterium = "std" # std|mean
 targetCategories = ["subgenre"] # list
 methods=["weighted"] # list
@@ -271,15 +282,6 @@
 
 
 
-################################
-###    OTHER/OBSOLETE        ###
-################################
-
-### 5c show segment
-## To read a specific segment, better than looking in the folder.
-segmentID = "rf0166§0118"
-outfolder = wdir+"/9_sel-segs/"
-#tmw.show_segment(wdir,segmentID, outfolder)
 
 ### 6b - create_topicscores_lineplot
 inpath = wdir + "7_aggregates/*-lp.csv"  # narrow down as needed

From 4ce326c01d51cec7d5901c71bf2649d0cf3cb527 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 4 Sep 2015 17:34:14 +0200
Subject: [PATCH 35/56] Added simple textual progression plotting based on bins

---
 __pycache__/tmw.cpython-34.pyc | Bin 36091 -> 42916 bytes
 tmw.py                         | 266 ++++++++++++++++++++++++++++++---
 tmw_config.py                  |  39 +++--
 3 files changed, 271 insertions(+), 34 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 4e549fd7bbf2da47311a88ea43b474dd225528bf..f0a6ae42c3e7d25b7dfbee2347a056200d6b8841 100644
GIT binary patch
delta 3072
zcmcImYiwLc6+UzCKD_qz`<0FD_O`AXHceI`qAiWm2<kY{G&oLdNC~RUcISG%@6Fun
zBrNN$6sIqg0@buYUqDjxi-$m#kdXR=Dp14^1a2!T7AR68keW&eQ3-_l1HLo6cC5%$
z{Ndi6vuEba%sF%BJLlZ@pR-neZH@oc>1;pqlXpJ-*1w6a(x%4-|1mWG+K<Nf+Vu>{
z1v*c3@({@^$(-RoK{8LWVE9jxERrl4{(h2Wk`==rCV7gaGW-K1eUj6Lznf%0^0?tY
zN^+6ptl{q=d79*B48M;gkrY%oa)3@AA$f-6StIzA5qyCPQ&c=kCy$blRgzycvT2ev
zl1qj^#;a!%kE}j#-@L3>riNC}FWf8kZqInZTt$^L%JcnfximMQReo@ytfap$j3Ybj
z2fI%UintgP+bp%F|CWu5SM}3N{dH;4vPECHL`xwGEXvuG3sEgZ(k2<AWg(WrR0~rs
zOf4otHHgCed(@(Ni{eWdicl^}xi$)7lxxQ_9+`_$&_Ol9-6inY-Qn?Clrk2{2+1gY
zM7EKPVTQHTMzuD|b&@peTV1F%<1;6q@7RMNGl>rUT5fWCHNM-$!Gmlm<H}T!a)Yv4
zDU^d`Qn%;t5MBBY({X(`fAh{YL8Rb$QlA5th&8%Wr#+R*cn7h{_X2gCCp-(HpUx*I
z$Jv{{SD48bD+O<QlUrHKY<U_w4zXK@j9c49pV%g1b{vj%#4Vy*#4Pp3>Ob<QM15$t
zn^s=Rm>4X0E%TBp#R?CzMPFy`MdrT2+;bp{ccD933eIHH`NAR&ms<2Q-kHlQ3x&W_
z?rgb|P5bWUR~|-3dL?((I~(9Uh1BA_x|26~3Zy!OdHVOo_$^&5Pd)6X7fRUx8{Emt
z03%Xpq+SrZx0E}oUPLd?r?M~_q3YenaBpiz6tX4HxO#t~oK6+|1IhgvujHMrr~{1<
zTsNnGS$ZU@UScf=^{eUrJ-7WY+nr?nqtH*beyTh&@>{G=hq#ZY4Xp!h0?j|S8M^8J
z4|D-kfEHNY482Ho;?QC#9U#hFy_Vaz`@f?wsW|7T*O>eQWD2-#Lh`0t6K(2scFqR!
z)L(eo{Y(%nq#o6s3)|`jimAc}Lb1Y5>%l^@H$hQEEcF(<FUk@;>^kO+`}RT-m{f#7
zABI8^AYkoQoZ`qiivpYKkV1$IVWJ#yNSqv+nSnEaiC_fqGT9<PjzB-gK|G8QmK|6t
z%y5*X!%;0eH--@OT{jHHh_^uO!?13O%O2v~#LIfI%o4Z{^z$~UH8Zn=WFKb}*v2O)
z;Ybdrk6c$P<@oQJk?vO4%;@3NhUrVN6DRAZ;aUvl4$$yYyV<puYS1{yDQ~&IWq%*r
zB;*iM&#>NEytjVS7IE#)mlaYFg41nA)&~dq4&24&Am1{c-TBL}DEEe_&ZhMri$nLR
zHyJ0_qxl7NGuYHTjSx^XR}CdArHr}<lBq<$;xeF2GJKP{IsM^>cIj|=K#$FLf4oD@
z@qR8#*#0Io4JvuP@|kplG`h_p4{el~O<`VN5A!WCyPh@9F=AN}*<DDPaeGt@i+;T0
z!VyELQ#}~%LtizZFCF8Vc-Z`QG=t!Gz%&uecMrreG^{%Lw{pY+p?QncXBpGDX$Cj(
zedDfy&gBsCWm|mHHBgj0KpRyqLdy{V>73ajWVTpa-@>K<$j2l^i2)qS(%A=j<-vz0
zXsLt2Q>!!PO$SA8Ln8*88j(#lG{P8^#-`FJiBpf$i4u$g_F)AFB)F*I5MMKiSattK
z-f7jj7TBbB+AEYRMXwaN^)d>98zA8?rV8$+FM?!s^hgSMYr&K2;!TAu*=R%3_R@hT
z8#cIxjr%houI`vFc&L$XQ|_kR)2TvMGF|ix;&&9w(%X@&h7XoY-k#(XGMmBK8;loD
z;A)mh3iqYPm~d7y7n^X@>-w?9T~V&QK%f4y8k%bK`Zt!{ml!U4u#`gWsI(EhHr^QC
zi0A}lBKWXr{$C#-u-b)sTVI@t;}dDW*sh*|lArTVH+~GQzL))~2(Op_-BI;5%=}GQ
z|1LkW`mg*&F|bvQ);}mt4y)(c@aqwX-g-TantJ;8u>P(Z(Jz-)+*XkAf}K3gkUN<B
zCdRIW_4)Gn>i5b&6$$k%3?zj5cJuGz7EAEmO7DlYK6PJMU4k6eBy>o9$7+i};pc7j
zve1pvsNUs!QMG~z{|xKz`{VkDPyVWRFajT-A)z7Agu1f&<HtIMxT=5l_=S<XSZHbq
z+{f<i=GVGisB1d^#Gl1AeeB7}(FvZ_bYr{dv}3}t#)Nv8?LEG#zqBkKXl9w~5ZJhL
z^)?^pPfV_WGzR@V@TE#g>7~oXO4-kHWmQ*s!ZjxE>WK@ti%U9p;fb-^9LE`OhMYcU
U#QBiZf&W2g)Ulk-o&l%xKSqS#Jpcdz

delta 280
zcmZ2-p6T~YCgpcLyj*t{y$pSKn}Ojm0}@~cvK@f9xM-uY5wj};LrOYBjvGU+J42K^
z17k`CLrNw?3j;%x2SZ90LrON7?a7b=RL97W<HZo=#gLN2kdg})_XY_DYvyehVv%p&
zEVzh^k@4bWnZ-TQj;ufn1VFYjaWQf-vN7_5Ad?zn(T&N67e8gZF?r<@S0+E!$sd-8
zu-s&1V3=&M@T^z?NEaJO7o!lP0AtZzpoFH+X0D~pjJmf(i?ef56N~cnQc{!ia|`l|
tGfOh_^NKD4)fC+T5g=oVZh?rqlW#0jW9*y!e_02oxPS-`FOLEb695u(NqPVP

diff --git a/tmw.py b/tmw.py
index a9dc2db..c2f728e 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1318,6 +1318,251 @@ def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem,
 
 
 
+
+
+
+
+
+
+##################################################################
+###    OTHER / OBSOLETE / DEV                                  ###
+##################################################################
+
+
+
+###########################
+## textual Progression  ###
+###########################
+
+
+def get_progression_firstWords(firstWordsFile):
+    """Function to load list of top topic words into dataframe."""
+    #print("  Getting firstWords.")
+    with open(firstWordsFile, "r") as infile: 
+        firstWords = pd.read_csv(infile, header=None)
+        firstWords.drop(0, axis=1, inplace=True)
+        firstWords.rename(columns={1:"topicwords"}, inplace=True)
+        firstWords.index = firstWords.index.astype(np.int64)        
+        #print(firstWords)
+        return(firstWords)
+
+
+def get_selSimpleProgression_dataToPlot(averageDataset, firstWordsFile, 
+                               entriesShown, topics): 
+    """Function to build a dataframe with all data necessary for plotting."""
+    print("- getting data to plot...")
+    with open(averageDataset, "r") as infile:
+        allScores = pd.DataFrame.from_csv(infile, sep=",")
+        allScores = allScores.T        
+        #print(allScores.head())
+        ## Select the data for selected topics
+        someScores = allScores.loc[topics,:]
+        someScores.index = someScores.index.astype(np.int64)        
+        ## Add information about the firstWords of topics
+        firstWords = get_progression_firstWords(firstWordsFile)
+        dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner")
+        dataToPlot = dataToPlot.set_index("topicwords")
+        dataToPlot = dataToPlot.T
+        #print(dataToPlot)
+        return dataToPlot
+    
+    
+def create_selSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, 
+                                topics, dpi, height):
+    """This function does the actual plotting and saving to disk."""
+    print("- creating the plot...")
+    ## Plot the selected data
+    dataToPlot.plot(kind="line", lw=3, marker="o")
+    plt.title("Entwicklung ausgewählter Topics über den Textverlauf", fontsize=20)
+    plt.ylabel("Topic scores (absolut)", fontsize=16)
+    plt.xlabel("Textabschnitte", fontsize=16)
+    plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
+    if height != 0:
+        plt.ylim((0.000,height))
+
+    ## Saving the plot to disk.
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    ## Format the topic information for display
+    topicsLabel = "-".join(str(topic) for topic in topics)
+    figure_filename = outfolder+"sel_"+topicsLabel+".png"
+    plt.savefig(figure_filename, dpi=dpi)
+    plt.close()
+
+def get_allSimpleProgression_dataToPlot(averageDataset, firstWordsFile, 
+                               entriesShown, topic): 
+    """Function to extract the scores of a single topic from the averages file for plotting."""
+    print("- getting data to plot...")
+    with open(averageDataset, "r") as infile:
+        allScores = pd.DataFrame.from_csv(infile, sep=",")
+        allScores = allScores.T        
+        #print(allScores)
+        ## Select the row of scores for the current topic
+        someScores = allScores.loc[topic,:]
+        someScores.index = someScores.index.astype(np.int64)
+        dataToPlot = someScores
+        #print(dataToPlot)
+        return dataToPlot
+        
+# TODO: Read averageDataset only once (not once per topic), then select the topic's row when plotting.
+    
+    
+def create_allSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, 
+                                firstWordsFile, topic, dpi, height):
+    """This function does the actual plotting and saving to disk."""
+    print("- creating the plot for topic " + topic)
+    ## Get the first words info for the topic
+    firstWords = get_progression_firstWords(firstWordsFile)
+    topicFirstWords = firstWords.iloc[int(topic),0]
+    #print(topicFirstWords)
+    ## Plot the selected data
+    dataToPlot.plot(kind="line", lw=3, marker="o")
+    plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20)
+    plt.ylabel("Topic scores (absolut)", fontsize=16)
+    plt.xlabel("Textabschnitte", fontsize=16)
+    plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
+    if height != 0:
+        plt.ylim((0.000,height))
+
+    ## Saving the plot to disk.
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    ## Format the topic information for display
+    topicsLabel = str(topic)
+    figure_filename = outfolder+"all_"+topicsLabel+".png"
+    plt.savefig(figure_filename, dpi=dpi)
+    plt.close()
+
+
+def simpleProgression(averageDataset, firstWordsFile, outfolder, 
+                           numberOfTopics, 
+                           fontscale, dpi, height, mode, topics):
+    """Function to plot topic development over textual progression."""
+    print("Launched textualProgression.")
+    if mode == "selected" or mode == "sel": 
+        entriesShown = numberOfTopics
+        dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, 
+                                                      firstWordsFile, 
+                                                      entriesShown, 
+                                                      topics)
+        create_selSimpleProgression_lineplot(dataToPlot, outfolder, 
+                                          fontscale, topics, 
+                                          dpi, height)
+    elif mode == "all": 
+        entriesShown = numberOfTopics
+        topics = list(range(0, numberOfTopics))
+        for topic in topics:
+            topic = str(topic)
+            dataToPlot = get_allSimpleProgression_dataToPlot(averageDataset, 
+                                                             firstWordsFile, 
+                                                             entriesShown, 
+                                                             topic)
+            create_allSimpleProgression_lineplot(dataToPlot, outfolder, 
+                                                 fontscale, firstWordsFile, 
+                                                 topic, dpi, height)
+    else: 
+        print("Please select a valid value for 'mode'.")
+    print("Done.")
+    
+    
+
+
+
+
+
+
+###########################
+
+
+def get_overTime_firstWords(firstWordsFile):
+    """Function to load list of top topic words into dataframe."""
+    #print("  Getting firstWords.")
+    with open(firstWordsFile, "r") as infile: 
+        firstWords = pd.read_csv(infile, header=None)
+        firstWords.drop(0, axis=1, inplace=True)
+        firstWords.rename(columns={1:"topicwords"}, inplace=True)
+        firstWords.index = firstWords.index.astype(np.int64)        
+        #print(firstWords)
+        return(firstWords)
+
+def get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics): 
+    """Function to build a dataframe with all data necessary for plotting."""
+    #print("  Getting data to plot.")
+    with open(average, "r") as infile:
+        allScores = pd.DataFrame.from_csv(infile, sep=",")
+        allScores = allScores.T        
+        #print(allScores.head())
+        ## Select the data for selected topics
+        someScores = allScores.loc[topics,:]
+        someScores.index = someScores.index.astype(np.int64)        
+        ## Add information about the firstWords of topics
+        firstWords = get_overTime_firstWords(firstWordsFile)
+        dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner")
+        dataToPlot = dataToPlot.set_index("topicwords")
+        dataToPlot = dataToPlot.T
+        #print(dataToPlot)
+        return dataToPlot
+
+def create_overTime_lineplot(dataToPlot, outfolder, fontscale, topics, dpi, height):
+    """This function does the actual plotting and saving to disk."""
+    print("  Creating lineplot for selected topics.")
+    ## Plot the selected data
+    dataToPlot.plot(kind="line", lw=3, marker="o")
+    plt.title("Entwicklung der Topic Scores", fontsize=20)
+    plt.ylabel("Topic scores (absolut)", fontsize=16)
+    plt.xlabel("Jahrzehnte", fontsize=16)
+    plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
+    if height != 0:
+        plt.ylim((0.000,height))
+
+    ## Saving the plot to disk.
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    ## Format the topic information for display
+    topicsLabel = "-".join(str(topic) for topic in topics)
+    figure_filename = outfolder+"lineplot-"+topicsLabel+".png"
+    plt.savefig(figure_filename, dpi=dpi)
+    plt.close()
+
+def create_overTime_areaplot(dataToPlot, outfolder, fontscale, topics, dpi):
+    """This function does the actual plotting and saving to disk."""
+    print("  Creating areaplot for selected topics.")
+    ## Turn absolute data into percentages.
+    dataToPlot = dataToPlot.apply(lambda c: c / c.sum() * 100, axis=1)
+    ## Plot the selected data
+    dataToPlot.plot(kind="area")
+    plt.title("Entwicklung der Topic Scores", fontsize=20)
+    plt.ylabel("Topic scores (anteilig zueinander)", fontsize=16)
+    plt.xlabel("Jahrzehnte", fontsize=16)
+    plt.ylim((0,100))
+    plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
+
+    ## Saving the plot to disk.
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    ## Format the topic information for display
+    topicsLabel = "-".join(str(topic) for topic in topics)
+    figure_filename = outfolder+"areaplot-"+topicsLabel+".png"
+    plt.savefig(figure_filename, dpi=dpi)
+    plt.close()
+
+
+
+###########################
+## show_segment         ###
+###########################
+
+import shutil
+
+def show_segment(wdir,segmentID, outfolder): 
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    shutil.copyfile(wdir+"2_segs/"+segmentID+".txt",outfolder+segmentID+".txt")
+
+
+
+
+
 ###########################
 ## itemPCA              ###
 ###########################
@@ -1372,24 +1617,3 @@ def itemPCA(averageDatasets, targetCategories,
     
 
     
-
-
-
-
-
-##################################################################
-###    OTHER / OBSOLETE                                        ###
-##################################################################
-
-
-
-###########################
-## show_segment         ###
-###########################
-
-import shutil
-
-def show_segment(wdir,segmentID, outfolder): 
-    if not os.path.exists(outfolder):
-        os.makedirs(outfolder)
-    shutil.copyfile(wdir+"2_segs/"+segmentID+".txt",outfolder+segmentID+".txt")
diff --git a/tmw_config.py b/tmw_config.py
index 9ef6935..2978ca2 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -131,7 +131,7 @@
 ### Based on the mastermatrix, calculates various average topic score datasets.
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
-targets = ["author", "subgenre", "binID"] 
+targets = ["author", "subgenre", "binID", "decade"] 
 #targets = ["author", "author-gender", "title", "decade", "subgenre", 
 #           "idno", "segmentID", "narration", "protagonist-policier", "binID"] 
 #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
@@ -212,7 +212,7 @@
 topTopicsShown = 20 
 fontscale = 1.0
 dpi = 300
-tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
+#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
 
 ### plot_topicsOverTime ###
 ### Creates lineplots or areaplots for topic development over time.
@@ -224,20 +224,11 @@
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
-topics = ["25","60"] # list of one or several topics
+topics = ["25", "44"] # list of one or several topics
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 ### topicClustering ###
 # This function will create a dendrogram grouping topics based on their word weight similarity.
-# Parameters 
-# wordsPerTopic: Number of top words for each topic to take into account for similarity measure.
-# method: The clustering method used to build the dendrogram. 
-#  Options: ward|single|complete|average|weighted|centroid|median
-#  See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html 
-# metric: The distance measure used to build the distance matrix.
-#  Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc.
-#  See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
-#  Interesting combination: *weighted+cosine  
 wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt"
 outfolder = wdir + "8_visuals/clustering/"
 topicsToUse = 50 # = all topics modeled
@@ -259,10 +250,32 @@
 #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
 
 
+
+
 ################################
-###    OTHER/OBSOLETE        ###
+###  OTHER / OBSOLETE / DEV  ###
 ################################
 
+
+### simpleProgression ###
+### Creates a lineplot of topic development over textual progression.
+averageDataset = wdir+"/7_aggregates/avgtopicscores_by-binID.csv" 
+firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
+outfolder = wdir+"/8_visuals/progression/simple/"
+numberOfTopics = 50 # must be actual number of topics modeled.
+fontscale = 1.0
+dpi = 300
+height = 0 # for lineplot; 0=automatic
+mode = "sel" # all|sel 
+topics = ["25", "44", "12"] # if mode="sel": list of topics
+tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+
+
+
+
+
+
+
 ### 5c show segment
 ## To read a specific segment, better than looking in the folder.
 segmentID = "rf0166§0118"

From 1c915d1f4593b79319e70bcaed37895ef44a482b Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 4 Sep 2015 18:00:01 +0200
Subject: [PATCH 36/56] Added complexAverageTopicScores

---
 __pycache__/tmw.cpython-34.pyc | Bin 42916 -> 43742 bytes
 tmw.py                         |  28 ++++++++++++++++++++++++++++
 tmw_config.py                  |  33 ++++++++++++++++++++++++++-------
 3 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index f0a6ae42c3e7d25b7dfbee2347a056200d6b8841..2d0224f702140357fbc0b5e5ed375c9d3a83d8d3 100644
GIT binary patch
delta 5804
zcmbVQ33OD|8NUC^WRgs_$ueY;OvpwcLrP4d)F4YhBP0+)FpEhnfyulHnJ~<RI|B(Z
zA*>P+6}&}65JeV`r!9i7QkS~a<<z#GS{1ixwUr)d^&GM1v@X~l>HpuCWQG_OP0ssr
z-(CKD|Lwc~%aK==_uo|V&&S1^mOk@g<LpRae`jGo5!jxKkN>li`5aoNGPQ_pWvn%e
zsf?+>JQG=0Hfzn1uRmg*T(&xowdOGx9?oa2`AnV0)MB}85>u~WYKd$Y(0a9$sby>{
zi1RRzvVf^oatJ$h>1ASr;+oLH)K=L)YovcR>zX4jDE4#*Q`_axb>Ty-wL;8{a193J
zPz_VpGW7=8E@$dGrZ&j-45s!lwMn+GVQMc^n`L|Eh^WP^bum-dGqq3l*UJ8znP&-G
zUB_DMP|1F#4#=^kOx?iLLD{~66sxyy{7_3VTs=F|gyxGEgPsXu;%1Y`pHn;7Z(v<9
ztRJ<HW~za00T_s6{gJH8!~(IbD-L_ufRU+k&Z;QJPq2O?>x!qImHH}^TxDcl#sZ8+
zaS5onIDrKcS-?!vw<uCj6jS3^e-uko<5}83H0w`LSbsD&Q9Mdb66T7`cpR1zqVV8J
z;`)lJtYTA*W3AWO=~8o?4v%xK#}RNXcluU$dtANq#Nmq4G_C++d^)Q4B^32#43A9{
z=PC+}lPi5*SFwnnn-h^^PZo=3XNbDFDXVw{Ein*aq%m$Jh$6^9F?@(1i@=3oH}Nm1
z9Zlmo)T$=9KoCQHCWN5bt-8DccZb`>{Q>UwuH>;a7ANMdNLif!^`ozIamPwmgRk4|
ztatjj%YQ<_@Scc5nPP%uFyk{xu^`4lrop6GApuNEI&iZ#XWsM(NEU(0R`{cMweo7n
z(4wyIft3G|glH)t6Q)9<!t=D<^Rof+#K{Gz$rd_KDuO+hFQUU|i?<idg&bk6s!$9O
zqNOSa3PoSlU*l6S-TWwwu?SwObyP<wP$ImGmKY03M5cIQ(UnjtK3_BgvPEf4im{Bk
zXNiS1KE)a#o~oG&Rob~4FF>_utgS^nQd<XQ;?vrEs29cC)3vfCiGUiQX+&dPqGciN
zNAbxblEDJ$kfrb@ZGBxQSQ<u$lmxVeRY_L(GVQaalOQRXc6}QcaT~Vg+fCxm7K@l$
zA33FoMo!a6Gqoln1Y`WJz;d@&b@ke#c?)$n3Qv6@o?=gZadsPRld7KpO&z1eL5^Yu
zE4l&A=+w^C{~VxP+}YSzGz-_-lXx77rx*zux;?%Cub@o~xJh@=*c)&=SNZw1BC%=K
zjO%ExD{vz2+fMCPf)zA9ro-nA_%SwIL1U-Oy|OdFNxB|n3#57WHf2D!c(iE>^k~7R
zs~~?FO>h$QAlMUvi4HxKDjfj_Ztq9EN}DGaFQ+MM3DyyW<VxM;8Gb?obP(Py_qwM!
zG3ypGL^2t|V1fw{kI|QiXn`W&H;WgWpF@LfYbi1}(z0_nO-}~hmn}CX<aW3*{%`bg
z)qgLEA~P}csukjtcrXF)7fS0tz$QLwz0$6`0VM_F1HSI+fNQnCzSGy^4HN7L9k1mk
z>%^lZGeMUr{gh6rqVPeS=8P6|dsDT;Hw{88G9M-{L5G;OEbHoWYQ9RxbyKU0fEa@#
zk1^g!u#{f0!B*lUf4E0%T9$c@ba-;e;oM8p$>021f;9xRlKTj{2?7Luk#vXknqrj2
zLqQOu2Z0P;=>=)T@f@WGz|6LE&_rX~6xgT@w6#O~jU?JZLi?#jn)8ha_C(%G?Hn4C
z?paT-eFPf_#_SP_gEO>39z&jrf*}`vFPtl;!zM~s@#u=z3=JqkB-CZa>9QpS>Rzu=
z<HesGCd+@wH)FDRv-XK&Hze6`@qC?IKc&qY#BOJfc?+GYf!^_&#WWSaapu`JQ)e3j
z((Y({a2d_(#YN{;#?dpTh#EBwz7nch0Jmy8R3~NVdDj%9?o*N3kqfiL?2g>*kgueh
zq{BkKl5Q%IbRTk4l<p?EuXK~N9)&`9cD~b*t=J6WLPrkl(K1$+LdC5#V-FoaR(FT=
zL=NdEJt2J}rG^BCJfY{|eVth_T|Ct}1@6|0w<`b*qI77IZ7YeE*VQ(XAMrIu%;>Tz
z4F+*&C>8c<on3RmNKUiIkMLrNg(Ni7SuT~u_tJoLg0y_h2tj_a%4WWW#)b&OEI*?w
zAy!4gY4Ft{`7IZ9o-Ej>c|H39RAKVYG|GhACEnj@OWTcWgQkU!PTuG0^afnRi6=IA
zH)4EU^j4RWS!?ZPJ;g33yD+h`%0tc?CRS->X;vB0(pf})!B}J$x6@zaXm@$Io>rzn
zlF8%tx`?~i7{tW``3gUz<#zuQV23bacB^6`FC0;$mltM`8vH&2d>DU_S~AiOiYK`<
z@_u~Zi`v-49RIWmT_Ytlb@;{zMmoB?J?p=jLL_nc#B0m4N@OV#QgS#V$5ZkFI?IS&
z>B=nz5q}8vc}QCrxE0!u;%Hx_EQD(9k-CwI;rTvVc8K611bf`rfR(Wt3R8KHl+i+I
zUKzFb&}uzutJbB$0kLe|G<aCMZ5`g?HXLi1Z#U{Y607nonm$oH(X-m}tYL&Lu-u_?
z#j5Z}w3)rT!5m^Z8L+A1FYAlpxQOj5hhK=wzA|`O^z;?LV<NXZT|3n`0G1OZcMPAG
zi>m%9iMwzl7**-^2OQo$zFTbQZ%roWxSO%Qu`B!uT6scz)SnL(B4NOq9pZ&C<Sq$9
zaz$FWQ+`PTP7(Y{1O^I0cf*MRD+I;w1`6PL@yWoy(;lG-j}bhNfYkOgYCk6~Y{;~n
zq~>vga9LCv%^Ebrd9ipf4PMe*gFgk>BEH<1i<hTulWnrJ<vV26EwomauvF>D!p9z^
zJF``^Z3-yHNO5{o9=xJ`zUespSU)Vyd>kQY^iW#(`4!Z&gkY>8sqcjN^Ok(Gv}Uw!
zO{WXZnXuL095Uww-PKRgf?pFnO(5C#8EQ%DX{Oe*1eamzqL8U$`5$PJB;$|@zoovH
z2_6@=ZT4aFw`@yOtdW9mD}>jz{o4|t@)er$I>JbplkPf|OA(TsU!cAh38X~1mE3gL
zzvz>@wkN`s!n{2T-neXTUNck-H;M;_EO1&oHRK{bx9ynJCWCuCHa<-!2xH@5!lfmi
zPW&?UWG2eHcwCx3xnmMI#albF;VmTRH6T{*v}VZKXdwm8y|mE*YVDUTv2SO^2<0Jj
z!*224o$0ZX<ZztvgChHOk7T+bvFrA_AI94yA(ws!Z}Tu-TXRPN^oYHP?`luqQ3e&?
zWIe31-y~%k38aleRcgpaI%Un<H7WD<NaP_iYR4kzvkPngh;&4w!h_nzT^f9wxfb75
z5@s&1ydmDEeM1>^5MM_N%AmpF@htB%6nEq2A*D^NJw;C$>2mqSvYZTknRlSIZcNn@
zt5+>KBx5ZdL|Q=_fVR^=1?U<NCdw;u{%EltZtEdY0QZaI!Vd3g?}$^-{1&Z!mmoyV
za_1oRy@!BV6|%bA{@X7P>6mtLZqHPBQY76y13n1LrmHTkPA;u<Xve9fpM)Z6QT1{L
z^D9+*a_{W1MUTfqE5=JRS{9yWVdds`<kmQ6*oS=|M~bDMOzq&ld%iVl>h|Y_MNL>;
z5sDfV7M^X1BKHBC$n;J7VHsETeKT$x{isa1sC{+dEEH1ZV+Vd%|8r!V$y@RGR-=gJ
zbBvN~j*2jyFSC{^K%J{+>q+wUI*K2oZND*10yHYCsOs-qMUA5lRY$2ttf~H$RAxlB
zz=tv93N!GtTEc_zP<;+(1S2}!fnZF4JG}l5A734oE9K-)x(0($__ZIv64)LScAJIY
zVd~+^%zD~HQWBO%F<`_maQsy!1OHgOaQJT}@-F)r#Ua=A;vfCXB1{VP<<Oq`4MrdL
zF;Q;(yso=$-U{)_5sUFWt*#fQqjwt3G&@7wdo+7c-qe-!o=H$dkVjBRP)blmP*2cA
zu#CV-&_i%DK|jGD0ezP8O$eyB%hB%RUVayKZ6+A~QsQ^hD`sQz*pie#EEXQK4a$1r
z(h7>dL~BMbb^ZpuN)iom)hIdf4`}oZ!C8c$#lOnqa&T|4>Y{%Z`uy&I+lT*jpb2DF
ZPSy#Lj5vEN9gc{L$L2tmnDtQS{{g4;B#Hn4

delta 5331
zcmcgwdvKK16~E_8vLOk1Vn{a0HVGjnOUN36F$94SiAV(V2F#*_O}=GW*hg}AlaRo&
zk)o!SSG*boe2Zui9Pv}DZLRMawVeXhO1rgKaI7P?jMIls$IjSu?zd#u2x9%CVfOc(
z`#SfYbMARuPW?_fa7-yWH#s?R;Dt|?pZtok4_VY-4AKkn=R13}h(iZsDlnIct^WbD
z<}($WGnlJ@xeJ-KkO8%Ecp9@#V`>>wXR>XKS&Nuj&Qyz}r_%)W3Z_=DZNRL>Or6El
zN*ogBATUK=K8sl^nR*>lmr(x}#XeQ<pU2kEXVwKw?O>{vhW@RXjT44!Ozn~zEfQD9
z*n`cIu4byAsjZTp!_+>ewoAH(sr^h{A?dkN7(DzkW?jbA0j6%0{Wr+|LFQ^??j~k!
zLLrBkx=D^TGj%gl1Cnkb!P?AQJ|W#Tb}RH@T3Kw_d@WFKj)7G1*mYJoDwHMFkfqIC
zG5}LPz)8MIG*8Ke8IY>*eC_pyd??8W#zF}WyT8-Hy?s6WU7W%*u#=}Fn3MQY>c|s`
zOBaGkEL~cwe5{D=OADY_99jB7^5?`?9w3HWw7tvX6(|=EH8vTFiAA>fym2;I#Ehnz
zoIL6{L(C+{0-~;<z8Vo|@+w~{;#|`VxK=YZdjJ-Tk(NfplP%4VDN0+5pjp`NFlzSJ
zR6z04>!*m3w$#jWyfMB+<iuh!AQOyWQh1woxNRL|R*w%UDUbo#N`}I%ntAzjNKd6*
zPtyKIq!ybK#c5AYNgMS_>6a6gix5iip<bLG)xOa@iLapUR`I}!Qar(%D=bhazF1*`
zHCpD%!vIy{)f<<W)!-a+I-g9;O$edb9+%h8=Ml45WY!ZhZ1g+3H~4t1xTa%X%|haN
z1x`f%okXuBxS6IWto3^QzHXb#9x|-6J00u%oVfD>;$1IQ@ODQIw26;8O2DrrTd#y7
zD@|BK;72g0gi>w&c5ZXn>ur7;ZtuhMtE(=z+(c72fsY`<SIQ{Q&_M%q5Z)>G`qipb
zaEd>!%7u;Mi&Zb93ZB2IEUlF${Dn@`K%iS@`|6aWwNCEyuk>=&7ZCY3XTqSUy7^-;
z3e)P@W?hlEGL-E1_B8nIZeQCv?|>(Y(dx*?+gE4jjF2%?a0eEXp=3jXVnmasDSR`C
z2d>W6-di1nBpWU1BIp)t)|i4-L>{A4ttHAqP(@Aj9meehE!0GUIq)a{xD%U027g0m
z*I@ZjeEFui6|>dAP^_iL<B)1HXGDz4HxLuDEw3f;5O@h(z$P*7CbC!jsoyl$g4}t;
z*a^C^<wca*mMXlELFxtgaz(!_9md3ntpozvQ?@SXYA4n<Vme6FW`Y2MIhFSjT|h(9
zIQ`TdAlO7OVULKtrDw`x$W!S?zr8yJj)<}DD%e8qCqC*v5!-@nL|vO{vd%xs?m<U#
ze#Gu4#~m=@n?Y@weH5~c$YL?ip)>xAfLJz~)5s0ehUt{;)QbTg-BA2<?X<igb*@4%
zCx+dSI^-?x7nP1H4N+$fI<nxj*yAXM+q4%Q-Q<~t&Ju%eb<yZ7gc{M~EX;}6O`1#^
z5AB9#Ba2m#_eNPPUbmRsQd&$ZGU7P8Lnf`ySH6l7GuIcuo!SlSXF=_4#PLo#d6I4!
zX_NwDE{!71Be_NxMvS6c;rIp<WQ%hfN<e6~I~0Hxam&8xdAAVjID)OjKjs_uaJjRy
zs$<b}C>*i#R=`lm@6z_W7ZRWMJ^2+G#NtLe(bqZhyJ$dKL8?CCd;~1>=B53R#%?8u
zQoYNaVpcKIw0oN|>Rm%yUl!gPoi@gZTTg~UNJR(cW7<RB{Qx#GzxQf`^wk|AeRp2g
zPUIO%yxz8s582mw{PwWVile>5po&V~FpHFqw?+5EZ*#$M4}8j*Jzw6ohzEwlZ2^fb
zH81bKv?qncP!dKaXSZ*et;_DhXbZlX&@L>#Vz^%%4;VGK?-+nv%<rp32?Bkk=OuWe
zuQZ1w;QI*h$M^$8$=kPI{H?D$?p|yUiRJwb^|~NRWLh|LgyL*HJ+8rvhnHSr8NRD1
zTg3#~!guvTvfW23Nu24T=aET@Ggd4-P^9n!T43Nu(Dg7KEw0|{v0EC=ak_#tNQN<5
zc0a)Z1oPyHS6SX-sm5^)@=97LwJPuP-LzW2&zlA>gP_<wSPloZUk+m0HsV<OVzWWt
zk%BSLq3OBeuS4$4hv^>4B(Ed_6)$*!4{M%HqmUK}gBdhGU8Dvq@Pt?pm;;BzK%f%o
zLxEy=T00&XgUll|XF>|75}Sf0sXK5a6kqT3`E8ydzEd0xS~Exz=Zy-AtMKP&<q=^T
zE`oWYZa6zvS3dPli2NodPW9(yMJ`%AKT6Xyf@j2I!=<2yiVugg;YIPE;bM46OuOYD
zS%+!D(*!ReV4QoL=oiJzt+|;`68Qu{G%qTGyj#=Ygc#nM1+QxNY`qI$Sd?!o#5~)y
zE$?!v!{3uOWl%K`11f7|m1B<Ajkr>bZu2Xj#ESS^r@>p=j9Z_CtMtRt%x4io1{ZmO
zkI$r@WdswINPR~{%I!sIQj?Q(O?KN+m36oK+9Rr*s+;(ETJQqF&j@5F{y9-H3@s<>
z7X+73b6G^qN&HP(AVYKH_^(sn8wAgarV(>k_1j0XlrLk&qa&qoO#9793e>+vQ;s2=
zzuF|Xo<t=J8LD5VzE=n&N4b^UG;CY6hb6^{)yyJSpAQ3<)8m*LHy#2LAz4X)$x1RL
zLjwLY0zdvgMf0EjI5%r9@%u(JKOy$JjheE38AbG_9n)7!XSh&AKS8IDis+$~uh)lq
z3X!fVJy$08Z-n|2JF2lXPuQ6c?_dz`1+i~ec8)B+uBXekhc>yFsC$UoD}KGJ_I!NA
z2oF1jWw$X&MnoKEe7`t-ze|S24dS)k&EGG+S51h4mx%8?90*58=YUnbJz5O!X`hW&
zLhZ#=hxPeyNt#vysh~*J8&OcFu!-v29<ovzipF9R>FJ@TJ}@Z!q^1cCzD-xdcUMH|
z$|G~hyR>J-<@RH9ykHH*+FY*Ar~<XoJVp00X>0kO=_D<G$3$tuXCBU$Pm(0P<|&}%
zoO-43Qf~5S{p~>MKa?u7<>K+0Kce1T@!FUPwu;kZW(a9Xd!B=icWCW<1d;G9tF4pN
z7eYWUj3`@f|LqAz+M^Rh8lNgE_D+HOME%|xIQ5;~>y>>|;;7hpL;J(NmT+0rmO-bv
zkQ33&XeWxQNH66%w9yzYKdwD{?=xV!xX<7m(b1b9brqt@ipW*KX<`2{P+9T!I5C?q
z`M)|4NkoOg8Ew{sA3-V=J7(Yy>Ys-M4jZ=NA-{|h3;Gr$NP(%!h!*bBHIo!>l0mHK
zUvdrIyhcpP_|a>t^Uv)^u;=2pyq8AxbRxc2$wZP+@$*F-mQ0_eXByDQ@y8gwM1h~t
z>K;voh7WN@C}yqGA4>3Zo5#1-%iU4ed=4F&vRx=1U-o`X66S=c#1);c_8hE-tHk>U
zjc`_ce(=u~GDV%mr5K6&@S{H;qE=8}0rA%p6Fu6e$5T3LUR1Y;POIk)%tJAhhfKgf
zC2p-^{^PrXSxAP`-8Mh{cZJK@W$76tDrC^VdWNo}EwTwp3GxYw2`mKH5;POE5m*V<
z5cml;5)2Y-CZM+}--3XOu-m$VUhd($s4qw`{%GQYn$qFOZ6YNoKZwmxvaj3OGid47
z^KB?TOmJCV9Hs?N6Hr0LUnLmN^ZYn9Wr(D3l&i<1@c_h6(eMWZXT;f`<R+0_IccAN
TD$)*J4iAg!Lkq;fA;*6K>YB~h

diff --git a/tmw.py b/tmw.py
index c2f728e..dfb7d1e 100644
--- a/tmw.py
+++ b/tmw.py
@@ -681,6 +681,34 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder):
     print("Done.")
 
 
+################################
+# complexAverageTopicScores    #
+################################
+
+def calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder):
+    """Function to calculate average topic scores based on the mastermatrix."""
+    print("\nLaunched calculate_complexAverageTopicScores.")
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    with open(mastermatrixfile, "r") as infile:
+        mastermatrix = pd.DataFrame.from_csv(infile, header=0, sep=",")
+    ## Calculate average topic scores for each target category 
+    grouped = mastermatrix.groupby(targets, axis=0)
+    avg_topicscores = grouped.agg(np.mean)
+    if "year" not in targets:
+        avg_topicscores = avg_topicscores.drop(["year"], axis=1)
+    if "binID" not in targets:
+        avg_topicscores = avg_topicscores.drop(["binID"], axis=1)
+    #print(avg_topicscores)
+    ## Save grouped averages to CSV file for visualization.
+    identifierstring = '+'.join(map(str, targets))
+    resultfilename = "complex-avgtopicscores_by-"+identifierstring+".csv"
+    resultfilepath = outfolder+resultfilename
+    ## TODO: Some reformatting here, or adapt make_heatmaps.
+    avg_topicscores.to_csv(resultfilepath, sep=",", encoding="utf-8")
+    print("Done. Saved average topic scores for: "+identifierstring)    
+
+
 
 #################################
 # save_firstWords               #
diff --git a/tmw_config.py b/tmw_config.py
index 2978ca2..1b84ede 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -136,6 +136,13 @@
 #           "idno", "segmentID", "narration", "protagonist-policier", "binID"] 
 #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
 
+### calculate_complexAverageTopicScores
+### Based on the mastermatrix, calculates average topic scores for two target categories at once.
+mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
+outfolder = wdir+"7_aggregates/"
+targets = ["subgenre", "binID"] # 2 targets to combine
+tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
+
 ### save_firstWords
 ### Saves the first words of each topic to a separate file.
 topicWordFile = wdir+"6_mallet/topics-with-words.csv"
@@ -250,13 +257,6 @@
 #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
 
 
-
-
-################################
-###  OTHER / OBSOLETE / DEV  ###
-################################
-
-
 ### simpleProgression ###
 ### Creates a lineplot of topic development over textual progression.
 averageDataset = wdir+"/7_aggregates/avgtopicscores_by-binID.csv" 
@@ -273,6 +273,25 @@
 
 
 
+################################
+###  OTHER / OBSOLETE / DEV  ###
+################################
+
+
+### complexProgression ###
+### Creates a lineplot of topic development over textual progression, 
+### but does so separatedly for different target categories.
+averageDataset = wdir+"/7_aggregates/complex-avgtopicscores_by*.csv" 
+firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
+outfolder = wdir+"/8_visuals/progression/complex/"
+numberOfTopics = 50 # must be actual number of topics modeled.
+fontscale = 1.0
+dpi = 300
+height = 0 # for lineplot; 0=automatic
+mode = "all" # all|sel 
+topics = ["25", "44", "12"] # if mode="sel": list of topics
+tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+
 
 
 

From ee0eeff3018091398f159be513783b6d9bd5c5c1 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Fri, 4 Sep 2015 18:32:36 +0200
Subject: [PATCH 37/56] Started work on complexProgression

---
 __pycache__/tmw.cpython-34.pyc | Bin 43742 -> 47229 bytes
 tmw.py                         | 139 +++++++++++++++++++++------------
 tmw_config.py                  |  22 ++----
 3 files changed, 94 insertions(+), 67 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 2d0224f702140357fbc0b5e5ed375c9d3a83d8d3..182978792a8bc9a6be719c150f7c9ead6e91264d 100644
GIT binary patch
delta 1616
zcmaJ=eN2^A9R5A$-uqs-ycf~<P(jrB0pym3E6qePaLH5<@FJ0B#JMr?!qxX)0p$`Z
zV2G&DNkFWu{79Lz*;#F^^+&b2bj^R7A4BKo{3>zPv{vcXIj^~~**e?#z0Wzn_dLJz
zKJW9pc$z)@g-yI`dOe$t-}aS{23?~e!=Z4tnxRbRv59StrdnztDqBpvj(EL|^NFt~
zzQM)?#2bi5Y`lbcBk{LvTu8i$c#(}~65mLCsf}MCzKM8=jb{<xOuW>_FA^u>pg<9o
z6%lVH-eT_;Pqh=R6j(~tJ}UDO_!i=C+sP8*ZN#_Qc$q$3oY5{*o%6GhDMrrmixczm
zn|KUGK&#zkou!CDRbEm%l^`BP+(XSkqd_&Qv`JO56dp$5IQ@PzQz>e66ZcZYO-VeK
zk{Y8Z!YC4@P_#`ooFdW0UBnII!-&VJBMf21r4s6lA^;bQ<}Mb;Un*r*iA>JP1jdQ{
z+{74|NOp`wEIg3ax$ipggJcUn1^$ws<(&hV+_n(FaJhZaN(U0<V8JYgX)?Vi2rQ7j
z#Sx%LCit!Z%jF*>gA6tD`YZQ=7J2^F9H3W@UC|2clLubA%5Y4!m3tY!m)#Y0z;80v
z-^TDj_EZXn(Q-=Fx4?9HcXhYsY^{C>tdPyYqYO<V)A}1h`AjI5p-pD3O9FODU#+@c
zi&JFppCB9R;u-p6=lT=C*RrgkfZ@38Z!C1+oP4lRX?sU*+}z6Glt%0O`aAZweE>Ww
zbJ}+TdGg0?&oTI9^o}nXHpuTf4Pcc#*>wtNlDoyr4s^*Y-ES})5xKkPh`rSbl6CaK
zWt|3*II?}_<cXF3@XC-skQ1z46Y$qBv4WMBKNMOOtXav+!{tkZO9H|0SrNTsEUwCr
zf@>lAcJ_!fyW+&6&f3%mE@g72oNmyDnH845JnSDrCN6cRxvX=#l-bprHtC5MhE@x#
zs`0N;wSE>SyC$VvQ->;?idYY~nqD<?)C^UT=*S&~g(9^han2*>D9<#yNVz8B(w2jI
zR4Gfj^`gok<xZV5?A$Wtw8ByqiZh}_*E_*s)+xoBZ<VN+;kHb%GbhC}U{qScb!$Qy
zZZTV=Mh$rFrC&^INSbX6Rv!3&!C<JHnu?Mf9*jUL)6u#irO0M3>%3Ov&s{ytx**2Y
zjY)qj%zv{!7AC<ESG&`v-_*)PYic!g39XK;%r^??)={meMNo~$Xf-2ivB>LrKJ!i8
zP~2hf9HxkaR0oC~6m}AK>gB)f8b$T9b%W-vNE63;CXHX)K+9@8Ht!>Gb%jS#DuK0#
zPxUVMSeMkTtjqelntNx!Exzg<iyPu{Z{he66;C*8npWp&bz5?JV2Z-s<Bd3%2i7fF
z-tL3j>QOwUh8*Y{0IM+M7w_$1Xca|!FS2!pm>QZOeIMQgcF3%c6T$WvqNMh9@%g^3
zXcu$$|2q7TW>=F)X&wjbj+}SE3k-^agB>^_E*u<%d*b(l*Hygm$^4WvLw}Xm5e<`#
zfpuS9A){1g9Xf*1+Kbi=9klk=Egf#_a7X-fI5}oe!+Sd17jgZQaa7Fc&qj)<@6Ssc
hX_{udnP`qMlg%kMidQJzj8QuYW{Sy7Z(O|T{RfV5%sK!7

delta 857
zcmYjPeMpsO6hFW7-g~9%bG_!SCR}xaR_m{U5je$Vxma`gT`5x8@>w#`)vmf*>y~TI
zaL$Z+#-ds&%%E>|r?A==1i`jGNX8%xY9+dATxqbi1-0#cU;3lx;hgj1oZtC)IM36E
zeCrp^nN|UR<#+d27J5Mw^xUy*Tx3h@BWon5+v=#IkBRE?NWV{dg^OPy9VNZW#rdQ+
zldg4f0cj!)YA&U^Qqo&Uf8hGdT>n;TE~l0Xs;eOA80l?ptdjJ0(j6|Ya+0g(e|X=S
zrmTS-L(=HUD@3XpSS<6}h+;nKgk2z!nv$q3NNvd!19h_F-IhRFQ7pj}Eee?vOM;n6
zq=R;VVJB*vm_W1K<OI9ROtWjzGV@|_t;C97#_TMo7yFTdWLt7L)z%7OkDt?#1V78w
z*FLjwP1Y|P0-i`p$qm3GzrP9K1&J@Ow~#KS75R)@>8{!cl*m8TF<^zXue<}Sm71D3
zW0Nec`wQ491*?mJ1Jbdk3pgU)`nwE~dv668KgpwpDDbDe`*t_uU-`SyFf!!Rri;Kr
zDQo`RsX5v56sVVpji;G+c+A*v*qn~s#(UD&n#AalL+!J``<%c!lNKG|ogVYeNVeRJ
zh8SPS!!17o-^+!q<%}Q1zrE7JxUAmM2~5kcA9XQ&vZ3pc^B~VBhk*t1WzTM)Mi%rg
zWHiV-edih7lD)?VM9qo*Ia0p&7_dtoo7XM$%i?|O85hjp{#WHvPbEzGQ7_;1%BrK|
z;J8<Ej~`?N&Gq37d*(sYcPfkj^P1hQ*)n$OH*mVo9ByA?o(%nrIX3A5Lgwp{KW66o
z><?)Rk-MSDq<s5T0GKkDr8ivd1e2dFa0Yw`I+y1n5;u`ElZczzvr9A9IV|O{EI$I)
z3@FY-WEw>CtL6OJlfb0u7!4;**`7$;QPbwg=v)k%E2BloGJ$h%WacQPLMlzAsIYoT
QCE7MsWhhpG<d6#d4`;IdH~;_u

diff --git a/tmw.py b/tmw.py
index dfb7d1e..847f721 100644
--- a/tmw.py
+++ b/tmw.py
@@ -704,7 +704,6 @@ def calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder):
     identifierstring = '+'.join(map(str, targets))
     resultfilename = "complex-avgtopicscores_by-"+identifierstring+".csv"
     resultfilepath = outfolder+resultfilename
-    ## TODO: Some reformatting here, or adapt make_heatmaps.
     avg_topicscores.to_csv(resultfilepath, sep=",", encoding="utf-8")
     print("Done. Saved average topic scores for: "+identifierstring)    
 
@@ -1346,20 +1345,8 @@ def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem,
 
 
 
-
-
-
-
-
-
-##################################################################
-###    OTHER / OBSOLETE / DEV                                  ###
-##################################################################
-
-
-
 ###########################
-## textual Progression  ###
+## simple progression   ###
 ###########################
 
 
@@ -1491,32 +1478,27 @@ def simpleProgression(averageDataset, firstWordsFile, outfolder,
     else: 
         print("Please select a valid value for 'mode'.")
     print("Done.")
-    
-    
 
 
 
 
 
 
-###########################
+##################################################################
+###    OTHER / OBSOLETE / DEV                                  ###
+##################################################################
 
 
-def get_overTime_firstWords(firstWordsFile):
-    """Function to load list of top topic words into dataframe."""
-    #print("  Getting firstWords.")
-    with open(firstWordsFile, "r") as infile: 
-        firstWords = pd.read_csv(infile, header=None)
-        firstWords.drop(0, axis=1, inplace=True)
-        firstWords.rename(columns={1:"topicwords"}, inplace=True)
-        firstWords.index = firstWords.index.astype(np.int64)        
-        #print(firstWords)
-        return(firstWords)
+###########################
+## complex progression  ###        IN DEVELOPMENT
+###########################
 
-def get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics): 
+
+def get_selComplexProgression_dataToPlot(averageDataset, firstWordsFile, 
+                               entriesShown, topics): 
     """Function to build a dataframe with all data necessary for plotting."""
-    #print("  Getting data to plot.")
-    with open(average, "r") as infile:
+    print("- getting data to plot...")
+    with open(averageDataset, "r") as infile:
         allScores = pd.DataFrame.from_csv(infile, sep=",")
         allScores = allScores.T        
         #print(allScores.head())
@@ -1524,21 +1506,23 @@ def get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics):
         someScores = allScores.loc[topics,:]
         someScores.index = someScores.index.astype(np.int64)        
         ## Add information about the firstWords of topics
-        firstWords = get_overTime_firstWords(firstWordsFile)
+        firstWords = get_progression_firstWords(firstWordsFile)
         dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner")
         dataToPlot = dataToPlot.set_index("topicwords")
         dataToPlot = dataToPlot.T
         #print(dataToPlot)
         return dataToPlot
-
-def create_overTime_lineplot(dataToPlot, outfolder, fontscale, topics, dpi, height):
+    
+    
+def create_selComplexProgression_lineplot(dataToPlot, outfolder, fontscale, 
+                                topics, dpi, height):
     """This function does the actual plotting and saving to disk."""
-    print("  Creating lineplot for selected topics.")
+    print("- creating the plot...")
     ## Plot the selected data
     dataToPlot.plot(kind="line", lw=3, marker="o")
-    plt.title("Entwicklung der Topic Scores", fontsize=20)
+    plt.title("Entwicklung ausgewählter Topics über den Textverlauf", fontsize=20)
     plt.ylabel("Topic scores (absolut)", fontsize=16)
-    plt.xlabel("Jahrzehnte", fontsize=16)
+    plt.xlabel("Textabschnitte", fontsize=16)
     plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
     if height != 0:
         plt.ylim((0.000,height))
@@ -1548,33 +1532,89 @@ def create_overTime_lineplot(dataToPlot, outfolder, fontscale, topics, dpi, heig
         os.makedirs(outfolder)
     ## Format the topic information for display
     topicsLabel = "-".join(str(topic) for topic in topics)
-    figure_filename = outfolder+"lineplot-"+topicsLabel+".png"
+    figure_filename = outfolder+"sel_"+topicsLabel+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
-def create_overTime_areaplot(dataToPlot, outfolder, fontscale, topics, dpi):
+def get_allComplexProgression_dataToPlot(averageDataset, firstWordsFile, 
+                               entriesShown, topic): 
+    """Function to build a dataframe with all data necessary for plotting."""
+    print("- getting data to plot...")
+    with open(averageDataset, "r") as infile:
+        allScores = pd.DataFrame.from_csv(infile, sep=",", index_col=None)
+        #allScores = allScores.T
+        print(allScores)
+        groupedScores = allScores.groupby("binID").groups
+        print(groupedScores)
+        ## Select the data for current topics
+        #someScores = allScores.loc[topic,:]
+        #someScores.index = someScores.index.astype(np.int64)
+        #dataToPlot = someScores
+        #print(dataToPlot)
+        #return dataToPlot
+        
+# TODO: Make sure this is only read once and then select when plotting.
+    
+    
+def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale, 
+                                firstWordsFile, topic, dpi, height):
     """This function does the actual plotting and saving to disk."""
-    print("  Creating areaplot for selected topics.")
-    ## Turn absolute data into percentages.
-    dataToPlot = dataToPlot.apply(lambda c: c / c.sum() * 100, axis=1)
+    print("- creating the plot for topic " + topic)
+    ## Get the first words info for the topic
+    firstWords = get_progression_firstWords(firstWordsFile)
+    topicFirstWords = firstWords.iloc[int(topic),0]
+    #print(topicFirstWords)
     ## Plot the selected data
-    dataToPlot.plot(kind="area")
-    plt.title("Entwicklung der Topic Scores", fontsize=20)
-    plt.ylabel("Topic scores (anteilig zueinander)", fontsize=16)
-    plt.xlabel("Jahrzehnte", fontsize=16)
-    plt.ylim((0,100))
+    dataToPlot.plot(kind="line", lw=3, marker="o")
+    plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20)
+    plt.ylabel("Topic scores (absolut)", fontsize=16)
+    plt.xlabel("Textabschnitte", fontsize=16)
     plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
+    if height != 0:
+        plt.ylim((0.000,height))
 
     ## Saving the plot to disk.
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
     ## Format the topic information for display
-    topicsLabel = "-".join(str(topic) for topic in topics)
-    figure_filename = outfolder+"areaplot-"+topicsLabel+".png"
+    topicsLabel = str(topic)
+    figure_filename = outfolder+"all_"+topicsLabel+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
 
+def complexProgression(averageDataset, firstWordsFile, outfolder, 
+                           numberOfTopics, 
+                           fontscale, dpi, height, mode, topics):
+    """Function to plot topic development over textual progression."""
+    print("Launched textualProgression.")
+    if mode == "sel": 
+        entriesShown = numberOfTopics
+        dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, 
+                                                      firstWordsFile, 
+                                                      entriesShown, 
+                                                      topics)
+        create_selSimpleProgression_lineplot(dataToPlot, outfolder, 
+                                          fontscale, topics, 
+                                          dpi, height)
+    elif mode == "all": 
+        entriesShown = numberOfTopics
+        topics = list(range(0, numberOfTopics))
+        for topic in topics:
+            topic = str(topic)
+            dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, 
+                                                             firstWordsFile, 
+                                                             entriesShown, 
+                                                             topic)
+            #create_allComplexProgression_lineplot(dataToPlot, outfolder, 
+            #                                     fontscale, firstWordsFile, 
+            #                                     topic, dpi, height)
+    else: 
+        print("Please select a valid value for 'mode'.")
+    print("Done.")
+    
+    
+
 
 ###########################
 ## show_segment         ###
@@ -1590,9 +1630,8 @@ def show_segment(wdir,segmentID, outfolder):
 
 
 
-
 ###########################
-## itemPCA              ###
+## itemPCA              ###            IN DEVELOPMENT
 ###########################
 
 from sklearn.decomposition import PCA
diff --git a/tmw_config.py b/tmw_config.py
index 1b84ede..41d08c2 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -141,7 +141,7 @@
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
 targets = ["subgenre", "binID"] # 2 targets to combine
-tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
+#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
 
 ### save_firstWords
 ### Saves the first words of each topic to a separate file.
@@ -268,7 +268,7 @@
 height = 0 # for lineplot; 0=automatic
 mode = "sel" # all|sel 
 topics = ["25", "44", "12"] # if mode="sel": list of topics
-tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
 
 
 
@@ -281,10 +281,10 @@
 ### complexProgression ###
 ### Creates a lineplot of topic development over textual progression, 
 ### but does so separatedly for different target categories.
-averageDataset = wdir+"/7_aggregates/complex-avgtopicscores_by*.csv" 
+averageDataset = wdir+"/7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" 
 firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
 outfolder = wdir+"/8_visuals/progression/complex/"
-numberOfTopics = 50 # must be actual number of topics modeled.
+numberOfTopics = 1 # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
@@ -310,16 +310,4 @@
 targetCategories = ["subgenre"] # list
 methods=["weighted"] # list
 metrics=["cosine"] # list
-#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)
-
-
-
-
-### 6b - create_topicscores_lineplot
-inpath = wdir + "7_aggregates/*-lp.csv"  # narrow down as needed
-outfolder = wdir + "8_visuals/lineplots/"
-topicwordfile = wdir + "6_mallet/topics-with-words.csv"
-dpi = 300
-height = 0.050
-genres = ["detection","noir"] # User: set depending on metadata. Available: noir, detection, criminel, experim., archq., blanche, neopl., susp.
-#tmw.create_topicscores_lineplot(inpath,outfolder,topicwordfile,dpi,height,genres)
+#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)
\ No newline at end of file

From da860ae0796b20c98fa294b44783aed90c6e3575 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sat, 5 Sep 2015 17:35:55 +0200
Subject: [PATCH 38/56] Fixed extra slashes; number_of_topics

---
 tmw_config.py | 76 +++++++++++++++++++++++++--------------------------
 1 file changed, 38 insertions(+), 38 deletions(-)

diff --git a/tmw_config.py b/tmw_config.py
index 41d08c2..d749995 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -17,7 +17,7 @@
 # 3. Posprocessing Data
 # 4. Basic Visualizations
 # 5. Advanced Visualizations
-# 6. Other / Obsolete
+# 6. Other / Obsolete / in development
 
 import tmw
 #print(help(topmod))
@@ -122,7 +122,7 @@
 mastermatrixfile = "mastermatrix.csv"
 metadatafile = wdir+"/metadata.csv"
 topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
-number_of_topics = 50
+number_of_topics = num_topics
 useBins = True # True|False
 binDataFile = wdir+"7_aggregates/segs-and-bins.csv"
 #tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile)
@@ -158,10 +158,10 @@
 
 ### make_wordle_from_mallet
 ### Creates a wordle for each topic.
-word_weights_file = wdir + "6_mallet/" + "word-weights.txt"
-topics = 50
+word_weights_file = wdir+"6_mallet/" + "word-weights.txt"
+topics = num_topics
 words = 40
-outfolder = wdir + "8_visuals/wordles/"
+outfolder = wdir+"8_visuals/wordles/"
 font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
 dpi = 300
 #tmw.make_wordle_from_mallet(word_weights_file,topics,words,outfolder,font_path,dpi)
@@ -178,11 +178,11 @@
 
 ### plot_topTopics
 ### For each item from a category, creates a barchart of the top topics.
-averageDatasets = wdir+"/7_aggregates/avg*.csv" 
-firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
+averageDatasets = wdir+"7_aggregates/avg*.csv" 
+firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 targetCategories = ["author", "subgenre", "binID"] 
 topTopicsShown = 30 
-numberOfTopics = 50 
+numberOfTopics = num_topics 
 fontscale = 1.0
 height = 0 # 0=automatic and variable
 dpi = 300
@@ -191,10 +191,10 @@
 
 ### plot_topItems ###
 ### For each topic, creates a barchart with top items from a category. 
-averageDatasets = wdir+"/7_aggregates/avg*.csv" 
-outfolder = wdir+"/8_visuals/topItems/"
-firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-numberOfTopics = 50 # must be actual number of topics modeled. 
+averageDatasets = wdir+"7_aggregates/avg*.csv" 
+outfolder = wdir+"8_visuals/topItems/"
+firstWordsFile = wdir+"7_aggregates/firstWords.csv"
+numberOfTopics = num_topics # must be actual number of topics modeled. 
 targetCategories = ["author", "subgenre", "binID"] 
 topItemsShown = 30 
 fontscale = 0.8
@@ -210,23 +210,23 @@
 
 ### plot_distinctiveness_heatmap ###
 ### For each category, make a heatmap of most distinctive topics. 
-averageDatasets = wdir+"/7_aggregates/avg*.csv" 
-firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-outfolder = wdir+"/8_visuals/distinctiveness/"
+averageDatasets = wdir+"7_aggregates/avg*.csv" 
+firstWordsFile = wdir+"7_aggregates/firstWords.csv"
+outfolder = wdir+"8_visuals/distinctiveness/"
 targetCategories = ["author", "subgenre", "binID"] 
 # one or several: "author-name", "decade", "subgenre", "gender", "idno", "title"
-numberOfTopics = 50 # must be actual number of topics modeled.
+numberOfTopics = num_topics # must be actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0
 dpi = 300
 #tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
 
 ### plot_topicsOverTime ###
-### Creates lineplots or areaplots for topic development over time.
-averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" 
-firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-outfolder = wdir+"/8_visuals/overTime/"
-numberOfTopics = 50 # must be actual number of topics modeled.
+###     
+averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" 
+firstWordsFile = wdir+"7_aggregates/firstWords.csv"
+outfolder = wdir+"8_visuals/overTime/"
+numberOfTopics = num_topics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
@@ -236,9 +236,9 @@
 
 ### topicClustering ###
 # This function will create a dendrogram grouping topics based on their word weight similarity.
-wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt"
-outfolder = wdir + "8_visuals/clustering/"
-topicsToUse = 50 # = all topics modeled
+wordWeightsFile = wdir+"6_mallet/"+"word-weights.txt"
+outfolder = wdir+"8_visuals/clustering/"
+topicsToUse = num_topics # = all topics modeled
 wordsPerTopic = 50
 methods=["weighted"] # list
 metrics=["cosine"] # list
@@ -246,10 +246,10 @@
 
 ### itemClustering ###
 # This function creates a dendrogram of items in a category (authors, titles).
-averageDatasets = wdir+"/7_aggregates/avg*title.csv" 
+averageDatasets = wdir+"7_aggregates/avg*title.csv" 
 figsize = (10,80) # width,height
-outfolder = wdir + "8_visuals/clustering/"
-topicsPerItem = 50
+outfolder = wdir+"8_visuals/clustering/"
+topicsPerItem = num_topics
 sortingCriterium = "std" # std|mean
 targetCategories = ["title"] # list
 methods=["weighted"] # list
@@ -259,10 +259,10 @@
 
 ### simpleProgression ###
 ### Creates a lineplot of topic development over textual progression.
-averageDataset = wdir+"/7_aggregates/avgtopicscores_by-binID.csv" 
-firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-outfolder = wdir+"/8_visuals/progression/simple/"
-numberOfTopics = 50 # must be actual number of topics modeled.
+averageDataset = wdir+"7_aggregates/avgtopicscores_by-binID.csv" 
+firstWordsFile = wdir+"7_aggregates/firstWords.csv"
+outfolder = wdir+"8_visuals/progression/simple/"
+numberOfTopics = num_topics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
@@ -281,10 +281,10 @@
 ### complexProgression ###
 ### Creates a lineplot of topic development over textual progression, 
 ### but does so separatedly for different target categories.
-averageDataset = wdir+"/7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" 
-firstWordsFile = wdir+"/7_aggregates/firstWords.csv"
-outfolder = wdir+"/8_visuals/progression/complex/"
-numberOfTopics = 1 # must be actual number of topics modeled.
+averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" 
+firstWordsFile = wdir+"7_aggregates/firstWords.csv"
+outfolder = wdir+"8_visuals/progression/complex/"
+numberOfTopics = num_topics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
@@ -297,14 +297,14 @@
 
 ### 5c show segment
 ## To read a specific segment, better than looking in the folder.
-segmentID = "rf0166§0118"
+segmentID = "rf0166§0118" # indicate here, manually
 outfolder = wdir+"/9_sel-segs/"
 #tmw.show_segment(wdir,segmentID, outfolder)
 
 ### itemPCA ### CURRENTLY NOT WORKING
-averageDatasets = wdir+"/7_aggregates/avg*.csv" 
+averageDatasets = wdir+"7_aggregates/avg*.csv" 
 figsize = (10,10) # width,height
-outfolder = wdir + "8_visuals/clustering/"
+outfolder = wdir+"8_visuals/clustering/"
 topicsPerItem = 50
 sortingCriterium = "std" # std|mean
 targetCategories = ["subgenre"] # list

From 22b24daa187d297c66a1ff3f606018f05812b6e6 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 10:42:52 +0200
Subject: [PATCH 39/56] Fixed issue: https://github.com/cligs/tmw/issues/10

---
 tmw.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tmw.py b/tmw.py
index 847f721..a16e0ea 100644
--- a/tmw.py
+++ b/tmw.py
@@ -216,12 +216,12 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs
             # segment contains words assigned to the current segment
             segment = []
 
-            # go thru paragraphs one by one
+            # go through paragraphs one by one
             for line in infile:
                 text = line
-                # remove special characters and space-chains
-                text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text)
-                text = re.sub("-", " ", text)
+                # (optional) remove punctuation, special characters and space-chains
+                #text = re.sub("[,;\.:!?¿\(\)—-]", " ", text)
+                text = re.sub("[\t\r\n\v\f]", " ", text)
                 text = re.sub("[ ]{1,9}", " ", text)
 
                 # tokenize text

From 470ddb161609ad3f91144f541df1313cdf6570e6 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 10:49:29 +0200
Subject: [PATCH 40/56] Fixed issue with ambiguous lemmas,
 https://github.com/cligs/tmw/issues/13

---
 tmw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index a16e0ea..b620786 100644
--- a/tmw.py
+++ b/tmw.py
@@ -448,7 +448,7 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors):
                         elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())
                     elif mode == "esN":
-                        if "|" in lemma:
+                        if "|" in lemma and "NC" in pos:
                             lemmata.append(token.lower())
                         elif "NC" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())

From a8d0a31555f2aa2c04bb10efcb320645492c072a Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 10:58:36 +0200
Subject: [PATCH 41/56] General settings at the beginning:
 https://github.com/cligs/tmw/issues/12

---
 tmw_config.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tmw_config.py b/tmw_config.py
index d749995..1ed87c0 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -12,6 +12,7 @@
 # For information on requirements and usage, see the README file.
 
 # This config file is structured as follows: 
+# 0. General Settings
 # 1. Preprocessing Texts
 # 2. Topic Modeling
 # 3. Posprocessing Data
@@ -22,8 +23,19 @@
 import tmw
 #print(help(topmod))
 
-### Set the general working directory.
+
+################################
+### GENERAL SETTINGS         ###
+################################
+
+### The following settings depend on the system used.
+### Path to the working directory.
 wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash.
+### Path to the TreeTagger file (language-dependent!)
+tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-spanish"
+### Path to Mallet installation directory
+mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
+
 
 ################################
 ###    PREPROCESSING TEXTS   ###
@@ -64,7 +76,7 @@
 ### Perform lemmatization and POS tagging.
 infolder = wdir + "3_tokens/"
 outfolder = wdir + "4_tagged/"
-tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french"
+tagger = tagger
 #tmw.call_treetagger(infolder, outfolder, tagger) 
 
 ### make_lemmatext
@@ -83,7 +95,7 @@
 
 ### call_mallet_import
 ### Imports text data into the Mallet corpus format.
-mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
+mallet_path = mallet_path
 infolder = wdir + "5_lemmata/"
 outfolder = wdir + "6_mallet/" 
 outfile = outfolder + "corpus.mallet"
@@ -97,7 +109,7 @@
 ### num_iterations: How many times the model is improved. 
 ### num_top_words: Number of words to save and display for each topic.
 ### num_threads: Number of parallel processing threads to use. 
-mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
+mallet_path = mallet_path
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
 num_topics = "50" # string

From 1c598a6218210f2728ad9e757342afa6343c3684 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 11:54:44 +0200
Subject: [PATCH 42/56] Fixed issue re numOfTopics:
 https://github.com/cligs/tmw/issues/11

---
 __pycache__/tmw.cpython-34.pyc | Bin 47229 -> 44424 bytes
 tmw.py                         |  59 +++++++++++++++++----------------
 tmw_config.py                  |  57 ++++++++++++++++---------------
 3 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 182978792a8bc9a6be719c150f7c9ead6e91264d..cd15355dc422cef5d1e5baab998eea1fc4a3f51e 100644
GIT binary patch
delta 9520
zcmc&)33yahmVT$IQkBZSZ$*U)AtVI^!;YXV5;lQ=kPtvHR8p0sl1f#$RV)b!fq<gu
z$I?edKxOP!v|EM2N5j~y{b}RUiY=m!+Rue(<A@{EEsfnSGmdl4eMzc-z_@(hO!(ga
z-MZ^J_x@+O@A2*DhFjk?<o_iuK4#qV|EMTW0Oik0#GjGMlj#W>4(0#eI89O9p!gOj
zOBX2a4#mAe*{mo_7b<S5->CS?m8J@1X@vsrPQ|@Rwk=ZJn-%xXvRui77dmfImzs*>
z4KD$%9FT$glADspL0gQ{5UsQ+$|g`smlvWv#Q}g5Q!*h57o~)vo0YbBYN*H8Q$}OI
z)D%d-t5Qqi61hLPjWlaAHl{wG&}zBS8XprIw_I3BXmxJdH2~vqYkEnMF&)y4aS#uM
zAZAlr4Qs}Kx%9-+GnJm;eMS<h1k*AE<lvSJC)m_O8ASm7R4ua%mbXPI4OXQgO=$yO
zIp{WWO?vlF@GD$$XDFHNb|sXdwAmGRrn1SPY|l}srmdgSkSQy(6n8dF08mM?->_IJ
zU{{x9?}yaZ<e;Z6SnTq<i>tl<;INwDYWyz84q0fo6~P>wZtD-zu+}zqauNwFk{MD7
zG;=6WMV+0wqCeL;WF1f8uNI5BW(h+ALkhzje8cuHrXnih)_#LvH6H30j81|WNCDBJ
z4zgDRRN-y8N5PF#`tMC!M;)~&;i>bi6@#hl1ks6~_s@mJnCM7{ZEAsI7r-`rI&T<k
z#j|-cpcY3B$b+5gq5<DRHu91zm}c?%YrQ^CPz>j)ndloh9M<9e1D*YLQ$>x5fdA6y
z$)bmkDfk>8O9p;Dux{W`W1CrNF!plAW^A`AA&b&xRoo`I*Cr!wE*KF#!O#{<lo^S?
zD-e?`Mt7XjmZUVA6+z`#r7cluw8(ONxZD<}G{~lgIAxRMBCNC+SH>7IFlgxPh&h61
zz@7M0=KHrnLk$)qrWFqB_z~i?85DP_(r8dbOph+4*UJVs5mVRkWka_v$;t(~gnRTX
zJLCs!gW6R%2A~}?om-$4_c@2aaXjwak+Pl6NQ(+K`@A7Bj>=*hHVv+TnRsk)A<V+h
z22aR&n5&0yzl0`Nqo>N}X=-wXJZnRHaPpAyj4a52f#5I<fONt<$R|S;s`{%TTOs3T
zG^)JSTpI9uhQ&B7B95m*+%P#S{Z1-6ty*l8%jffis+wJ)dT|%7DRyK@zA@Rg*ifL+
z<FC@0dk-Eg9%TPB5U1D(6hjQe(Z3W(GMEj645=WFs(&l~#9-@lQE5Dg$-&foxpI;?
z%qv*E{^nI7Cbe#LvHn#}^J>e{Ieac4t{L78_u!e~Mer3SkI08(IC{heGL9o7<|R7}
zB#4;>ffz%sK^#{zMiv-s&rwC079DK#HjBWTpdfJ-@jSyGTr@g2Yd#n6CD6<P;q~}K
zqLORM@s82MM%8kCwD7px><+YOZ^$F$Eabcuv4UGoUSFU_RB_SRN^87`e;I8v*Km6!
zCXUIC5>HYYCyf~jLG+E;3WfOPm`U)dI(F<(NT197PBFa2P|olgt{vw{`Xv|TGAsm5
z`^Y#)`g>f=BhZZG*)$`axj2RIj~fc}Fy`t!IHwN1Itr@fgglqite^049(Ru6QwFJ4
zRFg;4d7M^pmaAT3_!EJa)Z_}1n#3sKT`LcQ6%a2oe9mwNcZ?qjEAZ*@*|x8!qJ}nJ
z{;_+p&@+{u;1oPJJ|A-@Oc*OS`vs~Ke`7e$@CAdE(4<Q!l|0F{e<yGn#h18a!sxWM
ze5lb>!-tw~q+|V01#_#i)bkU{!K}}WbFZzQ#)42A0mV-?pn6wxuW}Bq{|7^_n(|aM
z%c`K~I&$mcYi|4!zqqy#I<enGyWu-UoiOnaFo@|B!5LPgOe`YQOf4Rl5Tm)h0-cj{
z;lFU^<ce6S$ctS22Yh7m00)cXjFGErmXM3KJQS4F7Bjiim-z3K18_GkoiYSs)mx^#
z3I&&4QYb{;o)ufBI%&mUOkLO!u_Dh1+{wUTWH2#AG30RT76v;*gkZwV?GCQhh2oNh
z>!Jy?B(Iz7!dvV0h#--|Uni{G7Q--D!Li#Ca8qeMsQ7TH6B5*ur4tOJ<k9Tmd03q?
zU25JpX=^g0klXqV6t@~2Mc;r^EX3<(jG7iEf)x4<9>NyMG1=#(^>5-IMm9QWB2pJ=
zCv1b_O@?ax?TkUwrGjx<d??WDt*HzM_w*jGA_j0TscJGS7}MJ&D7v`mdWO>sA7J6k
z@o)mGXO5-7yJzN!*}6Vq{iNa2EZv$(ZdhHx8js&iTU9>YY0~X2M!J!ZtJ>$$Ou>K%
z3Bbd%mZyH(OL2n@21BAj<f#L%Pk?LLDTxIPRsv^~$YWWpM4+lV5m?nM<ldz5DWnoO
zV|B4}x#BbiW>2v9=K-;6NOhlwF(^Lb>Gm^xj4#g4hba7Pb{@H;_&K-3e%w80VSE}{
zagao!Y(bQ$U(c}^%vbS@d(k{^TI>oQTFAe1aM8Sx@g=-MRLC15f4zvq2p*W1PKgY4
zkX}P>q=N?y=eh|Dr}3kC4YZka=MRAyYRh~-P;@(U!z}Z!xZg}1yI?+y#9J5S!vcJE
zYc@W=;82ym=`>oA#;4QiHTqdD9cT9mN*!sVQn}wnMEZ>CoJK97)=MO2HUuRdbP}K|
zn2Ki>CZ^A3?PI8`Voag%nGJRWS$ACdiDKQ?P(ztBT6bCDRdjQwHq9g{T8YsWg``wT
zMQM6P?~H~7ij7%@6oXi%-d2$Y)!`|155rv4<A9Ay)5&Y}v4nV=))5ulRi1w(m+S<Z
zHAu|x`rV$jB;jJNF30Z{<s`C-nuVfzsf+wiG8R=1qinRY(hk+?s>;V8Z!GuaIJhdA
zpghoAOpmuF=%_vRlo;<2ui|w}Mi1e$b0+Jmv(ag$W?vvACi2oITDn;?uMK%?8iQgA
z{(Q+bS6|1oUPT>A8W)$BFjR4OYi+<E3f8#D63q1;Z(V&zXNH4%i|O((1~1Kll{jr_
zA*@!LmW~47G9F#cu$sV`pe4FidxWdb!&=euK~c_)%sMSe_p?(eDXQb39uzlmTZmy5
zgXE!HP41JF>_%>2@q|lCFl1RGY{3c3Y|w(0%YJj6PC<D_)QLlDU#BCz0Ur#Ry%b1#
zkqDNZy(u9!%)!0bu{@pfzq^*dS06zKdH#|Vl%w^vj3fasqeT&wJ%YOFCr+YCk2fpi
zX$mf^53KR)eoRWRBCOrno3gU^Gc`F6Q;aEwEU+4~W%4H4ReRMv5L?5&>6WPo4?eTP
z-XUr7D)koiT<K+y;jflUbqq3@qJmP#v+|#4q@S(k(5$NJmf}`R@w)LPBi-$-reVz#
zMkJ+D!juwCJP9*OOkrqdSjpfAB|#AZE(RIMcW#`VIer+g5GI;cCz?DXd8Tsn`#N?e
znGN4^CBtDn<0^#pYEpGIR4?MWSMa2*Txw@nPvA@xYq;#-re2<jwOmKH)Jv!pW>puD
zkYY%wbZUK4lR(-2KWaupM~{b_tJ90->f6JKh17IK*QU!A!L(C#@%_V0`+Kh+3VXH<
zYNy8!9lgX0`&P;?LvGy{Ns)9y$_(N>i^$THIg>8W&5hJSEaa{=4D6uvEJTmxlw!6t
zdt`>96-9adp)sSx2JW+*fE-^hnRUyRv1BIhs=LY@v5lwdGT}S?vaSHOsP_6AfU&sI
zTUZpP;|;WQYm%F?hT?jUE7as_mMrWhAHL?z>mMeY<e6j~k*%9+0+Sl-L&5~J=meAA
zL=sGLi)pM=L-$>c4gJB6w>RVgs*g7$0F>iXjRPia;cm$K#p-O5L~`)<NFqrk<<#L>
z!$i`VQ{uD37%cM@!kxsNk6ho2HP(n@k}0;#tT-U*cb!faOeTw)SUTe+OnRj$E3?8x
z&VsLtN)LZ1meB?AgWcI`$0N}ZEYkC1-8Edu7-xKhB)vgXk7#Ku1d~HtD2}*`8zhM&
zH>_c9XmNa9BlpgJuAg5|-J}6*<}tT0M9}4xrUa*(f<<{?J_W87tWR3EKIHt2*`!Y#
z<Ut}iLa|GQ=Dm<8UEnP0q#1l`C<S_SWwBYh!ELy5N59PNw1O6My{lfddg}djY0}*x
zCW}l%BH*(-N()+9qtgt7dx5X+-&n7y^Y}e$o5jRV6Q2BM8y*nFBbZ5SR`ObYC1=>{
z3)v0gipQhIg-O~SZXLXyWKtKGBuP2=@g<(4`y5xb$0wuhP!PB+SOEL5WPP^!$KY{@
z+C)Qc$NN@|j+HsP)T;*HUscqje(dTZ2dgjkGSH)lf8vrfzI(Z(Z$SUcZGgCsI^Dyn
z&zz>KZ3s`(oo8rKuI6T6%jLs@j0N51+OrdvnyZ7QyJ#owXA#+42jYP>(S|K1eCklX
zK|G|szorcW^2p6@PmPOW>o2HlYm|ItQbrzFo(;e75W9GeeGCr~IO8t1YH846Tb7x}
z2p-tQAWi#j9<7`9yDe#OAAa6a3=d;&>$&cHL;B`D+`o@|hOp6ax?yzR@@2-`k7tzb
z7Vw63_36(r&t*nS?`QaeL00>?`s%usP$e%I(ns?lf6ASl3A%p3vxK9eG|p763tzJ2
zYSk>+GzMv*?{JB86MC)i7~bEWpTX4EOl)@g?c_u5@hHCCJ_26CnDqsqswM01hMWUD
zxo$pOlG%i`w7J;TF(7dpRce+}^5iamtJsdo8?GOa$Q`ez`699j?_#>TI1S<`w;jOt
z4f!w;AJ~xP2>bMlG7EWs)YFN~unk$n6FkFVhF_y)V-e`Ced@+6co&y#EP!XQbL08U
z$GL;b@C*S3u3vKbcX&sqE&U)DA7hA|i*FhyI+Nh9*wvW{$JDPn*TFT<@vcx$Mf{N=
z9FS!g?KMHo!uLkLfXiFGWP}Y5Zq6JMK6;rM%3-pPWWpOXDP@RjSe7MtZgU7MSbK9m
zysU1$`2dX3wdBSl1e)2$o*^iPaZMRRU*~cF{af;r<hexa=ThULgLz;}jaME_7p+ML
z6I`vQdHTU*=<IruC;bh>Zy96^eu_)d|5tM9X@>7}Xc6WfD_-JJGBAgAdY<cEU^s#Y
zZ*_Jb<k?#@=?ZGP%?Yom!){A}(wDi%YXm)Z%zL3nbvc6!(a&<-?-}HTawWNDH&N(t
zhb7QU!ltRdRs0f-j%rj?Z=8vNc!L$<4Ds|r)=K}fL7e<QjN+Mh*fQs^G?zs2*VPYp
zPyc=qyxTvdGEF7vzHb<Rop&uFjB5!O-rq!dKKGCgk-4M4p1CBD58pPHzLI#i^@lep
zM6U$=W_wmnZ<rSM@Em)2_--!k!J#`Q^@J>n*svY9?a0oNL5kYxl?`F|uA+ym+$)Bx
zcFf;7|0e|MlCYEcUjnu0j#1}!jsqV)x~~A<R_FiB0u8<Hp^P#~8IFGNanPOo4c?$~
z2D#@lN4Y3@a^bT$*4D|KNuu3SjQl~XJ68g;LA;}`c|e6LZGYEYB@x^2l^Mgo@|t0n
za={U3CVEd%72(j!oT<8sl0WRo<O{&drv$psz}SWXv%@<btLN<w9v<q|TdK^thyGH4
zQ$;ONCP&kIUlN56!G`&}?38v-+2w?H)y7>%VELOo_HBkm3@dng8BgBfpYIZoGYRjr
zT>i>QfjkNqk3Nm-cNf4xd~o;GaO(fTJ|pDrw#h7#>LP-4XV>R;rs0Xb1K~Ysm|Z`@
zFr9rdgSMe>3%znu31)oo{?cpTqTv@1>BpqiKipbZ?z<X3m)80rp4|7>CB4QfLwlL1
zAf<Xfy!B<aKw@iV`GPk{{le^#rx0PKnEk*EQq}c9I-LH2c39unZeL)B;!hXv-j6oJ
zNk762g+|NwnBK~LHf((`m;MvM?gyRlF&=wxq@9eIKUEv}({&s02kW+IWs^y1k5NKK
zOns>MDw7h5;;-0kM*fh!iTXz?Yp>yM^nW6h_E;rkR_LQPH&EGvfroaQ-{)07K=c0b
z9r`<7ErC8iR(k|}fs}V9aaoH$O+$Zzu`|;nFNk<>e+G=k7xs@deL^$isY4IbTZAo7
z6eKS*lQ*V^zKGG6RuG@z(@&g+d-3OLeuvEc&QgEPSnCaG){t=dgS7$C6ycx7@lu>|
zQ8HQM3Q>|mxq3KnipZ9zk8n`$p4X|R-X@Pf=neRTnz6aYC4xND1wxk|PPNhhDCmwB
zbl<RoneNl{&oBtYc@&4cN@SjOp5~?yBok7ZSoN{2p2xjJSG3ge7S5^WPn~ibPp+pg
zluQ{pnX-fUf@fPSY%GubtSoP5H}cO~Jn?IL^=vM-Fl=Vn#;}uNH-pM>l;IeG7DY$i
z+@ik^InK48GMr-we`NfQf5!3-B`{=T*^&MoGLN0gzjGJ{GvqN8GVrGoF@s?NLj}V!
zhH8e@3@r>7zIhZKR7XeUaa9L|U+mx>@<rJeE+NC640nN&a`BB+JdFQ+q@U+WZolxN
zB93ugZ^|o9@=qCI!vyb*Vd50Gb}@X&z*H9>GcYy98HUfW<ESl`sVDes#TWR{(LwY@
g>6N27P=}u!U6)!Mmll^nk1ehs&JY)ulol8NKUz(!Bme*a

delta 10270
zcmc&)33OCNy1sQg>1>^SPbbhJWN9EkkRSpMD3L`lAS7%9#!k8k9g^-)-C+sQuqxm?
z3j7KJE~B!@$Wu8j%8ZWs+?~Pa=(t{W#O0v4jtf3<JMaJMhIGRO26fJR&FPQ2b!+*n
z>if6qtb0TM@gI88XST%nl}{b5b>#`|JuTv6;qn;%28Qe}dd$+<skx_VYcy@naLv6!
zbJwWyd768r=ANO-BQ*Ca%{@z%M`~_Oa|^9`nl@*e=3cG2*QomGs(!83T%)zrYIABe
z;l4_9U#;HF(A?`Z_j*;HiO$oV8;<h)T6Kf2X`!J}MTs;nDw}+%zSK)ZN4(Y)t95AF
zYN1ul8%3X{Iz$#FrDcm$DoG2*uGKmc`Nb7<ds=0TRclJu+)4ConnNVhM`?p?Dd;EM
z7M>)HO46S!XiMRlemh{~+<}Fo-(7KiTw;8JZJvzd%I`#Z=y>`lAx6-!%)w4QQ}92F
zO(cp^At!T#2J93*1^i9m?^yl@7SZ9p*|fG#9zC2XMJ`ojIYnORw^?OE*h6cwt3^dH
zL+cQlJ4;)wYuDvzR|u^mUu()#mD!p*heZ@(Cd+u~EG1}uXi)BMK_BK7@Jg-rGI0qF
zviBC_Xo~&ZVd*@lVGa808Ul-*1+q6k$}&|~@AU<I^>Q|9<^YlaX@E=UDf>ZMonI_k
zX>Wc&m+hgx1@%JArK@{AAY3%M_l^wSV$Fz_o<`47*^kRkAy?C9y?bTX3SOm+YqY2|
zk<8nXZ5gP`>q142twLN!4;Bs-*U(3W<BJ;5Fb56wc#$R2gpHf02`hiIM4`wQa&xGr
z&yOPadTyd5H)6cL`QBzvKvtk?A~p4`5KHN%zE1mgu4ss2z$b&h7#If-ILI4*WnYJ-
zgHPMi&5UHr6>(ZHM(c>v+)-+$ztw3~@sQX-dPf2iq?8^imL=coZkyJTqAj#)lFJEN
zN3ymsMwJu8WqP4BBl>%N(N8+HlnfXW(JAnua3}w)0p2eepntDZTIs;NGfy~EP1vDp
z?sRRTuF0QHh{vKntasS7CYCl;YqDvpl`@D$JRURa>=zUIsC1YRS5l$#YO$R5I{S-f
z=w;`|w2i#0Ms%RH*&CF@xh%)gl76*fB0blyR9sBQ`i&HKQbqq9v6U|FUz26y)9fo8
zdLPyghh8X(m=f~7(0%<kh-~8SHSN|azt1x;-Wel78+pA(Q(W1ZH*nb*XC$<^nwver
zx>i?kf!snX%N;q24`8d25cDtf`07j+Zlgbxmx#l3xZExdhJGmjK%d&<_=@LhjATp)
z1(&G+HDiiAfRW?8zSg!N$R-_lj`y{-)CFm1#U<i;T2;|1w$WD=W#Sm+4=EB)(3l}B
zL?ivzkSVDJY?K04?xKl!y_YTzg$mCp*6mMnMYRzdSm<q){v`p)+}Gq&fF0zj?3HsV
ziZ?SDR=@Omd_g%2HMO+0a^TQL)W=GX%MCR)V!c6+RHKYT8yDV2d7J$W@^Ta{?L5ab
zbiC4TWoxU+S(G=dSDKN+E2#I#I{*B-pug4I5TNsi4G=+U8n!|7rDMZJi;qK<=ME5=
z)#&sN;8j2^;00Pd+>!Eo6xE!Ni09Zn+>!Zr6bl&)OTg1;SUgLE@*R3(_y94PV$UlS
zUxoUe7cJ`4fEdeZHIMla`h5lX7@*b|-Qtmr9%r2V3{`&y7z{>Aiz~ouZgB;rcd0s*
zH}LK+fG+_b(X}H6h<Wtjh+O-3TmjdhJ}?0>{8hGh5`8|Rh;27(4a_{HjvRHaI)kUV
zh5Q<D4Dby=$r@$Ks+RjAYX679X_3ci&B)5kHk@@l*Wj!(&GY^-bZBIYu$sft==19@
zf;{p^fQNxKXMwA=dyNj%e+%edt;E`ln6`lDLN?Lzd%QeON6s%5os{&O0$qqu`EQ;P
zC7@IUkFbVnITL-OmU&!KR^t5(vXAK{exea$Y7^8do<Z%iw0%q;2gGs4tJ#g1po@1t
z7*K4N6Vd26y*b7&h$dgqUr!WN{+pc8rV9>;;?vG6b-~@brd>GJ$<y92c6w*Tv>4Wc
zMp1xhfE5q}$iv%p060vwSzXe}SiE<j)|3pTIMl^67%5&iD~xx(*CPW=5MQHAz&jhD
zpGJ*U72;mnR^=2ap~tI6>ciC8?7$dUO0_95Hcly^a_;87R&ias?kM|CIAskD89(%*
zFcFl<FQF3@6)v{f%d^+v!2(@W%A!`HH0E+NeHqX|ca1L@r#9baBr08;>6h+tCv1)E
zgH~#xX;@x-cawm87q2D(jsOl*_Jk4Q5KWtKuE?Ve6P}-Bt|+`hrNTT)*V+_4;#`3S
zkI&5;RWr^RWoll$vMxbaeY3}i3ixGET4?9QdFkJQ;<4Oc)9s?4NY-VaP{zec;(XXD
zIUT?Xr8%QzAIKWV9I9)S{<c=B_9hc&kip<gFvaeTiX&7vX=H%|9TJxC(wjZNfP5dr
z?E!p14@@c&I=wNeP?XV+ldchaX#M2riK(p50TzjUyQ~NuogAZE2V=yW=$q<`5?tt7
zhG#pCn{rN41!jm2dV_4mXTq!42OBW~*J96nZ=)Q<jh!~B1PX7f149hK5TgJ`=*1~b
z99Eq_wZE7U@=o;$_H(aYI?;L;+D)X=X;Z~uS~{&rOs7=$pX<!+=boC<M3Y)C@MDcK
zrd?;LR?AzKj&`R|>Qk<BT8yOmUM4em5m3Z1=|HZkbb4)ia^?i=gR(r9H1?~xqCjU=
z|6%&`<?864=0U2Rv8M3~&tmF6zcZo~S!dC=HKn{DM{QN+wC)l0B#yvx^fX=03oWb7
z5cT09O|<|kP2Z5urE#pJJxn2A!{TP3si`F>^<psM0?ZMw&+S>t63#|-Eqy*CFBwZT
zV%XbPx!4$`QO?YP9C41HSs)rho|*eZ;c&EtqijoMsPVU!^Oqf&WB%h0$q^3uJPny$
z*&pZTOf{Dd^)sScoBcsK8dFE{)UAefY0%rSFd#3a&9g_HHx{ERGQ`Syl;;30NAtM(
zeqS)q;9{MyF7S987X(dKICxk&UL8i>oIK&9$~mQCNod}jp`vphdLy)zOBkF<Ml$nF
zx*9!L6;B_K2#}2=(;H8jZ!W<|G41bISJ2al03aaiFh(0-F+kB#O{}&|2{{)pAgL*M
zKNgF;RSA?bH(9J9$6PzdZKLM?@j{c8>e#q3JaA73rc4FSifsa9G)+td@$T_Q)6{wC
z**AhJD#B;-<p>*lhj<}7sD0j>3nI9p$e|d+kzG#{Du!S}Bbs^ck=)Fr15d6D<HVq+
zB`|$~e~B+b7MEJD+I&->IC}fC_`EwH6hb@AlBVZ~I6aqHoTAH4LC=lM3EAs9MI9%P
z>{kHu8LHc(c=H`?iQGp~$Pomcm`I9f&vUi(qHz;IxwS@=763p)wKk})`p9NFe!0C;
z(Jw7bKj}kruuonHSOh>%=?Am|g0x~w!H9u8j103e&OB6gYU;F);C)YoFqs%~<%<3E
zoU2r<2px0Pi~5-u+J#OXD0Ko>FgTOtQj{Hdr4D)-9@_y|0#2DD%r|8wl^9B@dUXhg
z8j?6Mc%z|Gtb_@?<X@OTS;JGD(e{}#b@6V0qoa4&6n@!lq{3!zRjAS96P?|}3maYz
zNP}+EQ1P6aGzqD~jAM9)B-PH8({pn*cB2MORUi%PWrixI%jcMMS*u5dF-BRm*B2Z%
zRIWms%NW=yc9YrEWwpx_siSd_wQKwKH}(@h(3_3LVtq(o&>+Ncn(r-52(xS&-RR91
zqv(LQFfU9Y#U8~YlS0g%KK;YQi7|<z?1v(bI#2M@JaUsRQ=58o$k5tUC~gRCZb}lO
zmTp_vXY_hBy8-eim@H8QabOP=LDVAD(BV<T1Th(r*<2vbp>vu`#TI5nE<Lj0S=U*y
zA}-=6RJ5&vEO<mS8(IV5RMfZ@5?W4T3S8qo{v>*&B|dvvRD^|6?vWju+!{LEQW(J$
zGg>w^`(#!)6C)()_L`<k%Y+ahfIL|oxfL%IQPlazh0ouxA)#ZV4qKM*Gov^~@T<^g
z9UwxZ?(yevD)WrLKx_;7u3aQ(=AwSd%647M(;9m75{`E^E-JSwJGGJCxjA2KqQi?a
z^AvM<H0>xXVc0ZPX_PFhtji(w5t=OvIF;DGIG;$CSAZDM0+oB8m9hFl77P#5t;=gV
zFt*}^vJ#4Ja9qgI;iM<K>OIX}PIGo3SNJIqC>FPcwgui7(QCN#I$G6MnW4gPWoxlu
zBg(~bDc!QD_ldRsqpi%biQmYZ0sPhE?I@|0--434)?VgX@g8o`g#IYq%8hQ|)laN4
zSFVRg=?WW+Xjf}%^Rm-h?jYvr(mmdcf7U$*COTy^euqw`?#&LQ(oJo#?4xg5QY`j_
z9$V5OlGNE+-JS*)yTV;Gd0EcHZRm0vU=ITatIBTIqC(A}lu=F7z1?yMrdCIw)bd6&
zHr4XcWf@{Sy}Ybk>?KS47hO?-vYRRy>LFE7CX<tdu9RJcss$KPnL-N<TfQLgPtdh^
zo)y=4$Prl6?%_5KI=wJ5BXnr_EK#SD1!YKazDLp8nPf7~g5kohDie=L806*L$V`~j
zv<-M12T&^ZI!Xv2__E+`T75-P7W|SC<woqN5~TOg;y>x3D~5=t=<_Rz#eq=fm0LvK
zy%_uyZ>|)nnx5?Jlf02%8!=U!%ej2*aua>id2ydA42414qXxEedWqfk&eC~uUH$>X
z-AnZ=iiW~M8&=LO+N8&b=bY3n{f*Mq65;7j2>}W;TDc`ZJFHAG@;;1rKi~oSWJQ@U
z8B?}0N4!QCtSlCfQRB+5va!MP0N^nO_BD5*{3tD2WzXD?;@yDAv2I&cl=}&e%8ZfX
z^<@5+$aitQgnTme)~e;A6E@mZhW|z>>`_-^ux`!d6pHghq}eKWSG39J7UQbrx!jKr
zM46H_j1gvx3ii}Ms+S^=3N~X*R?iXi>Y9LvqDgCu#G#OX?cHLGxeb{7AcJ9Th7AeG
zL8zGm=;;`7zU>7F9Ub)7&a8Bf=hT_TnrGVJ;X}RZ>ISzuR457`YG85Xp=Oy2cn~u@
z1o#sGrz;-@DC0j1rAGjMKr5if(~YnW3Gy#!rkrwkZBL`_8Nflhew~v~e1BsGy|^xm
z^UkCE{9-75eUhj;gcdI{oUjAf6Vnc;5tMs=9Cc3s)PQOxHD?#2;3SuBNEQ{eZbO0i
z+yAk9{^1&X_QdF(T=PqGxFt9A=C<;)bI)D2BfXnqXX~S1!qzAQav7KSdO7Lj>`c}c
zp~Z~=WkgTsr8B7CwM85=RbSg%yux03y~ykK&~iJv-vr@ys1l`a9DTxpv9lfu2WBMI
z@J*?!KG3kh=MD1ZW|+(TR+G0dp6bBQ?9|#RcGJ{f=FSI?g`AN|umdtE*P^x|>|ere
zZ1dJw`eIXo@Y2Rxi^c1q%FQvNsoQ8&a{#IAz|TG;z7=>G;?x4vNx-z57h>Uf2|g`@
z{zyd}%KV*jg%O&2Tj-33;JjmS#35v=DB@Mj5#IS*Xuy`-F<5N1Gu;$lF$RW8UG1Gd
zCJ0-BAblZoHt&p{6yDSXGb9ghs#=>`R(NOAcv>S_MN8wlFMnj@ei!STqN#IdnRqKS
zb?g0N-Yby)bwJpGsvAcGb#F1StqAX>n*Pi&gWB$T^gcoj+ls|LTEFc)@y`E~c1FnF
zrI9%-+38d=ogBAC)4?4M@v2hGcYmf@I(uRSZ$;0#*!zqD<`2=d{+6myuW|Q?nBixK
z*}q6H$$M*w_*g0BdvyP;pU>{rNeRDREoU&E__3`g?X<AehE?6-2Dn|AK&L7r{q(!>
z*}~dw(Bt{ar58tmX+j?Pnd;cm)8?OS1K&7xGk>*)j{XH2D&J%=BD9ib?ko|RbmdN`
z_<-)*c@AB-wUSbHWky-FU=%rb^`h^$I;e5imDab>@i2Y7YeZewFP*2=-y_`g*QoCG
zyG!_5`oZ1jh!0P=cop4Mo8et|C?{WFDwtQ(=Dp0X$-uF^Q*te&#IsC(AnZJGX~QW3
zRs2U*=y*%)+4=i_Z#sUGdmq43hrJ4HaQHGcB5T~nFB+`$tlu75bgfUN<OQd9_}n-$
zX=R^p8W)wchTRP}`z6L@t(A5xw*T_vePWpQ{~~$s;yE_ul#@kP+GeF-peS@x;8Fg4
z>~^l;zX7?!{BLIuUDohwueH+ZwxQ`^0;)aS#jPXv>ui(!Guz-D*ro{=n%?}MvwXQ5
zGM@IrBZ#-AQ)ho(P!-;@uKP6y4zCTZ!V!|`tWkH$b|HV?V5OUv6w#Nv_i)^}eNS=f
zozM*a^3TV7RW0Nv^w&M_i<{}doki6uUHX)J8J79ppb;08E?;22U$#WJ`r(j3C7m(+
zBL!ke&b`7JMr2-5ve&^s^^f0MB%Y;(dvnFtw07^|!78QrnkV6qYbm5wIYdN1)HyJV
znMmNiH!g<%`k-a%*b82v{lCxbJci!0q#cc*;^rR^%|%J&eym=a917?Kr~sS?m<(tK
ztOZ;L*bLYXxDRj;@C1Vq&A*nkE;FzG4x#pAz*m}<ugULF{1G4^ZXzHBkVme&dUvX{
ztqRX}Kz~3VKpCI{Faa<fFat0T&;VEhSPtj_bOKfYRx)rJ>~YomrB7}~)ha;u?Wx>?
zM>c<HnuO^H-{Q!<l)5j!^FdVJ12_P94DckNJI#}S!=nm6!+xSWu#@lL?Yn^Y0Pq;{
z1HgxXqkvBsjLg8oX8vmc-$1@u^0&151Kyz5@3Tp8OXBS1F*>-fMBGhp@5>Wz`flHH
edw*MoEsMW)TbV7vmS)Se<=b>yVoHWB@qYolMq$SQ

diff --git a/tmw.py b/tmw.py
index b620786..520c467 100644
--- a/tmw.py
+++ b/tmw.py
@@ -236,7 +236,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs
 # Binning                       #
 #################################
 
-def segments_to_bins(inpath, outfile, binsnb):
+def segments_to_bins(inpath, outfolder, binsnb):
     """Script for sorting text segments into bins."""
     print("\nLaunched segments_to_bins.")
 
@@ -326,6 +326,9 @@ def segments_to_bins(inpath, outfile, binsnb):
     files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1)
     print("chunks per bin: ", bcount)
 
+    if not os.path.exists(outfolder):
+        os.makedirs(outfolder)
+    outfile = outfolder+"segs-and-bins.csv"
     with open(outfile, "w") as outfile:
         files_and_bins.to_csv(outfile, index=False)
 
@@ -498,7 +501,7 @@ def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_projec
 # call_mallet_modeling          #
 #################################
 
-def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_interval,num_iterations,num_top_words,doc_topics_max):
+def call_mallet_modeling(mallet_path, inputfile,outfolder,numOfTopics,optimize_interval,num_iterations,num_top_words,doc_topics_max):
     """Function to perform topic modeling with Mallet."""
     print("\nLaunched call_mallet_modeling.")
 
@@ -515,7 +518,7 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in
     output_topic_state = outfolder + "topic_state.gz"
     
     ### Constructing Mallet command from parameters.
-    command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ num_topics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state
+    command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ numOfTopics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state
     #print(command)
     subprocess.call(command, shell=True)
     print("Done.\n")
@@ -545,7 +548,7 @@ def get_metadata(metadatafile):
     #print("metadata\n", metadata)
     return metadata
 
-def get_topicscores(topics_in_texts, number_of_topics): 
+def get_topicscores(topics_in_texts, numOfTopics): 
     """Create a matrix of segments x topics, with topic score values, from Mallet output.""" 
     print("- getting topicscores...")   
     ## Load Mallet output (strange format)
@@ -565,7 +568,7 @@ def get_topicscores(topics_in_texts, number_of_topics):
         scores = []
         ## For each segment, get the topic number and its score
         i +=1
-        for j in range(1,number_of_topics,2):
+        for j in range(1,numOfTopics,2):
             k = j+1
             topic = topicsintexts.iloc[i,j]
             score = topicsintexts.iloc[i,k]
@@ -603,13 +606,13 @@ def get_docmatrix(corpuspath):
     return docmatrix
     
 def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, 
-               number_of_topics):
+               numOfTopics):
     """Merges the three dataframes into one mastermatrix."""
     print("- getting data...")
     ## Get all necessary data.
     metadata = get_metadata(metadatafile)
     docmatrix = get_docmatrix(corpuspath)
-    topicscores = get_topicscores(topics_in_texts, number_of_topics)
+    topicscores = get_topicscores(topics_in_texts, numOfTopics)
     ## For inspection only.
     #print("Metadata\n", metadata.head())
     #print("Docmatrix\n", docmatrix.head())
@@ -630,21 +633,21 @@ def add_binData(mastermatrix, binDataFile):
     print("- adding bin data...")
     ## Read the information about bins
     binData = pd.read_csv(binDataFile, sep=",")
-    print(binData)
+    #print(binData)
     ## Merge existing mastermatrix and binData.
     mastermatrix = pd.merge(mastermatrix, binData, how="inner", on="segmentID")  
     #print(mastermatrix)
     return mastermatrix
 
 def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, 
-                        topics_in_texts, number_of_topics, useBins, binDataFile):
+                        topics_in_texts, numOfTopics, useBins, binDataFile):
     """Builds the mastermatrix uniting all information about texts and topic scores."""
     print("\nLaunched create_mastermatrix.")
     print("(Warning: This is very memory-intensive and may take a while.)")
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
     mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, 
-                              mastermatrixfile, number_of_topics)
+                              mastermatrixfile, numOfTopics)
     if useBins == True: 
         mastermatrix = add_binData(mastermatrix, binDataFile)
     mastermatrix.to_csv(outfolder+mastermatrixfile, sep=",", encoding="utf-8")
@@ -788,11 +791,11 @@ def get_color_scale(word, font_size, position, orientation, random_state=None):
     return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background
 
 def make_wordle_from_mallet(word_weights_file, 
-                            topics,words,outfolder, 
+                            numOfTopics,words,outfolder, 
                             font_path, dpi):
     """Generate wordles from Mallet output, using the wordcloud module."""
     print("\nLaunched make_wordle_from_mallet.")
-    for topic in range(0,topics):
+    for topic in range(0,numOfTopics):
         ## Gets the text for one topic.
         text = get_wordlewords(words, word_weights_file, topic)
         wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text)
@@ -897,7 +900,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item,
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
-def plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, 
+def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, 
                    targetCategories, topTopicsShown, fontscale, 
                    height, dpi, outfolder): 
     """For each item in a category, plot the top n topics as a barchart."""
@@ -968,7 +971,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic,
 def plot_topItems(averageDatasets, 
                   outfolder, 
                   firstWordsFile,  
-                  numberOfTopics, 
+                  numOfTopics, 
                   targetCategories, 
                   topItemsShown, 
                   fontscale, 
@@ -980,7 +983,7 @@ def plot_topItems(averageDatasets,
         for targetCategory in targetCategories:
             if targetCategory in average:
                 print(" Plotting for: "+targetCategory)
-                topics = list(range(0,numberOfTopics))
+                topics = list(range(0,numOfTopics))
                 for topic in topics:
                     firstWords = get_topItems_firstWords(firstWordsFile, 
                                                          topic)
@@ -1018,7 +1021,7 @@ def get_heatmap_firstWords(firstWordsFile):
         return(firstWords)
 
 def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, 
-                           numberOfTopics):
+                           numOfTopics):
     """From average topic score data, select data to be plotted."""
     #print("  Getting dataToPlot.")
     with open(average, "r") as infile:
@@ -1076,7 +1079,7 @@ def plot_distinctiveness_heatmap(averageDatasets,
                                  firstWordsFile, 
                                  outfolder, 
                                  targetCategories, 
-                                 numberOfTopics, 
+                                 numOfTopics, 
                                  topTopicsShown, 
                                  fontscale, 
                                  dpi):
@@ -1089,7 +1092,7 @@ def plot_distinctiveness_heatmap(averageDatasets,
                 dataToPlot = get_heatmap_dataToPlot(average, 
                                                     firstWordsFile, 
                                                     topTopicsShown,
-                                                    numberOfTopics)
+                                                    numOfTopics)
                 create_distinctiveness_heatmap(dataToPlot, 
                                                topTopicsShown,
                                                targetCategory, 
@@ -1176,14 +1179,14 @@ def create_overTime_areaplot(dataToPlot, outfolder, fontscale, topics, dpi):
     plt.close()
 
 def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, 
-                        numberOfTopics, fontscale, dpi, height,  
+                        numOfTopics, fontscale, dpi, height,  
                         mode, topics):
     """Function to plot development of topics over time using lineplots or areaplots."""
     print("Launched plot_topicsOverTime.")
     if mode == "line": 
         for average in glob.glob(averageDatasets):
             if "decade" in average:
-                entriesShown = numberOfTopics
+                entriesShown = numOfTopics
                 dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, 
                                                      entriesShown, topics)
                 create_overTime_lineplot(dataToPlot, outfolder, fontscale, 
@@ -1191,7 +1194,7 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder,
     elif mode == "area":
         for average in glob.glob(averageDatasets):
             if "decade" in average:
-                entriesShown = numberOfTopics
+                entriesShown = numOfTopics
                 dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, 
                                                      entriesShown, topics)
                 create_overTime_areaplot(dataToPlot, outfolder, fontscale, 
@@ -1450,12 +1453,12 @@ def create_allSimpleProgression_lineplot(dataToPlot, outfolder, fontscale,
 
 
 def simpleProgression(averageDataset, firstWordsFile, outfolder, 
-                           numberOfTopics, 
+                           numOfTopics, 
                            fontscale, dpi, height, mode, topics):
     """Function to plot topic development over textual progression."""
     print("Launched textualProgression.")
     if mode == "selected" or mode == "sel": 
-        entriesShown = numberOfTopics
+        entriesShown = numOfTopics
         dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, 
                                                       firstWordsFile, 
                                                       entriesShown, 
@@ -1464,8 +1467,8 @@ def simpleProgression(averageDataset, firstWordsFile, outfolder,
                                           fontscale, topics, 
                                           dpi, height)
     elif mode == "all": 
-        entriesShown = numberOfTopics
-        topics = list(range(0, numberOfTopics))
+        entriesShown = numOfTopics
+        topics = list(range(0, numOfTopics))
         for topic in topics:
             topic = str(topic)
             dataToPlot = get_allSimpleProgression_dataToPlot(averageDataset, 
@@ -1584,7 +1587,7 @@ def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale,
 
 
 def complexProgression(averageDataset, firstWordsFile, outfolder, 
-                           numberOfTopics, 
+                           numOfTopics, 
                            fontscale, dpi, height, mode, topics):
     """Function to plot topic development over textual progression."""
     print("Launched textualProgression.")
@@ -1598,8 +1601,8 @@ def complexProgression(averageDataset, firstWordsFile, outfolder,
                                           fontscale, topics, 
                                           dpi, height)
     elif mode == "all": 
-        entriesShown = numberOfTopics
-        topics = list(range(0, numberOfTopics))
+        entriesShown = numOfTopics
+        topics = list(range(0, numOfTopics))
         for topic in topics:
             topic = str(topic)
             dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, 
diff --git a/tmw_config.py b/tmw_config.py
index 1ed87c0..dea8274 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -32,7 +32,7 @@
 ### Path to the working directory.
 wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash.
 ### Path to the TreeTagger file (language-dependent!)
-tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-spanish"
+tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french"
 ### Path to Mallet installation directory
 mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
 
@@ -61,15 +61,15 @@
 
 ### segments_to_bins
 inpath = wdir + "2_segs/*.txt"
-outfile = wdir + "7_aggregates/segs-and-bins.csv"
+outfolder = wdir + "7_aggregates/"
 binsnb = 5 # number of bins
-#tmw.segments_to_bins(inpath,outfile, binsnb)
+#tmw.segments_to_bins(inpath,outfolder, binsnb)
 
 ### pretokenize
 ### Perform some preliminary tokenization.
 inpath = wdir + "2_segs/*.txt"
 outfolder = wdir + "3_tokens/"
-substitutionsFile = "./extras/fr_pretokenize_subs.csv"
+substitutionsFile = wdir+"extras/fr_pretokenize_subs.csv"
 #tmw.pretokenize(inpath, substitutionsFile, outfolder)
 
 ### call_treetagger
@@ -112,13 +112,13 @@
 mallet_path = mallet_path
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
-num_topics = "50" # string
+numOfTopics = "50" # string
 optimize_interval = "100" # string
 num_iterations = "1000" # string
 num_top_words = "100" # string
-doc_topics_max = num_topics
+doc_topics_max = numOfTopics
 num_threads = "4" # string
-#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
+#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, numOfTopics, optimize_interval, num_iterations, num_top_words, doc_topics_max)
 
 
 
@@ -134,10 +134,10 @@
 mastermatrixfile = "mastermatrix.csv"
 metadatafile = wdir+"/metadata.csv"
 topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
-number_of_topics = num_topics
+numOfTopics = int(numOfTopics)
 useBins = True # True|False
 binDataFile = wdir+"7_aggregates/segs-and-bins.csv"
-#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile)
+#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile)
 
 ### calculate_averageTopicScores
 ### Based on the mastermatrix, calculates various average topic score datasets.
@@ -171,15 +171,15 @@
 ### make_wordle_from_mallet
 ### Creates a wordle for each topic.
 word_weights_file = wdir+"6_mallet/" + "word-weights.txt"
-topics = num_topics
+numOfTopics = numOfTopics
 words = 40
 outfolder = wdir+"8_visuals/wordles/"
 font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
 dpi = 300
-#tmw.make_wordle_from_mallet(word_weights_file,topics,words,outfolder,font_path,dpi)
+#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
 
 ### crop_images
-### Crops the wordle image files, use if needed.
+### Optional. Crops the wordle image files.
 inpath = wdir + "8_visuals/wordles/*.png"
 outfolder = wdir + "8_visuals/wordles/"
 left = 225 # image start at the left
@@ -194,25 +194,25 @@
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 targetCategories = ["author", "subgenre", "binID"] 
 topTopicsShown = 30 
-numberOfTopics = num_topics 
+numOfTopics = numOfTopics 
 fontscale = 1.0
 height = 0 # 0=automatic and variable
 dpi = 300
 outfolder = wdir+"/8_visuals/topTopics/"
-#tmw.plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder)
+#tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder)
 
 ### plot_topItems ###
 ### For each topic, creates a barchart with top items from a category. 
 averageDatasets = wdir+"7_aggregates/avg*.csv" 
 outfolder = wdir+"8_visuals/topItems/"
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
-numberOfTopics = num_topics # must be actual number of topics modeled. 
+numOfTopics = numOfTopics # must be actual number of topics modeled. 
 targetCategories = ["author", "subgenre", "binID"] 
 topItemsShown = 30 
 fontscale = 0.8
 height = 0 # 0=automatic and flexible
 dpi = 300
-#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numberOfTopics, targetCategories, topItemsShown, fontscale, height, dpi)
+#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numOfTopics, targetCategories, topItemsShown, fontscale, height, dpi)
 
 
 
@@ -227,30 +227,30 @@
 outfolder = wdir+"8_visuals/distinctiveness/"
 targetCategories = ["author", "subgenre", "binID"] 
 # one or several: "author-name", "decade", "subgenre", "gender", "idno", "title"
-numberOfTopics = num_topics # must be actual number of topics modeled.
+numOfTopics = numOfTopics # must be actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0
 dpi = 300
-#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi)
+#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi)
 
 ### plot_topicsOverTime ###
 ###     
 averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/overTime/"
-numberOfTopics = num_topics # must be actual number of topics modeled.
+numOfTopics = numOfTopics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
 topics = ["25", "44"] # list of one or several topics
-#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics)
 
 ### topicClustering ###
 # This function will create a dendrogram grouping topics based on their word weight similarity.
 wordWeightsFile = wdir+"6_mallet/"+"word-weights.txt"
 outfolder = wdir+"8_visuals/clustering/"
-topicsToUse = num_topics # = all topics modeled
+topicsToUse = numOfTopics # should be all topics.
 wordsPerTopic = 50
 methods=["weighted"] # list
 metrics=["cosine"] # list
@@ -258,29 +258,28 @@
 
 ### itemClustering ###
 # This function creates a dendrogram of items in a category (authors, titles).
-averageDatasets = wdir+"7_aggregates/avg*title.csv" 
+averageDatasets = wdir+"7_aggregates/avg*author.csv" 
 figsize = (10,80) # width,height
 outfolder = wdir+"8_visuals/clustering/"
-topicsPerItem = num_topics
+topicsPerItem = 40 # can be set
 sortingCriterium = "std" # std|mean
-targetCategories = ["title"] # list
+targetCategories = ["author"] # list
 methods=["weighted"] # list
 metrics=["cosine"] # list
 #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
 
-
 ### simpleProgression ###
 ### Creates a lineplot of topic development over textual progression.
 averageDataset = wdir+"7_aggregates/avgtopicscores_by-binID.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/progression/simple/"
-numberOfTopics = num_topics # must be actual number of topics modeled.
+numOfTopics = numOfTopics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "sel" # all|sel 
 topics = ["25", "44", "12"] # if mode="sel": list of topics
-#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics)
 
 
 
@@ -296,13 +295,13 @@
 averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/progression/complex/"
-numberOfTopics = num_topics # must be actual number of topics modeled.
+numOfTopics = numOfTopics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "all" # all|sel 
 topics = ["25", "44", "12"] # if mode="sel": list of topics
-tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics)
+#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics)
 
 
 

From 443b7ad58a57685dc60474b1be7f5ebc8c203ea9 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 12:01:11 +0200
Subject: [PATCH 43/56] Added wordle font_path to once-for-all variables set
 initially.

---
 tmw_config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tmw_config.py b/tmw_config.py
index dea8274..2c6ce7b 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -35,6 +35,8 @@
 tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french"
 ### Path to Mallet installation directory
 mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
+### Path to the font for wordle generation
+font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
 
 
 ################################
@@ -174,7 +176,7 @@
 numOfTopics = numOfTopics
 words = 40
 outfolder = wdir+"8_visuals/wordles/"
-font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
+font_path = font_path
 dpi = 300
 #tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
 

From b812b671ae3c67a71560c46f94cb3ff71a44b97c Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 16:20:15 +0200
Subject: [PATCH 44/56] First alpha version of complex topic progression

---
 __pycache__/tmw.cpython-34.pyc | Bin 44424 -> 45250 bytes
 tmw.py                         |  98 +++++++++++++++++++++++----------
 tmw_config.py                  |  14 +++--
 3 files changed, 76 insertions(+), 36 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index cd15355dc422cef5d1e5baab998eea1fc4a3f51e..542f1e316d8e1415d6620d15dd64ad309c0b2a72 100644
GIT binary patch
delta 2001
zcmZWpZ)jUp6hHUA{C#~%)23;fcKu^lg^Cm`PK2RztV0KGOgo+XFlUw*OVTuHZ=Mq5
zg;3*QAN-(Rkj)LIo11Q^a}0)p_(4DTK^O%=B%&ZH3Zibp_HD!8xk)<N^776*|L*yB
z&pC&GkYD~R?Z2IvefaoaPahe$V1IIAFi7oR3?F{C;jR9tS67qz%3Qed1=WM(Nh(@Y
z43Hlr*CIDSE|3s4g{+6j2N|YJJ#4TDun74fDjF4N9g^gS=>-Wv^bSF!EwJwpSVpj;
z-ebJO3(nfKAjyrA8-Z(4ce)~n{GQuIzC-hS;r+>H0p_P|z2%*k5(3GoO$?p^m=5ci
zW5YcI;Hn(cF}<o&kJXrdoRwY$)(U%BwUGDj?;^9Uo({*&df6^9+aW5;h1`6uqVm-@
zODR3^?!ApcSR%_7{bCHz78!m5BVt6V-(f!&VL5(DyivU70$!Iy9+4}!<msj$;Q<XG
zZQ|-is3^&^SO|_Q*Q-k`;48AY>?~+--EHy$<OR8c&}Sp47ZEC0Yuw_`?O&mKjC>eD
zy^Y-r>&D5CQxVr2Ge_HGgxjEipl~2a4|W=Yq+s3u#n3=+;Jy%z=>QydsEE6avn{c*
zLVG=%iapdM<Y+Ly;EkY#sMW}GI15~FQ_|f21*-RQU0}oSB@Rw>&|-{=aNNsw>Iw1^
z<R|!KrokD20n>fmb1sa=$c>ZRORmvu3d?x!H*u^ago!FiF3v<L?e>$KM8Lu*4UmgJ
zSgCbZirm55SZz~Y1CAj|lRHFihTLIzkj6~f1U1AS-4V*T8#rDzI|{R?BkCq;(IPb}
zIeMHO>kJa-T}FcpQ*nff8_1Vj%<2}&oY4+xG!nGC-6M4*0_)?H=F+U+ZQ7lnwBLse
z@1k_Q-$Vp22x5{avYYX^DT`}ikf!uUC$o?4Qs1Ff)f&Jab)3;-3@-8LTxRTJ#*Q!u
zFnN@*w*gwgQubiJr0+YmFTpd|617&ae6Ucis!e+J)Wn$Th1Sb~ndS0=+6;7B-ZD5n
zq0gU66&~VqRyFS_!Dc5J{K;OnGkAo_&lx<=q=|DclSu}n%v)kC&tNNqWd<t%t$3-D
z%X$@cU^%O@OI2NeV@hA%XGfAM%Q`iE{;hkqsuttF8vXK%OS`kmD`ZP^ybPRNWofyT
z_wr1shvBQ$15;hOs<QV#^b;rU)(74jYm9}kYv;hDKwLT^V`XH&49TQSie51)#zh}C
zMnVjVINpTVh;LdX@z|g@3P%iD{UU*rQ{bs9h$v^XHT-y$Gj6uYFF@Mc#7qoE7mX#U
zp>~pQ5pM^~F{TO-i#STn051q^eas!Y1|7uP7ds(L%@A7eFmE7q18k^ox+W~%G1v*P
zh*{;Si&E`5J~_942d^65KaLRqhcTM%dc-tiwFQk+tQk0knYLeq1ZmrKgoIu=T@1I2
zT+`p5p4Y*n!;`#5P4?3<^)2Ij7)&5(^%;Yj&`W1_ZhMNW&AmfW8?_7JvG7DYUrWRc
zH`V#FjHOY)kQk7*P*?RoXV&^&VLN!Suuy%Wu3cPvKPJR={nH1piIaNqY)0JBhtB>6
zxOi^6IIl09ONpBozdm<KY+eslUF8`6WpE9km8ve5@>x}$ar2yBrCRU`m9o0d3O5+s
V)SrI%Rcy$4z?pHR<0J;0*ninglSBXj

delta 641
zcmYjOO=uHQ5T1Gamuz?AnzXSb81N8la`2{j5EZO=s6Ui8)PuR$wBm}Uyrke^_YiVX
zsHcI(;Gy&&8XL6|@G5xr4?G!9LP~|!BBTe8&RY?D%Y1L%`@WfFzW46Cxc^PqdyZ?L
zD*r6aCSKc{YiS++E{ivp<2*;rr$!@1zb38H6ofj|1^62HIs`gYG^j8dP%)v2A~c{5
z5Ez^T^5!#Pse^AqXhxjU0nV|0`WnMbAx4RZRabuCKV6X}#)3u1A7>JX0DbeOtS&Fr
z_^LrUyYEKFg~|+lIypuk*NaJq3qZ3mi8-NA4tl@ooG|29E~hiAEg}1KX)RAvPjez;
z(nRZ6a*L_?xGLnO8d!98C0z(Cs0g?%KwgGALKs(*20RWz5QiWJO&vlT>Kb)h<8zCk
z^jf6vK%hl>Hisu>XEbOEHE0-vd#6?HWO)L@5vZFGj6?+I)_>jkGU`t4XFjbzN>RFV
z(d#GfF5kJaAg?aYR+nzg*Qngd;DGY&Y5LMhB~F<9I7KytZlgo1?IWgV*eJKD@ci?^
z>x$(7U|T$tJ3V{DMck$F?n*XLq^C%BEsiE0P@g!8a!7BxTR5aAFD@K<pj7`HcC|P<
z;xNkn9=?2oV*_?kDl6r-f*%TY=>4m#x2w{jf_()=YIIX%Z*I7wPS(jg!g0J&$NdB5
Cx|db}

diff --git a/tmw.py b/tmw.py
index 520c467..475bbb0 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1540,26 +1540,31 @@ def create_selComplexProgression_lineplot(dataToPlot, outfolder, fontscale,
     plt.close()
 
 def get_allComplexProgression_dataToPlot(averageDataset, firstWordsFile, 
-                               entriesShown, topic): 
+                                         entriesShown, topic, targetCategories): 
     """Function to build a dataframe with all data necessary for plotting."""
     print("- getting data to plot...")
     with open(averageDataset, "r") as infile:
         allScores = pd.DataFrame.from_csv(infile, sep=",", index_col=None)
-        #allScores = allScores.T
-        print(allScores)
-        groupedScores = allScores.groupby("binID").groups
-        print(groupedScores)
+        #print(allScores)
         ## Select the data for current topics
-        #someScores = allScores.loc[topic,:]
-        #someScores.index = someScores.index.astype(np.int64)
-        #dataToPlot = someScores
+        target1 = targetCategories[0]
+        target2 = targetCategories[1]
+        target1data = allScores.loc[:,target1]
+        target2data = allScores.loc[:,target2]
+        topicScores = allScores.loc[:,topic]
+        #print(target1data)
+        #print(target2data)
+        #print(topicScores)
+        dataToPlot = pd.concat([target1data, target2data], axis=1)
+        dataToPlot = pd.concat([dataToPlot, topicScores], axis=1)
         #print(dataToPlot)
-        #return dataToPlot
+        return dataToPlot
         
 # TODO: Make sure this is only read once and then select when plotting.
-    
-    
-def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale, 
+
+        
+def create_allComplexProgression_lineplot(dataToPlot, targetCategories, 
+                                          outfolder, fontscale, 
                                 firstWordsFile, topic, dpi, height):
     """This function does the actual plotting and saving to disk."""
     print("- creating the plot for topic " + topic)
@@ -1567,11 +1572,35 @@ def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale,
     firstWords = get_progression_firstWords(firstWordsFile)
     topicFirstWords = firstWords.iloc[int(topic),0]
     #print(topicFirstWords)
-    ## Plot the selected data
-    dataToPlot.plot(kind="line", lw=3, marker="o")
+    ## Split plotting data into parts (for target1)
+    target1data = dataToPlot.iloc[:,0]
+    #print(target1data)
+    numPartialData = len(set(target1data))
+    ## Initialize plot for several lines
+    completeData = []
+    #print(dataToPlot)
+    for target in set(target1data):
+        #print("  - plotting "+target)
+        partialData = dataToPlot.groupby(targetCategories[0])
+        partialData = partialData.get_group(target)
+        partialData.rename(columns={topic:target}, inplace=True)
+        partialData = partialData.iloc[:,2:3]
+        completeData.append(partialData)
+    #print(completeData)
+    ## Plot the selected data, one after the other
+    plt.figure()
+    plt.figure(figsize=(15,10))
+    for i in range(0, numPartialData):
+        print(completeData[i])
+        label = completeData[i].columns.values.tolist()
+        label = str(label[0])
+        plt.plot(completeData[i], lw=4, marker="o", label=label)
+        plt.legend()
     plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20)
     plt.ylabel("Topic scores (absolut)", fontsize=16)
     plt.xlabel("Textabschnitte", fontsize=16)
+    plt.legend()
+    plt.locator_params(axis = 'x', nbins = 5)
     plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
     if height != 0:
         plt.ylim((0.000,height))
@@ -1581,25 +1610,32 @@ def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale,
         os.makedirs(outfolder)
     ## Format the topic information for display
     topicsLabel = str(topic)
-    figure_filename = outfolder+"all_"+topicsLabel+".png"
+    figure_filename = outfolder+"all_"+str(targetCategories[0])+"-"+topicsLabel+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
 
-def complexProgression(averageDataset, firstWordsFile, outfolder, 
-                           numOfTopics, 
-                           fontscale, dpi, height, mode, topics):
+def complexProgression(averageDataset, 
+                       firstWordsFile, 
+                       outfolder, 
+                       numOfTopics, 
+                       targetCategories, 
+                       fontscale, 
+                       dpi, height, 
+                       mode, topics):
     """Function to plot topic development over textual progression."""
-    print("Launched textualProgression.")
+    print("Launched complexProgression.")
     if mode == "sel": 
-        entriesShown = numberOfTopics
+        entriesShown = numOfTopics
         dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, 
-                                                      firstWordsFile, 
-                                                      entriesShown, 
-                                                      topics)
-        create_selSimpleProgression_lineplot(dataToPlot, outfolder, 
-                                          fontscale, topics, 
-                                          dpi, height)
+                                                         firstWordsFile, 
+                                                         entriesShown, 
+                                                         topics)
+        create_selSimpleProgression_lineplot(dataToPlot, 
+                                             outfolder, 
+                                             fontscale, 
+                                             topics, 
+                                             dpi, height)
     elif mode == "all": 
         entriesShown = numOfTopics
         topics = list(range(0, numOfTopics))
@@ -1608,10 +1644,12 @@ def complexProgression(averageDataset, firstWordsFile, outfolder,
             dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, 
                                                              firstWordsFile, 
                                                              entriesShown, 
-                                                             topic)
-            #create_allComplexProgression_lineplot(dataToPlot, outfolder, 
-            #                                     fontscale, firstWordsFile, 
-            #                                     topic, dpi, height)
+                                                             topic,
+                                                             targetCategories)
+            create_allComplexProgression_lineplot(dataToPlot, targetCategories,
+                                                  outfolder, 
+                                                  fontscale, firstWordsFile, 
+                                                  topic, dpi, height)
     else: 
         print("Please select a valid value for 'mode'.")
     print("Done.")
diff --git a/tmw_config.py b/tmw_config.py
index 2c6ce7b..272ee90 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -154,8 +154,8 @@
 ### Based on the mastermatrix, calculates average topic scores for two target categories at once.
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
-targets = ["subgenre", "binID"] # 2 targets to combine
-#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
+targets = ["decade", "binID"] # 2 targets to combine
+tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
 
 ### save_firstWords
 ### Saves the first words of each topic to a separate file.
@@ -294,16 +294,18 @@
 ### complexProgression ###
 ### Creates a lineplot of topic development over textual progression, 
 ### but does so separatedly for different target categories.
-averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" 
+averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-decade+binID.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/progression/complex/"
-numOfTopics = numOfTopics # must be actual number of topics modeled.
+numOfTopics = 3 # for testing.
+#numOfTopics = numOfTopics # must be actual number of topics modeled.
+targetCategories = ["decade","binID"] # two values, corresponding to averageDataset
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
-mode = "all" # all|sel 
+mode = "all" # all|sel ### only all is implemented ##
 topics = ["25", "44", "12"] # if mode="sel": list of topics
-#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics)
+tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics)
 
 
 

From bc27d043f15f4eaaf39d989ddfc40928e50eaafa Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 7 Sep 2015 17:10:44 +0200
Subject: [PATCH 45/56] Remove comments, add link to tutorial

---
 __pycache__/tmw.cpython-34.pyc | Bin 45250 -> 45234 bytes
 tmw.py                         |   2 +-
 tmw_config.py                  |  46 +++++++++++++--------------------
 3 files changed, 19 insertions(+), 29 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 542f1e316d8e1415d6620d15dd64ad309c0b2a72..e9407ad37a3d9472034fbe6bd950030a2fdab8ed 100644
GIT binary patch
delta 79
zcmV-V0I>hU;R3SZ0txO64GIRN?Nynv2_&Kcg0n871OXLl0CWrhWC8$u3jl)x07P9`
l0DTVtlU1YP0S}Wmq$dF}vtOhS0|7#lZ>J&w0<)B-E(r>&7@YtB

delta 95
zcmV-l0HFV};R3?p0txO64GM^y?Nyku2_&Kckh3nL1OX#>0CWrhWC8$u3jl)x07PA9
z0DTVtbN~Q+3;=u&02zY;00EOnqu~J^lPRPp0Y9@*qz?lDM3Z5sA_D;q0kedsE(wf4
B8(9DV

diff --git a/tmw.py b/tmw.py
index 475bbb0..30bb384 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1591,7 +1591,7 @@ def create_allComplexProgression_lineplot(dataToPlot, targetCategories,
     plt.figure()
     plt.figure(figsize=(15,10))
     for i in range(0, numPartialData):
-        print(completeData[i])
+        #print(completeData[i])
         label = completeData[i].columns.values.tolist()
         label = str(label[0])
         plt.plot(completeData[i], lw=4, marker="o", label=label)
diff --git a/tmw_config.py b/tmw_config.py
index 272ee90..d7f5224 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -20,8 +20,9 @@
 # 5. Advanced Visualizations
 # 6. Other / Obsolete / in development
 
-import tmw
-#print(help(topmod))
+# You may find a tutorial explaining the purpose of each function 
+# as well as its input, output and other parameters at: 
+# https://www.penflip.com/c.schoech/tmw-tutorial
 
 
 ################################
@@ -38,6 +39,10 @@
 ### Path to the font for wordle generation
 font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
 
+import tmw
+#print(help(topmod))
+
+
 
 ################################
 ###    PREPROCESSING TEXTS   ###
@@ -51,9 +56,6 @@
 
 ### segmenter
 ### Split entire texts into smaller segments.
-### target: The desired length of each text segment in words. 
-### sizetolerancefactor: 1=exact target; >1 = some tolerance, e.g. 1.1= +/-10%.
-### preserveparagraphs: True|False, whether \n from input are kept in output.
 inpath = wdir + "1_txt/*.txt"
 outfolder = wdir + "2_segs/"
 target = 2000
@@ -62,6 +64,7 @@
 #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
 
 ### segments_to_bins
+### Assign each segment to one bin over textual progression.
 inpath = wdir + "2_segs/*.txt"
 outfolder = wdir + "7_aggregates/"
 binsnb = 5 # number of bins
@@ -106,11 +109,6 @@
 
 ### call_mallet_model
 ### Performs the actual topic modeling. 
-### num_topics: Number of different topics the model should find.
-### optimize_interval: interval between hypermarameter optimization.
-### num_iterations: How many times the model is improved. 
-### num_top_words: Number of words to save and display for each topic.
-### num_threads: Number of parallel processing threads to use. 
 mallet_path = mallet_path
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
@@ -228,8 +226,7 @@
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/distinctiveness/"
 targetCategories = ["author", "subgenre", "binID"] 
-# one or several: "author-name", "decade", "subgenre", "gender", "idno", "title"
-numOfTopics = numOfTopics # must be actual number of topics modeled.
+numOfTopics = numOfTopics # actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0
 dpi = 300
@@ -240,7 +237,7 @@
 averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/overTime/"
-numOfTopics = numOfTopics # must be actual number of topics modeled.
+numOfTopics = numOfTopics # actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
@@ -278,37 +275,30 @@
 numOfTopics = numOfTopics # must be actual number of topics modeled.
 fontscale = 1.0
 dpi = 300
-height = 0 # for lineplot; 0=automatic
+height = 0 # 0=automatic
 mode = "sel" # all|sel 
 topics = ["25", "44", "12"] # if mode="sel": list of topics
 #tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics)
 
-
-
-
-################################
-###  OTHER / OBSOLETE / DEV  ###
-################################
-
-
 ### complexProgression ###
 ### Creates a lineplot of topic development over textual progression, 
 ### but does so separatedly for different target categories.
-averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-decade+binID.csv" 
+averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/progression/complex/"
-numOfTopics = 3 # for testing.
-#numOfTopics = numOfTopics # must be actual number of topics modeled.
-targetCategories = ["decade","binID"] # two values, corresponding to averageDataset
+numOfTopics = numOfTopics # must be actual number of topics modeled.
+targetCategories = ["subgenre","binID"] # two values, corresponding to averageDataset
 fontscale = 1.0
 dpi = 300
 height = 0 # for lineplot; 0=automatic
-mode = "all" # all|sel ### only all is implemented ##
-topics = ["25", "44", "12"] # if mode="sel": list of topics
+mode = "all" # all|sel ### only "all" is implemented ##
 tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics)
 
 
 
+################################
+###  OTHER / OBSOLETE / DEV  ###
+################################
 
 ### 5c show segment
 ## To read a specific segment, better than looking in the folder.

From 903edcb4c9e30ad0013d8403e32e64ba8868a9bf Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Tue, 8 Sep 2015 14:34:50 +0200
Subject: [PATCH 46/56] make_wordle: Simple default for colors

---
 __pycache__/tmw.cpython-34.pyc | Bin 45234 -> 45195 bytes
 tmw.py                         |   3 ++-
 tmw_config.py                  |   4 ++--
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index e9407ad37a3d9472034fbe6bd950030a2fdab8ed..1df52ea958c10136a77ff1df64dc927dc84dc080 100644
GIT binary patch
delta 13359
zcmc&)dw85xnV&N=naSkdHffqBZF6bbPSX4RPD@L%C4`owx6hVwGWn8BJGY*1LfW)t
zxGYkj%VRkftRO79ifEUj<7IIb(G?dE)YS*w{R9M`RkyB>$m4=854-I9d(ULPNt$Tg
zKUR49%bD|?%X{AQdoSmF-{)U<-~GBf@SC#ok}F^O&F+8sT;Q*L!ch17<`6#R`vKt@
z|N2EnVBfVJJHyg4rCpT5U!%lI%V;W;4)<F|g)W;hGj1l*mZ9GD@1IuWa=DDsqp46&
zviDfpJesz>Ktsig8$^@3t2pCpp(e6eRR%u1ZVD9#iwVB@=<&6Sm-n6=_AX2xO-nyj
z1%0+ZX)R2J(*qLiY+qke#=~h@ML%S%dNnY0RUL};c<7HL2NP)$)EiIsn!1DEGW!$Z
zxG5XatSJx8s=cIDw5weu2Y2&K-rSgIfvMyif}mH<MR__P2xtTp41=Gw1R?ro`3PKM
zvY0AlZPrtI)LX?Pq57gAkbZS{rL}z_6@wLWHfqqo_QcGDY{k!c_&GT;C?(Ws_On)_
z!=^03)2RT=CQAWj>guW)wJe~c1YbHV`%O~z@v7EkJbfFrwf)kJK@Ud7P*V0<v1r7M
zCVI`IW7pE2{bAKF++vFQ%9M2(C}z0${f!5YFPXpQq{KRgi?${c=0X{ua&W2^i-oDC
z3b_i9Uzbsyh@}rNOeYVU39Ml?o>l_N0Of!R0Pp4`l(2re7(nW1cgZCvE(O#O*uHQo
zWhQ!MJN*dO$Z2@MOJA-YuCB|h!0$2AunO`s8_+{vG-mRa+P-Lle}Fy`dXg&vP^s;W
zMH8l+iDD7!Y>cYxKyH6b`4ly>R=d=ynss7h)~Z=pB<8D+PrrH_Xl)m<qPMZi90*Qh
z<q=nj+_uL$W@*WxAG<V?jK$1II+{#a@(Q)6zGgbBjCHgIEj;mP!uCWHX*pHhP(Lp-
zTA8*loeV{+8-p_)9ofE~Xu?YL$ZM#->;SN;ctfthnC?`s;Um+F_F9II7Ds>W0Q3Xm
zP!;t@l`q-XXPIej6wsVel+K`+QN*2$fF2_hNt3S_jEAT;(PQz`M2`_*E2TP&$YonZ
zjUpc0fVrk*GG&zI=M7oX2<WL%tiS^+6i)PpupT)J3u$8WV6$vi_cWXp2eY$hED&?%
z;~`rfo(0{ud*Nc+E0aT3;by9#nP2ebIxm@Z<r>h@_K^B)pESedNhHV(_YzNmI@)IA
z;gsz&tw=a!+TOz^?O%;LKdWvb7-JM$gFV(ZYHWK^l5h`pku>{r78ew<kE&#|`rWKW
z;z+ioF)cD+OU}N6W!kiOb66fPM3G0(iMP$+gR}L_b@+#1vEKItnzMnh2ZLx595Tuy
zF*D5EcbsD>t01R`rsk!>%62#R3Zn9pEo(At@PPt@XQI;n_RxMsOv@T;)R@QiM#8aJ
zu*@zdLrcb>N=QTW4EOib4~Kw*5DoTh__a-1+83-tJ?-&St^YXf`>3tEWA;Wdoc;Ff
zjoy8{+3->Ch`>b;{k`ZWuswaUQ--Nzcsj4Ub~vzpFmX7M97=37y!2#`;n{Q5^@b0{
z&6^F+=B+ybRBc+FhP!hM`RoJ1O~Jfwf^K;OhV{ZH1chB}rIV>xl;U7cbI%03MX{RR
z)}U23S*wiQqqAJzJ!j+C0ZI&8WYJ)0xroY=4ZSFq93oBe*1`w!{^=lqa1g9^ESw{h
zT{$99CPe*`(qi@&sG3#Ejy6vUBf!&gl=f3BC!Z9S>uLDh1Gqw?h#%OVRMN7G*r^+(
z33F(~P-P>egA$0`H`5<Cv3q6vNpVN^3+;D_Oq!~?hkYDAN`ii^7T$J;&yGfJ#5e)F
zG*0|6(;?0PhNw=n8AfkcjVv@o3_L{MQszf9N?E>;Hn?NxH;UcQE?6e+%D%hcm+o!j
zip3J~Xk#{t#j290Fn=JLNDZd3d|9W0OS)R}y^LCrBSgfAMy!zbcOf)1)Wb`r?KCDO
z2h-G5HweLZ<g0p<kz7Ax@~Bo;I!vhzFD1`K*$g?V)viU=s($HgaYk)gdfdmpaKHM-
z(q*I8{NB<gaW31mtks>lm%4Qla}P&JbL0yJ6r=#hF(56BG6Rx3CSg+-*$H{^cnjbp
zf#FNaD5WTJ2WqY+u*q&yv2eufmdJ0SZ@%~5dmQb!6n%?+2CMX@uilUL#c*f{Qbuu<
zR>p#W@v;j~z0p`QBCki$GfZNhrIMlkdqu5p58CWd(^t-_K8?z9t!4^^lw(o`dhN=F
zsR!^NKtqOjDYl1-a<0m(Tu^S5q)f_UIkPg<lPeoWDe1M9-Qq%a#j5$@s!f>ZQNSYr
ztWkc9pgUNim%(arxXZEmR<z<(V2$#u`qb*iMV~|wwnkyz?6*B6Cl71~Xt$BgMn|P#
z>aEqw#h0^fYl=jM=f~7RpPuv_#`-eg0synvMU>w5o56tmJgV*o{4MIM;$e$4t7l-I
zLSgbFfG+_aN2{{JR)xNm9;(OJ&Jj<km)0IV!1VYqdi_1%Nx)YCES*<N$BTXlwf{g6
z^vJK#AG?$S^;YsI+h76y+>gSU)MM+K#nagr*2RTS&!eu`usy>>b`<P5ho>;Z(}2Ag
zzsMS(AUv}EI@Es^kk^F7x^4fUWnPJt;p=z^$qHpaizyM}WZDdAjVS-d+2UK-`5T`T
zb6_zAW&%BQ%I)amJ!Xa_+lhRis^2tId_%3>v?@5E?Ck1UPoT8CPknS#b1RlXw1U*O
zUnlUE_H^v5S=Qz1^-UW`k+p5}wc`6~{>HlOnay7mtrN;9pTSHpWfJl5mLQ3EcFT&1
z)jL*1bo@5pp8(GSo&!7&m;yKqm<lLZiTn;Kcsra`ynwnF35>Z_(O&W((Y`3zBiTTr
zUw#)4zXvFAaVoX-diM`p*>7!K>&`Gc;G`s0+o>0YePFM|XGY$pcf^8WcjLTwL{M&{
zr*=tSlqT7WqmQA>I;Ag6xpJo&XZ1{EUgX1=VTa3=LF$s^Xuk`?df=qk=6UgJ(Cm7^
z-vD+K7-do)v_j#d(OimOhKCNDS!HFm_G1tccB1uGkCv>wmoQezX7+Ww4Ebq1+W<HZ
z_!JscIm_%a$w7~#$7sMRZB`rJ*WvzGm%96X-|f&UbX3ZIPN3~c0Mk1Vwjy-mqTT7<
zMyFvA*DhhtoeuYKdYhDKVu$+V_SsFw6lgVMMUs+I^nRHfOqrw8wO?%yx?gi;YjzZi
z3=@)Px(d_z34%rPHOvs86RhqpS@l0ggCC(msb2D{sKC07Nvx7k!Nf)lRpnx9VKwCh
z6py*3C4cH_qt2yx9y5WK{0zp-0R9e8NWpJkvvT&&s4`MSfdB9pnnFOd5E^@ht1IYM
zi#u8xCJ$J#c}thCnBTZ!b?f}br7K#4JQyu5qwefjKFWrl?U?Cx%U`;(zv}popgWf@
zbuMT(N+|RuWvFj35s|;3!Q}}8ySOh&cLH>fzKA^8_RulRP!-qB(}vuIc92op+_>MQ
z(<?bMqnM6@<e|dH*gz?Li>2w@8@C3MLpTldR3re6Sd@;YiFs^J<GI~dH1b>F8_m9G
zzrG(Sldrlq)10R<F_V$){onEP4FGd<COQV7<wz_!*lSeuntkEHSQ;q?C8&ma`?^`9
zw9>e<K}=U0cDB2J@5&z9xe14m&+e+%8MO|-wrA(|4*3Vn;RIr~-7?hWP9JoXq!SoP
z9ISNmWIEcLeQCRBkS-JGR!c_v2h#F3^cpkVJk)XEE~a#+Ke{Xtj_Io=Pfzlw;g8Vp
zV;D|7==4tYp=dnZZ(7UM(d%1;P?LI}*J+oIBCO;282)1f-JD|BB}`BzBXMOsKlC|j
z9Q4?91|IQd>?`sfqL8$Zqyly{(zv$Q>Y!thRYFf(p2?y{)QK5x=~eG`eQyDK0S;?)
zc0`MU`E{VecyuR1u6l?1L-*VwOlqjvyPHPs+|J$AHGA^2xGU&3Y?^zrTb5*x?XDI*
z&NNyNaBDih`U#b`Q4DmJou5{I9Id=$Z`vF9P<n;xa0QdrDIQ7fN3}=2v1j^5$Y%TL
z=zz3CViLQgmx|rVov~!@q}GH%oAHRX<UIa>x_a*nQI$3KenhOm9#N{Ar0kn6V=fx#
z^lJbte-JF|PNo*nKg#qQ`~LB(a&4o0P%YgbTy1#L-66WmOi^HiDFi2Jy&_WB-W23`
zlZ&(lx=El5l*nOA{?4V;{<#OnGr#|&*4pZdWhK^H3@9g{TN=0qI%}J@Z@^?ev-Pnt
zzKwpoT83sd)tD)yT4Br;E!l%co6rt~t*8V;RsmWFf)#cp*^VS4Ly;t?B|Ug}8iH5p
zL}d#(nSPuG=mr4KngGoJM+Dv)7B_$gkXbLG39?OR)LQ|)z;|nno>Gkm>ct$j{=ipq
z*29FM?nsEoqsk)~5HW*-6^nWv6%!grEewRF+nzS#R^C#2tp`EjGV1CgLzyPZ+~q~0
zM9>jp_~6Sr3DGGavzDn$C))AkVd_F2yNKMMDXE#}DXoD+KF{1m&N!icC5u`?PYV>(
zgf*~!Ag_AqhU)2$Lcj_wpnIAKxa-K$s@$?os2A5w%hrc7A~Xr3Oa@FL=p6QvQX2=6
zcra5NY2DEX@s;`fAm*5g#<hU?_*soo4FGKCW%g5n|D!YSe|?}n!_3azac~Ya9nC>(
zqB6k@{G17>2Q&a00kZ(((Hv1mI$RHEJ@CFWgLp$nNY+ruvP+15Av{BTQN0+RE84Sf
zh4&Q_y$I8D%$$o-8=#$l;>2u}*>dt+m#wG^0!C)eP3$n6^D!2VWn~L(!de)3_bSUr
z=*_$Sg8E5h1x2>Yhz-(BbN|txpxHhl(Wipw19ZBfYs&(jzYQ9ZyC+gO5n(}=k_X|?
zTxSGe1RY@0z1m-sJT9r}KLy1n0g$qjj`<T>!a{1m8M)kp_Hg3*7_oy&biHq-k5Np1
z7C%<ey?J`&a=8FqN(m@9<V{Drn$zfxCEEe=^<A8H8tU%;#+q@rKt0`G?{<0BEB&*@
zvaEYxil{MsbelzIWI7a(2c^uFT&SViqU+qhcB%eoQx#TSU<X^(N1_dy^pi*#3Ck!&
z2#Qc=yy5caG+bZ+%zd^3#{eeeK3kB(eZS^DyH(~slkW}Y=J>9?<4}v3sZxiU#M<n=
zhbn})T0L@j+CC2G?BJI{n`@!XQcZN`I1k1gXO%jBoL#uX@v)><lVVL`m0A{?E7lQ>
z>(tlpdhSo9Ihx9W0**N6EaW%(N@)?~1P|n52<!P1^kzSvq)^ozRbP#l*Y2U5Jnz3b
z(V;X%uFt*}Zz>?RKEi8XH=4x3@&ftJLX7(W?9KTw0=~-8Ldx}!nR(504LEeQ%Wzjl
zArg+hJ)zHTY^zJqXDMKO%6>N4Aj0a|<P3VTl(^@JP<xNGRkCAWPZHSf*pNJ-K6GS(
z@7EA$m3sYS4U1RPaCXU#@PHgP2k=7FAu&C1T}0S${P+)Ds#0!f2laAy+d2?6VL%4>
zu6F|KaiTbyk{fSRAHA_bDY<~{0Orp0x?IOUp3i6HJG{^3H_S?=Jo{GWA`%N*pX99N
z9pN4`Cg08rnwQp|mb=N(l!%)l++f4`TFSCo#kTA@>m^Y=dWjM`IjwFTTv6?$=TNon
z4bzENJ#=K&h?ajlIIj^}mRADkA6<i?!~t^)N?NO}e?qHrE44jG`rMw=rP~=SmoM5y
z^v*MOOirj?j;4-pxeWE@+{!5Rjs()HApBU3s+)#ph*Rppp;pn6{pQdKkzw-rdd&!4
zkL{p7x%XC`6r&txvk`-B19TA3J1q{aj>lnVqkS4wh`rTs*?GU&?QB5ZShwR+UprPS
z9#StITOc}>XZX#~Wn5)&u%b(h+(qhs%=j{ZgAnewY%jU*-k{g6#2IWa=h6HU&rn^*
z+txLqF#`wlNx9T<FSXU7=NoW3SiQ?#BI-pOTuv9sA91UHURRxc;doc!r3db%II><u
z7y2H6^#(bllbqx)J_@dtSn}Oyw+65W@Cg9C6WxMdPg9PVNOy8KeX&u6W7~CoB<Zf4
zsBGf47l$5wY<~;`?Bbq$eRJ(4*V5F$p42lKjpZ(Qbc^TJ8z)=EzHH4+_lfD<)W5*l
zaOTK<X_I;;(_FcV9vl9xBuh9kELW?yGdqGm#Jn2_3QkJVeQx$e2QVDQq?cSJDtdE6
zh7U|Wx_uawCY|W;DL}GchT{dv<?k?{lXb!XC(bn;m*bavG24ECL135jIeaHY6lBSq
z4;mXPdRh30nN!bLL!rs+T1sH>m5RQB+7cT?enIzVFRRVBw2Fx8zvVaY=dA`c<UT-z
zfU=vNDE9zhnzlb~rU#Pr!W{)k=S#U8&#=8?gw?EU!cv>n6Sr2^{T58rr-KrACH-G6
z$^{+z`m(RwdR%aX){f=@^ihBh12{d)J3o3GPnUbRn0*cp9|3U2*U7JVh-$nJ&MO>q
z0t@v(+>5P?4`hCA{>4K5_^(<jjMCi6lHUrb;@fA8x<^=i`*eDXdd=-E;!rku`{QC$
zerM$SNddMm2H$VVpJTTF0XRX<;T^~`JOjygqxybk>J&QcFbRt_2_y7Piu&LkkqomC
zPKQ`%-Kfv&nuWPb;e#}gya5mb@cHHgDDA-1d?69WPsb1cIXpBA&O4<tih(%YbJpCC
zx&Z)Iuh}>`O6wX~h$f^_f;tY1)}>zj#Z&ckt9J2JP{gxS?hNFPLx(V6oM6O1=Vy>S
zE2o9MXk-Fd2$qiJbVO`|CK%W~RzMPm{VV9k)T*W?E>DRlcL#8!Ru9}&A`*Ymym305
zIK2ezm!|TcuByEjVwSO(xeGEl@uYe!UYC9O+=9!?9doXH!b8x631kuqW`C{|NV{V6
z?ThrRMGKCJ93n3#g`~yUEx0!L?U|{i&Mnt|)p^e>`tl@k&z)j=KB1IXV1g6WM{YuC
zvzqjwD@L*&$}N|3ZduVyhn;Yb6&XlG)0gCz5(&`Q{PMCgp_^3knVo;BtWxGuA99<c
z{OQlnkiDo6J=`kN*%kNt3n@TnDE5GV_D<7nWeV!$REW3pJXAhrJ1<Flcqfa9vEi6n
ze9sJVn|f8%iNWk2RQ8YV;WK9!7wlmTXRh%71wM0&ntxwiPW4mts+f)N+;SIS!UWXu
zd7O%lmw<8<c7koG&gc;3^nD8J;k;`#{my6k`r5XVFR8ZW@4H8K2|c+xpjo|pUr?M?
zbM6m{quHJJe=g_Rj$n;xfRj<ND1)dwN<co(*$9^BbY3V&k|65X;CQciUwFfs91*Bw
zPqL!%RLtBdlL2xlmPI$lPpCW3tq~{wLJp*0aXF_kmsT={X5$|`FFX(wudt6e`Dglw
z;CTEZUDGWh;s41oxYenLuF6={y&z5e|2}!v(exA0&wRK|yvaV|rtFs<?kVsQ@&Q_S
zr%tx!LkGyNM!tR;&u?&WKn$QO2#1=EkKk~e-$5hywZM{*!cxlo!*Ut<`z7%8^dS^~
z4!kt?1G%g|tXy)aQqMmUZ1Vdk{=q~LrYDP;geCNvi2nE1i+Z=bIa~d4uV?uq)FAif
z>LiLBFgR~`5I^}Gz>x&6Mn@t^$0t`6c%h8L5~#!$NA02nf6@6L=)=0IhQ>#e=0;Q@
zT`M30U0dX*4K|MJycS^!`uYa1s%e1N@hV^PZPOb3cJjF{{N&|vQuibLWS7D2Zwh7~
zo!QwH^bJ5bO|NU|>-ccoDo6n*bcgVS8T~3}irxp_)l2*#P3AZuHq#*pkRdQC5`%I2
zE<YU&^GhzH3}2_xr*izJMlPorj8c+l<heY6mE3~vI?h$%cT*mk)sjcs#4L5-(QCv9
zv!8x6Bz(hYd|Z`3wqCy=n$VlxjD{a17)4pfoATl&Jm&v#mpU2CQqMkCA-dF$AKNVM
zP<5X<VU81HY@anSn2yHm;z%-etdBmSpckdyq28#}pKfX8I_ss9U51z7_&gb(+2020
z(?9-#ze>ov)jOa0H*r$^^s}}6sRA9)IN0;~<}|IAzATNV?Lb<F6BZ791wjUMV^Y!U
zav}$%NOV|o^7`?z`P|#Itx=K%fx|L9sYqCF+o+S#&F8D@KLJ_cJ#f<y5I#|f|J`$h
zJfkk0|B2o&PVCQMTZ;9`i}zuix%FX##cP7#44lOKk;L!ZviS+jSBCkJy?c<o&k*un
zjJ<E{=cS!&Tl+Ck3!oiP3z!F34A=-b2p9ky0a$=xz#V|Q0A~ouEeys}$Mj|JIn-VN
zd>QZ*;Az0O0N)4vz~%C~Bz@v?`tg~3VIbc^HGdW1qrK`)#^r0M`5EAsfZqcItwz6O
zE<vdZ&;pnPm<wnF;4P7yPk@ir!#zovkZVxI?<tm{#7X)(lrnrJJo*|;cA{^&6^W*f
zEsW^v0s53c7vw>7>jLZpL;!sN-c20SWCFm+nPZXpghrmg)02Rkgsa{q^TsLfK;<dG
j-2{eu`a<n2@K}N)yqN|5pb08zR&QSj(<gcRo-qCwt+>Tk

delta 13391
zcmc&*3w)H-m7hDAOfr)vB!K`4On?AGc!oC-1bHY3F+dPdWSC68BqQ_keiI-;9I>`^
zwZ-Zc1!<w$Ro2$W+Oh4=cDJ>)+N!P3KJ2ws?9W}BKBZl{U2B&;|MN}eo5aM@-QRD^
z@0T<8yU%maJ?DSUx!=dFZ#lpCmecdw;u5d>AAhs;_+y?=x`kGEyQdKSy89KPw-;PA
zTYI)|+|Uw~rXh8H0>9d8CS^DgNCrDiU22t$h!HhnNmHx$3wBf&I2;aLv_BDOi?{bA
zjr~dOVydErHDbEDr7-25L0zO*m3i)L8cW6QLL%QfePHRF`RxaLU9*z=lhQ*~ezz`&
zo3j$ZWS2xg?e2)nXfP=&=s}KGZ+J#6@}W2pjRm22cPvSax}x!R!x~Uv8lACV)R2C3
zo1P^Ns=-??=BrKKYp&##T$wr1153%-MEou}2jx1DMv%eNpl&>o0R5S6A`UT9j1p9*
zmLIB4*AxX@<vbmB*e8fckGi$YT(^jd{!%#;HN3UFh!K;s@Hm%<9vSME5;8S9S+wC^
zLl&d876ePmQjjurX?g8v=F*2}mV`SQl$}OWs{`fr^LY7r)TRrh5rHst>AtvZHzVPY
z5stMR`*Y=jIp(KdE&q3?7^}Wo(UgL8dJC`iU2|aWj3oyp_R*WaIvz7-$s#KIM_Ft!
zGvhG&5)gY|C9z0y&#Yv8j}gNjR-ttyNEt{u$OsTV(n^%Df4KyN#L?i8OHphBsUf1>
z!9>D{wafYR;IEPs(BK1ErcPCkNnMQMAZ}O$*_Bxi9pQ+<=c?V|7(YOA5^|C&Kp;}>
ziiBf^oPuIL%WROSv?p^uhWt8pG8a44qN*lwdD^U+l`j^k`^H|n*2|+=&}&&_Ht|=p
z@JL}v!*!n7V_KX;KDsCrk3@`6G8~VYa)X*Zu4*icjAgV2J@TUAn9d8wlCoCq8aFL9
zP?*}Cj0ZyIUjKMoM%vvLj+wDGxt+$#9Uv?!K9CKV)0t?ef%}u;c2m1)bNKZ@K2RL0
zqVe1m@8~d%q-7bbbNR^>M(KPWq&?Ogfl!hFqdOX)+E|;(t+6)kVN<0#%*bUe!aAQP
z*Ra`yj3;z)cHMv}wZ~c-#ZokwfncmXfc?m6*vNDU+B%;Gb)@E~=t)nmohha)Ks8$)
z%-`?S&RKJ`OUCz^xreFtjep$D_|Tx{PPk+V)KTY=__SLZL4p$r<p#z?6Hpz?W~0G`
zb{l3Wm@u?!k3r{GqaK<tW)^Eqq1oMLuBFb}g_1mg3G$`UnXx#(kT+44Y*2rgFk2i+
zPx2*23Y*B-m%rFDEj}FP#|2j85e(vEv-QC~de(K|gTK%^_gm222f{E0)+BUDmxLll
zkagci$RZX&Mh<lo8^!JE)`{(cRQZWXOHyp`o*aW`MWyp?qw@+Gra9QCc^y|M7>W3c
zbs-sAJOWXI8`831XD2<_3aG#u>{%CBHfdT|a2e|9j7M4WkJ7mhIJ(1=my6@+XC^Oq
zZRf)VM!hFIeR=fT=fr$@9kN*-<Wln$8+$z$b;tI^;`?IDwToJ=)OlB~*-XxbiYr&@
zyp^lhgHW|$Hfv||D#GlY{uTbLZ2V5yi^*KT1i#RQW-^|Lgvl9ZB=@wxRe06d={1(f
zMp`0cc(k`W6ae0aY4c5*EGZXLS+b$$N8<ZPQhc<)Kt4aq3BVorD{TvBH>Jz^8A=6c
zTwI#0y*Z+0kuuO`m4dJg%d)F3Acszv6qM`0kJSJu)cM??^Ad5gWd?O!6f^eq8>;Oe
zNx4tcSSM-s!iIw)oqoLG7LmGxs#<&9>^{8y0!u7>?6yAJs(dH*=Fvq_(jOxk;4t7Q
z)mhq13gOV@{TmIC23|{ODYLWbBIYk(8Qd|98=&3aojFf@CH?-)cbsd7WQ#fC*#;dH
zvsEr1$NHXdEYY3B_N7nx=Wdyl9i>M@IRc~@;gA`y@GgLkTAi9(-J(atyOT83>JR{S
zWUJcaq0BfvazHOL8Kn4zx02OG*#JH&)u!2%YFy)F@swK8c)-n}LWlZh<GcZTKHpd;
zo=?}!t9PcpO~aZ=bN7Tva)gD9kWWKO_hdFkkpb}?k+7*gmVpb+Eg+vpA9q}aDMXRm
zP}7PUvfD%?7&2NV;+yc<^XJdAx8qXy96kEWtwUejf&PU6G&m^&2ih4k0*#lu(dr6E
z;vtF9O6T<wo8Li6{bj*ucZf=In;N@tLgf>vEV0B)u8_h_3PU$8tQlpZ!9!C9*!Su@
zD#|%3wQy#M_9hGpWjV6a>WPIl1B7&DVXNp%FIY4~tl5Zl-URs(2=*vnBGT&jTH9c3
zwsp4>9UcVXU0{#$JL-YOzS%!T5w=Ed-smJ`p%51nYY&|^LfY_vFif3WJYSqmPhXNR
zQoKHv_Peb`pT}HhLHa;glKB+lb{c+<{57hc1^EdPT^<dZBw1?()>8;fJ_qtH$S=^V
zIJZ|Wwo;>dWa$*~C-uXn`**T>JcCic2l)fYe}FJ|E{i+f^vkF{N5r2e|C4@n5jpDB
zgeYra0R@?d+?CXWO%uhR)8A`~3b(b6x_H^T6f3eVU>h9%gcbe_(vJD_%`S4n{rmT!
z{*NG8Nk~?df^O5e1R=wJp%I+r@<5NeB1AfwGy;}HRKfDe;(U6>@@K^qSWJ$YKn~6F
zN{n&!7(vN)BDbn>E5?iW)zTG<{KN8IftgrNP-*#Hb<c{4_1Fff6}Z*~Rse5GAKDqL
znb&3Ntrg1$DC_i<8=XZ$&0JoczJ2AFME$T#N(WhZKaYD>`H9D)s}>9|-@!bB5rHY3
zAbB7zkbDsKOo!2h)03R7NH^LCZKD8ng+%n!@^CvrNVp?R21!N`>y#cedO>n9P9;`v
zc9x3tw^uK9rdU0IQWCpuwl;->U|8bJk<aNpF|*g{oA#dY%PrKZy&Yj%WE*myOxcR*
zz0}wrb7j&Qd-tr#yvbLw!ZwE^h1ey|?tT}h%>zoY(evin@&Q`%2Ow?eQY@`ZD-hft
z&cyhGoC=GqI0Iy05r$&P4dJz=10x_Z?@9(c-iADbw)G%ygS<gRm)qOiVh}?2$H(Zv
zB5hF1F5BQ7Bh;;zy|BR&p)JxMp&PA}5hm-mCuoK!<)YJRT}wGJG*^2Wx|6{+j&I{K
zN!p?AUpKi<S3s%(GZdE;qj$=9cfuGLul;hJ-&rTpRqG2y3eLqkw#_Jci1_nm9X9Hr
z6xJFqS@f0Ai3%eAB5TXzQGtEy5iF8G&ceEisxr<sx0-?ia>(4%loJHWLl>bvVt|_R
z=iuT2kbeVVd1Zw5%;gK`Po*V8`P71c_;t}AGE1|PNV{+(9gb^-quuFf&vSgl=@_-e
z?^3=ElNMHXnUQIYjdN%C7A~l-pW&--udny_Ax`g`M1;~3x1_i>9;NJ*e#w)O7#>i8
z4f6+>`^gRCU3qejNI$vZc|mtHpJ<-hpuOa!<1*0E9Sh0XkjV*}Q5SZ^>H2_j?!Ia~
z=26C`)!WU}ER)|&brQRgV~Oa(PJ<F%!aZF`StEg2u!l{VBE3kGQsk)F72k)%nU^94
zXiR=GoFpA(!y5{rXJOd_Gq~622zOfdDaCTUSV?QL4_Xfvd?0hdM-vEZ@e~a7;5<T+
zcz3%lZ8tiC-H{}s5sGEC`rZ{2rsbRrx{r<q_ZWPv5hKtc<52=|M2%xqMN5P8QjuQK
zvI1$$p-tmfaT;SeaGlq(ZiBoGOWUrz0z9#O;!w3Kgk;o(jrhwlk+ynsQ}wpbqu-#h
zb5U1^VTBYOb%tZXh;=8E*B0MT=_5@w$Q;nO2=#$*G}vjF^HlTZNn*2_8yRy)`(vvt
z`y>V(mZaHWI0oUEM0;7GS$Rogn059nHg5e&rM8S(#!dtPlB?5LyA)<-L&oKlHJM&&
zaO8~?Rbq^&b&^wiWy^~*86%#dhGt4F*|*uNS$+)D(Ul1S`8M@J>(qR#snvU}b@K<E
zRvFf`eB8RE5jD4F7bTD%DXN_#opNXTimjEx0()N7LM7e;gVZ?M%v$o8LvSH~Oe@Q+
z*zKz*Z9}P+2rg`rRzOIb*pBK}RF{$zTRBwULW&-v(398#=r)lGlOiE>krw>6Q?WJP
z5{YNBJ>D7z;VenJIiPB{)rz)s)3#$`ERK!B-8e<?bfMFC6}Vc2<y+KmwvQDrsKOoo
z#X2w98ld~n1i3+&gMWl2XZRKEN?^(5;MPT(po<5(Zwc)&<w~K#JEra&$^<XL3bSc6
zZ8nJV4v+xm@Gv}3YHnzEml5vlN?I1^17~Z|E$7VCuXl_W*Quhbri#7kc~?yrjeP!W
zGCR??4P-A7f2l4bLy@G(kQp&*vV`Rm>b9##rFNlT5~Lf1ZJt-;qhRi?Mgv&1IL_gM
zIMbz&0&v|rt;0|pu;Fld>E5+O6pJ#VRiZ)29-1SB^dJ9#(?t$7dFMZEvMh@Ah<YHZ
z9!i5)#h^Uo@Sq|VG!1D*s`i?i(I>zOq~ajEsTRe~65$oHSG8R8(>1o`NLE_bWJ-UB
zT9-8!>*{<0K)Sdt$f&8hZdb)cjE~I3?%^afnr$ndV8aU7>iB?qa@T0*L7~3iRGmJ%
zD<z6Tn1XzkLmKM9?%GAHk1x>DvI~`A5Qdu$lsZ9L&`JeIC;jn1xktUYdt8duHFM9w
zVNev^p{=B!L=qs^fy6-KASMWA%el?61AMp-X)CbDQU^XZ))N**zbY-w)uqAOOB3np
zOju8vV+p1$C_`OAnS5M*E;v=(kbXY6U0AM>wfai1wHqw-qI3}C1|sAE_Mz-U6YB;0
zvi+#L9;Bb63`4eFpTP)bn)krwI7qq6O8T{%d3;_y7Fr-u8Beg&(py?%Xip%wP}6WO
zfCf<}#{3Sg!X?JIs&iQswxR#V^fDusgTl5GqC_^?Zso<ylcm|53H$*(f}Lhc_lH&T
zL)3u-a~VQ^z_XQ2P>hXhelyuaX*x0uy(pgoqJ{J2jp%a?5rX|JcuNKqJ_hVfdkC+$
zaP+BFu+!H#thh(PQo1PHqift8Lph(P(w&pV;q+^r6{1SJ>8=ZJmAn8<3TuIwcZ8dq
zi-cMht}7p8Ae&SyTw__o2$E3D)J5<(AsUM}VFj7xax8<5g3ZIWjA4y}P0Kb4iWMOd
z762OsYhTl3RQe#)p-f@CWeV!uJ=LOCt-ZERsC3u0r6SczBYMkxaBWG7^?mfRZK<!B
z@XrSO!<bp&0E#R-V3p#f*aY|#m>r7~bhk}lD4nlim2an@L7m%EAr>l6WU9E6bpGeo
z6&0h6t##J*i=}p~@SYq6%ZYigdFYY62`q5B`Vp}GVMajRHKS*o99V0(G)j?VHe=kX
zJ{zqPN7G-9*5#Okbpc?Z<UrN;m*g;(4UeqAq!K(0!JRyc2DS!Pa*wU#x)?WYl*}PM
zX^6=)*c+Sj%^34BkRcUa9g5e8-Rfk#R@|L_@x*m>S8(0*GKT(j*swFQPo7X6*UfY@
z{2x&-AFY{l8%?LZ>w{gg*XY7KRa?uc#nFdY!}b7WLY3}a)&Ski+>Bc|HxS+4^IU`^
zV}C*}zey$b)~FR`p}KQ#mHL=0XM+J)u%=z8>liBH;X|+uBop@dh&2nD&!7Zn#Q_NJ
zh;ctVYD2Kih<w0WRD)S7q*`azi&N>i`9o1Ta7*$dG~2Ced-sAO&VpD<A$6>KTE8%#
z>7M3;Fy#pl$WDG1CHAPFM#&Oh{a+ED{0#LyMq*o+5tY-PEE6T_e0nh&>5;=q&%|;z
z#$15(GVtO9+icHg9FDq-_)cQBq2Q%jUB9<l99D8)y|_PpY~S@F#fs)`H$r%k_Bk4u
zxov4CcV%PC-I(zd$o)j<?H4;*8(<h)Y*^s{vJtEiR~eMK3!^`PGL--7ihj>Zr+o#R
zchvZd`a;i4aa#SP=WHg;;y~>lOvm9L&6aZw!m%4&fMjlAIm@cSg6z5I_M_y$I7-R=
z2c{25*>@cnHTFqYe^TZ2ig98(WjOL-8X9t{mzyfn_Z`@hdlRw+%R-_0yhP|L0hW=#
zQg&Fv7*c^c`>u&u!iAG{{mek}8Xgf;(%tAo)V*KjTjN{lV~%oUYgcq^BcyM)d?uT&
zb|G%GZdu;K1Yh8he5bSVgO}Bh92_N{RZkzR7ikqY$E001928?80dF>LaQMg&vr(N$
zO)NWt84Fg^^1)cIyiI*CwccNbA?s<qoE#{;okDnmm9L-;bJE+cVs-J2HO}QiU325;
zl&#S#(Zvol*;fwhNo|ZMkYB{W$3ebCM3-<j-9kP9;J}WQ?jV@ckdfJbR`Uz2JgGsw
zcVmrsU5!3eFP>D(4*k}OC2`0y4e}%ridr5-`RgD)+NUmv8p*CW-AAGzNjK%AXv0Ab
z;wm>C!^kp5u2qq41SjOJe+LzVO*`_u&N642hZ2wyF<!{0(&?KH2zI6xW_+Le$ZSd?
z4`Hn=e9?ss-R04I*6l8v0Y`St{7Q$E$TnEp0-#^kL@vz62JtG@QS!n@yhEsmZ=O`D
zi!!MpzXecl+*~_lAfW21w(fAGJ%H?+)zOl6fvWrHB=KB&%|{;*3$y1Wzf5ascLdOF
z%2`;r8DuD(dqnvUPbxox0fm;%g$z=<orgmyR=P99EGeD2*GjxCrOVs}KY>ZV3i36O
z<skn8!XA7ZO8*M-zf;6iPL5L~pT$HR$=Ta{8g<WrV8fQ)jD&>xH)07%dr`-Z%p4M!
zM^QA5@26P0S;cX7j#1zExL>@K?)$h$MEG={!#Xb!=?CHL!jhTcQ16@Q^eqtPjhSZl
zZO#WBZYdbdF5Ed^?YgB_y!>|y^~NInr<b3dl>Ywg)^g{De06WMQoNErethkPg?Jfc
zu7LKlF<fYz_X@<86X)sDfp;ddwGln;0)c8?SfuyoPfiot)uSgSh#yd-_YY!hHp-L7
zFcYqY<#CkmQSaP#aes_L0pEPYD7v(jQj}nu8S0ARJAfQRqE>Q*L<a|b7ZkS~RBwE$
z<?j{Z$xNU^(3lIM>T9?AMMQO+suw>_7u`{id*GDYuo?W#=SsJ1uR^@7Abfb7?+*&o
zgJ>OO8k0KJ?P}t<oAL?7aRovkj(bqy$s0q|?Ub*jUr^}}J<$_K=j5Dd3MVRm2x8;m
zfD_a%%9C?%8=L(V*1lOg0pAkuw}uZWZNTHuazH?t%6!&0C}AzKqB9@nO@cWV6-y}_
zlw0?#mVaBD&A<B3k;V*?-Q>*G4$}T`G!Zdc$fnh=?kcA=^8LH~;`Q{DyC2Kk?fd|1
z{uty+klo;eS$qwTuM;82v(J$E{P4JwkC~5@cQJ^!5oSL}9)cOuq`TioRp9s%@#fzF
zWpXx_K^D3iz948JG3m(!mnv~`wED)qUhz|gnKOSQ%=m{=N|O2i0b7LHb#hJWM>I)J
z%=rJENsfW?44DwArc=|!y9_gLr9W}1EeB>K;;?2bBx4gr4A%cm)KJh<iB;e<=xV`M
zQ9emlJ@(33tm*%TU~Ye*ks|+~RMC428@NQr*@!4VlMrO0|H0s-K6|fUxYcv_P7v>;
ze|fJfZ#jpA)(a?n1!Q@ZFN3!i@c0Ny>~36mwA>BV(Whew)YQqNxi<|tFoo@Fv0vxM
z_?ylJm}eO1l0QKs0=Jw)ryL>6fZsPHk}bv(^!*LqRnr6$1-;9czXF?UK<vnFHy(MT
z9Ntx+H^T%c*9IO}f@G5oT}mGT1e5d@mOhdXM$MdH@PpWxfktE?DeLV_g$0Xzf<CPM
zq1Y%j)_$9aE{%0Z>GS+#ILNQ4bTPh9rPrSP@<lGAsdN!>)ZZ>mO#TZDx6~Cf%*9hx
zaC*8JuNqHZF5XSwa5^A7KSPf<h^UjtmRL#6ups&l`o2qKfQIsh7({u0yfqu_?F^nZ
zc~~SJiJg^2jQv3O43utG4?nn4oK?Sj@cQxr-iIWt4t#$6BHU<7UVjiS7vp_4K1#;N
z^*6c6>(U>8yk9QlIrY-#-%jx-ZpAUWx{W62b9Z`|Zrds)avULW-t$M=FnwVfPSQuu
z^zPn7jGg0NpmeNSlv<IBP+UbxlpW^}72TJUp)Mx-TpcF%;4@5m4<DlceSxV4?W8O>
zs&84f@;<fap)ukw>Z1?6zR3>e|AIp)w30DCg&~%;@($)~&>95@+-dGd)ngA=rT!ap
zZ6Ewa>DAzkgWwq`%?H8f4LP6lUyjEfkefk{fZPsp93&0$6_BTh5Dauj6Ft_=?enPZ
z138P&Z{(j)<Zr*uqf|r;$x;viu^tf(CUM&n3AfEk^q@p{`21^G_$u_oR}XSJNFB&j
zkU1cif@}ua3bGTV4P-A!4@1VlS3)vHE%<6Z*cO*D`6=|{7Zrz5QXqGN90j=>Bzwsv
zPt&6=F+<@*&#aJ@%hN{!RzW_2z5{O%<WnL=mDvPMzKlK`u-R7m8tM?s$TvaGfV>6r
z4hZa1o(1_m$T=dq%G^WWg3H(}x}$@-n_;}Ame6bo)sZ;!jOKYEa#oNA_2Q#Jk+0_d
H^Hu)?bLD;8

diff --git a/tmw.py b/tmw.py
index 30bb384..aa15584 100644
--- a/tmw.py
+++ b/tmw.py
@@ -787,8 +787,9 @@ def get_wordlewords(words, word_weights_file, topic):
         
 def get_color_scale(word, font_size, position, orientation, random_state=None):
     """ Create color scheme for wordle."""
+    return "hsl(245, 58%, 25%)" # Default. Uniform dark blue.
     #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background.
-    return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background
+    #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background
 
 def make_wordle_from_mallet(word_weights_file, 
                             numOfTopics,words,outfolder, 
diff --git a/tmw_config.py b/tmw_config.py
index d7f5224..bbd06ef 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -176,7 +176,7 @@
 outfolder = wdir+"8_visuals/wordles/"
 font_path = font_path
 dpi = 300
-#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
+tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
 
 ### crop_images
 ### Optional. Crops the wordle image files.
@@ -292,7 +292,7 @@
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "all" # all|sel ### only "all" is implemented ##
-tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics)
+#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics)
 
 
 

From d64754223849f73d37f8957adbbff8b455daa558 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Tue, 8 Sep 2015 18:13:22 +0200
Subject: [PATCH 47/56] Fix stdev sorting bug:
 https://github.com/cligs/tmw/issues/15

---
 __pycache__/tmw.cpython-34.pyc | Bin 45195 -> 45451 bytes
 tmw.py                         |  24 +++++++++++++-----------
 tmw_config.py                  |   2 +-
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 1df52ea958c10136a77ff1df64dc927dc84dc080..0e49b07b889ca56e94fa3393fe1c9f93175c0fb0 100644
GIT binary patch
delta 13896
zcmc&*3w%`7nZIW;naNC^kOV>qFySRbAjm^N5CnNt5MqERScPFSa}x&U<+&3efpmP-
z)m5v-qgJIqE%n!0ZL76Y+qK)R+lTa_qE_2(Ppw7kZkK8ww6)vX?z-RqyOX&yV{q;6
z@3-YgzMOlW-~0Q{cW-Wb#dXuqT>f9x)cSufetpUF&5!zT_6V&$;+aYCtDf%>JmNiX
zp7!n7ys0}ZEmP{sG=6Kpm65S@C=>3tbe-Kc<7UE4W-P7#;=QcJYZ!*EK9~;mrlNxx
z^I%518LC88o0zR`tjc=k&=l!c^}gGew^DPUion+&IJE5SMbY6QcW35cM*65L=+WMk
z)tL@w_DIarp1zb!gfp^{9^_>8vTxjy0Gd-V=#8WXk{LqOok&GZdx6(7`;*~>DTA0c
zy8t>=hku4xq_+5XUdkuA^Lt_jj*{~T1l{s%wA%p|0F-ZozR3ha^kaDl7-Fm#C#X-Y
zyt*aVRvmIT@OGf^lpr8|>ZW>Y!xCx+>*QSYFliNWGbuaqIG=zX8yS!iJT?27wXq>n
z)?joJ0FIP(0QKs94U@(*mH|97Cp?+3>^C!79cq}dkdI$TW7;dtIGCa94y0t%ipL^m
zEEzQq7V`yrEXuvm@NX{Bs=nN~JPYm&Rb3v~d1(HebB8797^+;GN}8Rrn%co}Hd>6#
zM6A95z@e))8PDwN%%t|2NzkwcqhkT;0U7|t0I*1#&;tGPTmWK6has1txg4O4fcAvb
zX)_s>i|8TPET>=~0I)*c*EAt}KAL6NFbfJNGave5ag*h$J+UM|fO8UjlB)o~Qtgh%
zlBS%AW+n5ij8)o~myapGN|UUmhFa3RTwIj1nma4SIqKuB?_2NZ)y(Mi%rXZATbOyI
zu%zL-!Wy(}%)uXB9ZALGW+W3!B`vv0&70WV$}D3Zt;39pL@cQ*V#$o0q;^f5l`Una
z_GD6_h_ydB+2N7)^v04_vRCe)_3|<RW)%x$6ZUkaqqOi~CKk1{hls;(ANT{sp)Oj_
zL#e(#%gor8!8%u&$zqkR<VD(N?-7b*$YBg5Le!h=wRkkyt9@*$)Q26ptwl^%^5z;g
zo0h4xt|^=wvZVIeN26JX0V@<vMnj-S&H^E`!Dw4P9qN|0JH%ja`lPvH<~itQ%Y*p`
zUE0-owsy<Z0jpS;>cHeLc-TL5sQFVaI2Y=uD~NsCBh4_m6A9&pk4dDVI=0Ow!fEX>
ztw=a+YWF^q<kzM?Gi5?2YfP0j&}*%y$=Z#UJcJD@rP-gixS){NP?zjbznd~o+>)CX
z$cQY6$lF)2#x^Y$4&&nnD)JUAVzD{;;E0}e9sVI$Wy}3WOb-CC8v|+*I;3kOaWl-i
z?|8^+W<j10?Nb+sBe|ZbQ9-Kw__TAgZ1BD!gJ(r0`Sy~$BBo`P8#SNfj)dd!V2!RK
zLrcZMN?=1;7Vht-2S))FP=h^duWgf-?F%kMKgoEU&Hn_+y+m}^PhTnS&K;k=(!GO)
z4IlNE@SUol-=|#IucA+O$zg7FUA%e7ciup9Uov$dxl+4n<WgO6>AI~HT&THvwXRsb
zb|V0Fn^u>0b*&+veMN9pu)v$3OAcW(H+(`+=qf9dO2=aq4D+0OGT0;hs&#go&9bpJ
z%h)|S<c{V5Z^gEi7HyW1OQ|i{&@1Dq1H>s7Eqox$&kh0z2f-%C!Z}Ro`VoY(AzGJ`
z7He-2tC^+jXbVa~z6{55sP<AoC!Z9S8-R~J09U9hc|cdBQ)J8R)OB^zJTPLYPJATg
zet_1xNV^wz42xXu3mrF#>}}N5Gvwj$;SYLkwy@Y8eRfp&cF^Y2)d|ucGZW%8;124u
zwV4#c&<!JmhDZbVlDCwF-E=kM7qSiRF03oj?r+asD87{Y%iN#4){lx7W5m0a35wBb
zkYB+0zF0CnkOBEJpn~(aO)IR@<DncOQjA!{3fcZHgo#?+H@~G@k4X(=XsJCR1m98U
zil!p@b$V>cEGrYH{Dw&>=%VZZ9!+Y?ye2hq!F2HrwQ9j352p%!>T3%YmeBn6f_Cv#
zu6^MQSN5B<tcx^vUyL|MzK}iSli<=ldBP|&AiU!eHg$@5;Kpz_z(+C1laet?QRH^?
z^q`0AHXRQ~%pQsSCU*Mmx8LSy$F10#^cZZgg}$^8^Q+*{5TwXC5M_)AG+yq;s5=%<
zMI=%yT`@#xehV%2pXZGCM5raVtJcL+njS}It<7c%g_Lem8oGIL+c*mYKH4(Gu~%16
zQ=YA|i|5vAf7+x}mNP4@9$(y6!laXnd&H^SIZNh<b(?X{PXT@a02<|U1bTvgn+*2N
zj_x*L!h--z1!$DtQlDHJnD-Kzur&(vW<PlrN^!BV_K~!a(#A^CF!korMdEaB_PLcJ
z%je_hpvOM+DeQF`;1mFJvXXM#elzHkZ=mZt06!w28xmoQIBTE4dJ2Zg?*jY=;NLN;
zrg&DdZ>2%?@UofWkLvr&4qn0PaU83D3-CLD-vcmqZW}u$`g!!eNg!Au|AT(%Y6{eA
z$)l`?1$grh#V4r;mroUc&ON<6Aw2du>iiWOvaHAsgB|DaN1X6yfGGB_wDwRC9-%*g
z{yzW|I3Zb4yaSeb0aAuPVGx)V<3NwPF+w_-F+(;-ly~KH@pf*`%I}Dou$Uq<0Ux^L
zrC8%0G{chZL|&pMu9__VqL!^%5<DaBjo69#1eKQGR`;x$Is;^oS^;bAwG((t1~AUf
znsHsBURkxWL|JFA-t4LtYVOLG+>zCfi5X`^QW|98BRK9`6C@n(SaZ&q`CE=75E0nI
z1yBLt2B-w!$aFoX@b#pK6zRcu85v&mRT0o<HN>LiLSlU}GDtFlWWV%b&<{}L<5YU>
zR#%<KeRJ(HSC-WSUP^-6E}JL_g546|99d3piMd0rz^u1KP;R49?eB}xA=`2H$+zvC
z-cN%gIamHV<516<%tXF`6Sf;h7P(7`!~HI7TLCY{M$g2v<-=*o7Xf-PrAFE}tx))2
zET7|-1r=slP2Q0KMHq_BH>B6L4vYcI{Hxj7F&XkC#x?@H2JkWg-QbY9%_I*!k{@FN
zv$R94xNwtef>1YIcw&>yLWiY4z%)81&rH^DU)YM!m5Zd)v!3q6&|K|j*PRLXa(<hV
z8PX2*i4D`+btAYMvLY!-IeNcL4W!M|eC-z-g06OvYu;ETvIs7=*tVkOBM_{V?I6@g
zS6F+!WY#x9CmIO^t8J1eqXTs7G0c)s(ZRZzy7IxbxSNs!3dlUul2ZinLsw%wZUS2J
zRbcT+fPV#Ge&v~V{G!FKvxNGui!Q4|525)<7233@#2oM46mZQExqsgDl%NXA&0TXl
zw4Y*VN{0Fdk`dVfjE~Y{UDcPO%RSwQPqpB&g6>mVz1B6$Hpt!7C(pv<Rsy=J-=u3O
zc_m#%cRF%4;XyVu%ERIry1*u^J*fk@*78wu4%Sp=Vj0pPHjB~xXD5#JTH*a>U##D*
zcGSqpVl|z~5n>}y2ms6l9!mgN$7W)o58R5xQv*?57d89B1Mv)U1<EwFdivriB|Q1V
z#TUBPh}^2~Rk%l7y=CGWzDL**R9AFw*d*6tcPExM;#fA2CLHFD;5O(6uEF|zs;hpw
zrDgp?m{+!TKKk0Rtcuc;{#Y^`w~Hbby{UtAf1|C+^>M}4MWRd1k59NQ`sf<l9ta=9
z<{c|2$9&EKw4W7-m4!5bF=bEg_=(q5cH6jScG@u55*>PI+gW0Z`rfu@=d#5er(?S4
zqQ;TJ*~^X!pn(by<QBK96Fp~DVn?ms>S-_G?#xSOi5<C%FKH6CU*s)q1Lo7&o+lC>
z>nXU5a)kULNZkTNUqEXcT9XK%wvn-OC&I7?-QDP}BL?(vR=bU~E=gHh#!SFJokjXY
znL47pv^`annmwuRcq)G}${Uhz`+Tu0w|x6uq7~ewEHFihE!8<ry&o7X<fGItceIKV
zs_L@fQeBbh2~k-vP0;~*7#w3W4DMFD)7X9)Fmn^abRVZOL}Z^OR|plm?5rzB)BTk=
zVIHj}!giv43Bcvp!^d_>H&#=7_L#B$JsDf|1HkM8Ok*PQl)Q1-dLeTQ-#=R{U=gwk
z@4&2G01!j4PS=y-%5c94&Jc<g^HDyoZoPcmiuYq)8sJI*HZwkp8P0fZ#{iJCaV<Xo
z)ywCJ0X6oDFK@BcmL*Ozkt&WRYgPt`nY=iJX<PK@syb=s0<m9p?|f;UqpFe>iM5Mv
zZ=;>S+JqByC3y&{yL$7gqpsZ5IE5VuA9uy5^xVH|k!w(>XO_3*PVdT!>JVn_1_-M|
zyC*GS)i^=B%P2Zc05;HGv?2iA7^Q~MPe1%m_N!-iPt3CB<Zt2}sAHH8#UWK85C=#A
z>;>2dkOD{ojHX6#q0-SMpDiDj0Sk_)9g%HYv2fp^NY{neiJ{z6;T^&b0IWz`aqw;&
zvLCHMfFT0p1TC}!7-BK9$IhVdAi&6(`6C@wI*ApG7Gue}U!+eos@2CMYsDdWL(;6t
zH@EJ-)EE{<#%1iSFb)eNT46-p#<fBWyNtus={BO3M$}_Oy+*Xki27*AB``p}euG#X
zt;X2m{Gh>u+qTmlTXqoG7OYWMMwexW#?K8<xX)0E5pdF%&dyGWB2GTfBhp59%JO0w
z8o3=gT3O_hX(io6m{+4|yXa80W?(FA*l_u&gx@SCh)FJaSgkN8xQ2w9a`Ob&<wC8#
zZF)B20Uxdv#xON1jlEuDZ<Uep8GHTcq@IwM*;S49VUN-8Dq#`vXSmYH7#OAb0$bGN
zXIyKDVdA6DIP5h-wWIHzE!p1dY!8`~Z(Ii<(0-5+$$`$PHLf)(XdaHO9~trxChu5G
z3rLRFxr_<JRmS0}5kglND6L{jiI6Ir5Sr3pMC<S{>|+X{4Md^OAc5CI;PoW%2KD;R
z<s|}l!bCyf1(#D4g!z*3JCYDxp)*nYM$Rf>J0<8q*GOn^l%Nnx?LbGOLlVE6cJa{7
zc#sMK71X6Gsdlh<k>tVx={v<iI*8+-eZ#qo`Y3=F!k}cw&2|Tc(mslO+c@R5W1kB5
z2O7%lOWoNY$j+hZMg=LEt3~)+HKIkd3%_fu@KI}`@QcZ$e16eDqkeZi&6C7KBjUj-
z`sbyeD*9PNKh}Q|AWb4)2tJej?<HNM|758Z)Frr!9%xFzPP4Y5Lis90#?6LG3PNg9
zZ7-oNQf%6Vx@rntw5qnOi?p8Rl8kBlI%5I}H%juSRbyzCi}xmzpf_TfYN+^LR_JT9
z!mz$+psQ}g1d?nF(Xqz(Og&B}V#Y@sjqzk)IH3Vn6m2pNv)|!;=>*z0YLGfLcfof=
zQ}7&_J!Syu0Ne<vlJ-!U5-$Y(2<mhrK~%pjw%oN)s1>pHhO)@9MI~cx#bJW<ENSU#
zq(Kqd0B<tAd#E@-RsW*6!6Ac#hZ8r>7&18UI3dGphYZ#w4jJhnbkE5G$^!)nTu-(u
z6zVtoT56}cNey5ahQ}{d_ulanZ-ur%L+}rYwx51jTkcWU?7dap-#bRN?Aw*?q3)r&
zxdF;#_knk*<g!5i65K`Efs@Wa{Umm!YZr$d_M!n=E;-_`4KM{9b{Kaj;*^^JEz@lz
zBlYWj6Flbu`+%y8x4Xv*`C$q=uiAA<4&X(m@8sXMtM?V<9;II1E=GO;I9C9Wy1lFC
z=j!Qr+%!{^r1iw=5++q?N6G}3tB)m`=^e-y6YVAGHQ%aOeF{#=My$37u`<uo1azo;
zV8~MJ<Pi)+Fu+d6=QtT%gYqif;E`!}B;`p=;`ntH)?5QHY9Lovr`klHx;HgRsNAzh
zucQLem9y(Po7zBoXjlAzJgWMxoa^C?>PGea9c^ddMB8cq#_%3FWbVO>bjLnu#5jc%
z$VsZ|g{s@Xq5}$@e;r~c7NzOLTfXyfcRiSvD{oNg{cUQMRi$p<->m*lHn6F6jTD66
zYc@7oBE~{+tSX<>2M9Gg6*`F)E9RA;(XIhl<2HqR&G<W$`VMQ7IH)#QGsMwc%KE-&
zDkVvh>oJa~9Ruf7J8BS8cMZ%MVdn9HSphIpegpu#lOIKklhnJ=vY9vIe_)=xo95n2
zY}=4$mCM;IALw)?y$z2K$}@5=h2tE@^d7jE_elYu%@%#qFxvC%y9c|C=95~~;r%V*
zLrNZ)A&%wlI&fHIS<yUEGlI9L_tC<Fcd0D$E($vy1vo~4UU`<e81^oXV}bnaFEI!!
zsypNHHh_2d87iyjMg)WWtCWk`yc6dA>eGXB#r^6>gQxR%ea<!R!FGH-qTPyWUwq@C
zs#m`B&R6|595@MpN?x^xCQ_v*cxZOXt$f#^ajnPU-QazZQce`JMT<*5NUI_)_5AXt
z+{X`XD=vR=z5#_gg(dW92zy|zxNgN!yQx>Q4{5}sQ($D2P-V2=Ij|%77d+xBMg{Rt
z(ex3O?@3)kU+^@bsBm%L_LN)}&F3ty(Om6DB4(G?Ucm+r@JcS^ZGPt)iid~CiD%W5
z!!yJqDrHT`xj!&0S|0*#j^BvjkzLGYbu>G*{zmNRT}#J@lSA?*^>lV)upV<Z(s@NU
z{MZf`yOkWquqVCZtWoD*)#h3z)XuBMXB~}Rfho@2jRWi%!>{8byz*fz{2aie1avKz
zbh;@9ptd0cU`Sn*F^!mc@(Y^pwW|Of>aDBV#4Bq2)icCb)QYQrWnbPPWcdidR|rtn
z_zASX48YrZ6J};lii*%^NP0aeKaDX6stl{*VgnGW=i<|v_yE!xirQ~N#bDEvP`c_}
z^*&dvsHF^2K9S2^b4YM7wSC4jG)FpF^)SvVxG$;$QURCtvrGDH2AouP@q0W{A}1Yi
zEP${2ypbEbu|d2*$Iz9ZjrbCwKKsFGb-FrV{@_<X>g5kknpx^lb#w1PEFKM^j>PI{
zbGufxUpGx0&#k-eVX?R%C;53gOMBvQ-Shzm4&Debn$A6>g4a)LxDgAgY@Lgkq;ywY
zAIY-Pog`#Q>8$;BS;<zq{0o4`u;~{8z67ug;2!`uf^S3Xae)7wB4&%q2G#OOY{coG
zL+01e_jLdeZ0pTfaA<@On@QS_J`QBosE&CYU2yQx9CNpzIIjLF>WLeI;_2L}8+;<p
z(mjrIo+dEjgbN2tMka&te**X_0OQ6;Gx`qZWx}lkf`x<om#JMhP7=@j%~HKqq<Zwm
zyI-2>CpR^?wpOZRi6(I(clX`v-&=~8_soqjC^m-on&+JW+lul$T~}Hcl-&z3<2C@O
z=6lQZ{`{d?VwZa4=oIlBWqN-vS_@g8yd68?ZG*f6tvl7vZ#{n`N1=po5hr|gJ#?=L
z_gaxXNqqcKWJt8ekVsd`lfL(qw_K}U{&4r-D#ereM1@>qu?toIc!a)7QGNH#5HIAa
zZ}S!loNhO427k3&sfPS4*xL=j!b31E7v4^cmYK%1esx4mop?hfIdQz$=BfZy0elq`
z2zp%d#oP&%dsm4bz2od6iDo(Ld>%(R{;))X+D&ze;#zuFu+gTQwG-}J;-%)96H3S7
zrEpG3LYd8f%UO2It+unX0QOB|5wnW9R7lG0s)-%n))($yBXXoQWxSi7e@&0FODvI&
zo84s7>h(Jt#69XScLv1|b2C5kXug>G9L{_JU@O3G;K3-qh{qoiAeZOJknw!?ypzSu
zB4sMda2sQdVB{k=V_EcO<1rPw`&{wkzk!!2A}sG&=*365c8&ofMpM!0<SteE-tp>*
zV}9`}yP22%N;eZ6O)2TT|ATMws$KW4%YKhGDaslDzi*Ngz_U>=L~8kcv&A3S&HN;H
z^L@QVZbl*x>yk`9n<#Rykq_ZUb3L5#BWMI`%O%OG$5FXZ5*Yb#Wd2BF0cHMSsS?Mk
zI=DsRT!1V;e<8?c|K*ZbA3GKlHR`*^rih>AesRoQv653ly9$F(zwD^;d2rwa9v?!B
z!;Kq{cDSKF`d$!;n!5Ln;z~o2n*#Y-9MqLb{*ZS*_Bn&=lHbQ561O6uQ|Y0AKrk?>
zCtHdm=%Xuqpg<c;74!jzd>v@618_3C-FRd|IlXJdY<3fTxenuT1wi4lq3h_&k8p-w
z?$cjJgcDX#GWbs4n0Jl1fn@D>kBzp)u>=D+`(1sb`~l8BNkG>n2NLu*8JSp^U$*KR
ze3VRINb(Ctxq`OR)r8T=xU{hOYglfpD|lG!PnGw9*<!j{@W4gljob$w2npYhFhdhi
z_uh4`eW^L46a5+HzCoZwLzy6UqD&u?<~aKW!003Z`=m2^(y^edK5ia_ch9@h9qO|W
zt`@&kzkcv=LrL@@4y!{CO<sb^rVId-`Q;jXCV?-}@fQ_0cqr=95C6qRgOI1ya}T|i
z<?kA6l2q?ar0H)Q=o1acR_P+g83NyXeoF+?XT`A$ed#X4Neel4QFwvUv1-xPid2O1
zD!N2DdH!hGeFGWlQnJsrF+vY9!=leSBJ?LWqfk;`oYi-)S^X)s?=us`pVW1q`SBJf
zng0`nQe|JpSPG*YYu`H<vodaZk+{>|kEusL+njw1d+jLyjQDck#!2uTv=#y2+w-Eo
z;kXEog8&}{xCP({z})~jfG+`jg8;dKfkb-HuG~I_-ctal@y(U|Bbxld-`i+a6GE~M
z0FGFXNrW?~_QYeoo#{cesQS;pZ3{miGg~p!L*k0268w#k{f*5WjI9Ca0@wnu17H^b
z6hLMN083xP%0twLzg7tMrespyj2ZlT=~}dI1^6()5dZ~HsB_8t=~35Okyv`LGh$!o
z>Dv^$As@rs(jr1uK7r1{9Zf!iN6y(C!+a5aNN41a0bT<52|&R_<uCF0D}d7kbhEXO
p{surMJE=?ujkjX>3`0V>CA3FE>@cI+5@S-&J~Aw(sm-5T`QQD%rh@<g

delta 13561
zcmc&*X>=Ubm9E#V-n84YBukcUNnYFXg7-yO*w_}>7-1s;c452I>Mlub_r|M|7g?rR
z4ok345_qhEusD;15SB2VNd_`Q$ee`9LLecB%%qYKU@~ExFv)PpGQ(shbH7`yu9hq(
zBtIrPN1tB3^}c)W``&%8N+<utbMg&O`7bJ~$}fEN7gzlAw5Q8Y`Gv0T@y{oG&i@MG
z?Sb=`>7hNB?%WlXrXg)#7Jv0}(~^m7#EK4@dbN`_l19o%Tc)ml7uY-9r)ipAaWot0
z&%}>e#!<^I1L`UjY!PkhwqVXbhpNb+stMiMSx3p?Ai+O;>iCB9R>e<@lr6Q6S~5Uc
zVZR;7m`k%!Ye=G=?H|a<RMe8S^dKA6YoUg<O-Rl_VIY<nPFo~sSt=7ZoCX2Y7)(b~
zhHOQ(wj#8vj`De;L+vi#e<hDpR_GHoFqE865H6DokZuNq0j+?NZt&Ph5TRewPoRmZ
zqCrre>bhZizNI2kR?FR??2I6g0d=y*+`g2O;c7V-IjCTJlSW$3!{b6cPK^yq37r~)
ztlGqgA<I$P0Kky40#K<gs-4-$Qo5S(TTwY^kj9VK&RfaDFQU43KpIIXL$5xPk#RGb
zh#84=+&DUMLLK?<*8bEZ>eRD!ojGV{BzSe}{^QFRuRkF%j}hP2Oxjo~LzE6TIAXCd
z(<no(1r+C{SEZBIp`}*lkdek5)}gcpPzk64R0G(gQ;@>^<#_;7M~5btBe?=lPhk6_
z*{qR{%MN-7*URZB;Hj@t4^3;zt;X{NX;=ltk#*={AdxiKuC_ms<_GwYgr4LY090z1
zB@<~w&O*}1I-8&>J5;cbA)lj4<~mKSt?v|Fd9!}0Pb^lan=k5yZ`(drbT_Nah46G%
z9yu&|xa~EMnU3Vpk6jVVB$GzWN@UWeyg)6RQQyoeV;!wW4R0!uw!Mk8B^%U%84Gjc
zm1+B}OeAI=4$pFRWc&LQX*1m~FQN8wCxBJO2Dt!zda`ldPwOucH+4Tvj{e#q=m(BN
zS=1g`{>;FDX;{w6faY|cl|w7t$Bp!m(?=v`5n&9cB9xo%H@P(3uZMW4QXYEbv@H_4
zk2}|~X0tMr)hmnRMog)PoS~7dMu8cLrsEOJN6yAX+IV^J+H6-JYPn17&(EE?M9g1|
zLSA_&3wvzO((~*xnK@#X8dDva^*Mi`@$%Uhu7^9?UQ(a!mqwK6M8dhjF{vzE$62$f
zXx8={W-OXD?6N}!*{@zbKD%itJVp<i!~JGARkq8JlHds2Cyl|viVF)lKv}X~y*qoE
zxF$cR)e<>)OJRM5E1jjq#$kENAd0*Ojo56iKe(o6Uk5*ggO1(PsLl(7X$+!C_>f)|
zOBzx3eU~9CSOo<=w6(7gX1=dIF36RinX^8}3qDk`;Mq~hzWrpcm|>a|7d6JQ%VN=F
zGF)i~X`yA3P$i@x_lyn>(u3=O!w?PYSr0gC(sa0>6ZvH021oxXvimsG-7>dJjO1UK
z+f}xQjSY@^PlV2R>F*g2f$bfTJu*ru-P?27r6Zy9htr4BnIq{gy^Kn()V)`3yIl7p
zxp}kh-Mn=NfU*s<NB8t>A<o_x-V`qCChU<1&}|twAuQ~mX=Sp>1Z{=|&3!f8CxWVZ
zQH!IpsgBB+9^L7(^<2X-0ZMf1GpRF6E~B*Mh3-pcj*zC<wBSItpR);II|xs6S2%A|
zcFmYTxd^q(NRz#{MAfWPCfcG?=pi1Kx9R|G=)_4;c|LVtC;%1eJ}$7m*^FuXnAG)(
zv~gr?p~_ZB2PKemUq^pD<lSpJPKaCbpX<0y<SfeS8}ak@Q63IBYGJdx{_JYxF!~AE
z6)Ex`!-{YiaD?(4Z^ojxYsV%UArCxAY$=Pq=@l$r#9461&~BV}e{IQ1aa;a(OWyKy
zPbwBm#Jx>06pK|WZ^ihbL^?ZcVfwO3g_rlvDYnuZ;T#cij6}?gID8jDMO{6#eEKeZ
zN@m!ircQ+jxTBaA&%_Gt^r_=&nO2nI8=gwh7iBx-n5K3wo2F*0m@95qn^qk6GZ*ev
zUtY0t+?;>EqD|bFZ(BLfle>eO^^oTtN|5G=3k4LU0LRfGO^hM~k~=A}rp_=4m7#bG
z-~@s0&&ULYC~^mKE+VjL-DZ=~n9(N@-z47o{qKLzyB((zzoy4<tz-0cdr?0KLc=Dd
z2NN_i76cwIdr?}JNM>U4awNSYB<5X|($)X0ZuDP?Iy+SJn%UD%A+yR+Gi`+wXi^w@
z>6(^?eJBV~ml2+d?WLq#pmJ-LRO#hegF;!3taSCvnwD`&dVNiwcq+eo?P9TQ6UO-z
z;1K}kC_hHf7cO_E!D?}Rm-pryP>W}QIm)}#C)c$u`vj6$YqZT9gSMCC<c{qSSsNj3
zVq6%ee!Xs$_(FcsdY{Pg_!v6ucLu!=eSHD&6ab^xK8kS%jc`c*HL~sj{1x(RQ&E#N
z>x{sD3Wdpg0iOpviCUGVwMw~_3e}Sv=8Nals~e8)WB0fdt^OA94B%M+OIPMd$CG{#
zxnCp*d*$ELAG?A!>aD~m-B<yE!b9mu>eHR=;`#iGohjjW#!(k++@51cb`|WB!*dwn
zdB9cZ-)9cdCOkI(CglGepr{FndE0?u)3^{J!<SG9$x3BFjk*~5q-8`Lji^A^T=A9s
z;;yfY`B*U}%LIDpk=xOx?3fXiyiVlBYR0Bn;>&8ornTX7%Fa~JdV)*KyVXZGwa>#e
z$gLo?9dH78(_uO$YnFADdSg@9IA>k7`BL$1wYaM(fBWXY5%bO|qx>pH!cry?4{ZsP
zh<9ySeQxzm6cG(y0Q@81Yk;o<z5%EM90D`|N@gOzi3~P}JBt^Q_bmc_L2V*V43Zc~
z(0ZgbkRFsTq3~ru3B{@G*2_JwX!);f-QdZwcYvfMX4~UT3U**t;)5fb={>Pz#M8R)
zJrS1ORBD$GBxsPUaP-lgv=h>ohFpcyj5~XFWS-<h7-5H|<q*1Lc(>n;ZoMEWUh_Qp
z^{93^;9<ZO1bU@(4qB1u(L^D{uSB8i&8)IY+u<>Yh;`!VZ3b$x@?J$><(rx7cpCCi
zlx+k&4)`Q0)VkB`HHe_c!edlmmA0#m7w`0ZUsES9erczpLRY2Ca{_fw0NA}lQ8Pv-
zF0xKvHyw%LxOO=+w-xQ@@HQhY@(%Th?Q`4oI%qXw#xjyZ^g)>!&Kl#xwV!PddtTS_
z^*e$h#}3IOZNqQ@g0N4%juAq1f_2(UR{cMr!uL?2!kO}G$iTeyDXfx6$-sI&Wfk_;
z(rk(eXg}tfru?zCh#FU*JZS(;`55}l0iFPqy5I|!tns|5ssFuXZxGoeW==(Rt{S%*
zAKcmMc}vSbzw?`dE>KSOEa}k8X$#HB$iQ$qCVz@PM+xlUK!(otbRa&1c-QvQaY|P|
z=~?J3$X?WY6JT@YL4!`AL`r&)j&#H{(PO;OC=5$lbbd{lLzyEu)$&mCEwu4j35z_4
zm&IiCIS-WrX7sQzkQj6>J1XUC+GZN_6b5Fe;&uCPc>E=ReQXvQhM=ujGBX_4tK-H%
zbU0}tRG>gpS1(>Rdz?;wbJ@k7-)i|yyEfq%asBQYTSC-DVFR_jySDF?|B2q+t&~?^
zCGraA2RaPW8H!{L*AyaM_2b>syKh9532hf5kN4RiMJI!abTsMQ4te`CNA*CAx|-;Z
z8!ulagj$|zQYSBedW*9lcm=TX7Nf(D5%h5kV3)H8vA2*5Ohyz>t6XnGeR0K(bE3z3
z7l<<TO7F`{c**e|LI<^S4C40X><W~J3kc%H9qOgN1wQnstM~fa#;JGy6$?dq{<15k
z35OV+ZHeUvUpRrw4=B}5DxYKsc^b9KsDZQ0{7C(X@^H6dIk6K-7(lj{z^*0*^l?<%
zOKz8@D9tibAgBf8P!y?SmzNmYE{{{PFS9F|DIAR2FjTt=UA0`)<~y&tS2RPr6a{7|
zu%&yBGde1~4cMaoeNVG^Kn3@P*XdrXFGAOXS=t`ph~X)Y!oat7Sr*;D!$}gD&f|0q
z5j$kccQuvRyI|jB&i^ZnFdsbyWep}41XN*mAzqktVl{04kdYW1vK-&%rSKcH<4G{I
ztGD-di#hp~`XaG{O~^i6j?QWUa|puKb`5#1j1C&m42fv69_1<Z;j0@qR-#@TpdG;L
zjK^Yyvs@t*KsZO%wxSU2>dmW{iTP^kzGrtkZp#*@ng|sqv-Klr3pOPhhcTTcJ-MsS
z+`mFBP`md3aGUF@k{yY6S312-HUfJSMzDQE5r(8{j_asv4%E$l9OLtF)fk<1--bMf
zcJ=WCt2~Q@`c~)k{5uD7qGAeaO$F4c;~$v0mR;ikG-yC(BLFTWry*4j;NuA;v_bmi
zA3B-6{DB!c_ME~=oOkMGREOh`s}Rh@<1D}oKntK1FdHzL8-YWm>q{YLK5PRv9M5*l
zwk@T`J<E}9if$7f`EN$|2xkXiNAh7{-U}8WwFuBbKrA>HY4#H~<0(io4|!q0*qDVO
zU033GfyH7eC+eSivDN9=R^sUb9q#)hbSOQL=Nvpp-BI|@0#5E?tBux9tQ*ZcRn&_n
zG1DWLP=7I;Gx3A6YWjsuO?Zr@X|#RDg?6Jy`|y9MH_{!sc$pUWX*X)(gkOvMwNbwo
z@oSdQ4hFPQA07uiTA_{*Cn(3K9Smym04gJ&(jhHgPCa-FrS#|&QMY4FsM@GUyUC+9
zouGaL+QD+|V1;H?YI_b=Q7vRe$~CK68ztF<b|N??0@V(>BE&1ScqLWvoCs;7A?;v|
z7U$;9WA9b*YHjx(g5Ge=9fO@Y>7|J~hjP%mvzx=;=@4nf9cm}7XoY22hvYK=Sfz)K
z+UH!MOQ`}!#qvtj2j4rP{0>Ud6}xF2%NiRw&d1%gwPqDtMpjTe+DDuvjXs#a*!^H)
zJ472mFNc1*Y94B>pVZCNj-l4}7OJk5P4!F{A<tA15;Mds@_|~<bYG=t6E!}$Qav!#
zv>Hp3tbS0y>T!=Y>V;7Rvgm<)!CSB{Y(_>cqhham-fGT~y<pi2r@mJ^NEIq4WCb*f
z3W8*O7@~<ZA19Mgt6CUIpfH3&VX?grR>FPAzUH$Ua;gfCB29kj)8bRKM!0|nRW>)q
zr)rJll<1@eju@}gMj;+aPgyhq^&8j7*3)`)KV7KNF`dpO6c<wPO#6?n-b!?O-qzII
zgKbWn?2_D8(#l9}oMM9|Lv&iDYv@S-u_fcSNOF<1saIeRk5Qj^0T(FLEg?PTJ7zwY
z^d{lEZO%Iyrp;f`*v{q|a0riGfC-Oz$KArtV!7%=b1G)h;x-ZVY5~=CsBy*y@}gJ}
z&-4S9#iHLrzvf3&_Rxp@WYONRPt_zNxr-=wq^1?|ghhvuR(ct@p|yv$GBk1G$CdC&
zbV-K<=14Cxm?)VZ*;Cy;mt}S7aWXz^L3|N`s@2|PoqsLFpQS2OZ64W4EcvE$k>=bB
zOcpRY3>hj}itdr}X~>NgCznCYuOW3Fg#SaVNtNtIoQ*ORRrDpQQwCitjvZ{TR1c@?
z#rgTqr`yKGgRW%bNg`ZTBG_FR|84;9;(XwmL^^psgk|1fItaORU{~Tkkj|V44Lj1#
z`INbTIohlMOil^cXIn&6J(!(IFV9|n;2NQp$VD~0{hyBsdy+@ws2Y$<{BJ|Pwdy-}
zx16_*y0go7M2F;vF@!h2phhPYpc3s2whlK6_^zg^k8JFK4;Sw5o#>|xjTqV~^<L<F
zoVJc;W!H@=d!$8evV!W)BlYTLvz9{$&?@`e`wbr^iyYVp?m{X=q3ou-hIpT5=R{O!
zJEvB&VP~}8NXoO8RfjcGm}<K<Pju%q)~jOL_$f)~;FQ{Pcy)#A22<3%hZl}1^M%6;
zTcJ#OA%K4AMh___oGnN>%A5B`lqa`R-TO#w+Y74lxSbV(cH2j<Z<EL5Ikm?jmrF+P
zLwf}p8mGOsLg=#%!cSBf&Ci~$MvqJvw<>vbp4gec_vom|v7`CpMhvg}c2L8@d$=Bo
zZ(N4yLXX{modoobZUW6P!?;8U`#GNuuuYk3HX`pVt|3me>zHi#glS&J%R6N}s6I2Y
zMD(cdkGxYjoGKXTIN%Wg^;U9Q!?6I}(-iKC_oDEJ0N&&2lBQ~O2Hj1BM;BGJA)BFH
z9T;tB{xY}?Gu1rhv>z`L(>=16dXIV3cRHu#PmlJNUfAGHjU$pXXhPpIu#R3pt{b=(
zFKtS$#8{*&P;WipO2EefU>&-_y_|*|bNRl^74*qTEl$Cg4O~ThA9tb)Ub=P}4w%kW
z#i!B1Zf?m}PnVv3rT*l#4dNT>tJlsGdsN11%9rI%h~_?OQo<X2gkXx<r0&12y=E;H
z>w&E_d^9~G*Qpn;+Y$aQhT2F_vfn25c$il9p*!?R@8rtW1=qKD-qF<l>l<^fN56~a
z?qLH<>fA>Rmm>o5D)hY<pcB|td<U_M_5d8ld5@#Z7?w0<6sBMF{J@x<-@Cp={7^OC
zFi*tP#v6W7xNkzIdjK&4itlzI-4DR3vI8l@8p_aXe<UQm&yrW542GH@l`~8m<^(pi
zYuJ~!B!>~*(4PGta53<*<}LTscxpnPDp5s|pB%{NZ#*t|Gj$yLe)PefekW2Kuomfy
zz7e1+M_d<w9fgkoen8OO!!Mr5iQEP2t^oL#(-F7g1;K~mfU}5$LOpTQoNB$Ia9-zk
z73#H{X3pmcxwES8A5J9W5nOz*J3879s<xZwh=ciUH$N%X6zwFh7NiG%64Y(Ve}xGD
z3vl-;*XR0F_|`eK>~lfK=VJ85jM{f=EXPg<3L&R64?FiDj?)#ctoPF}@&F(L;B)N<
zklKOK_*N&1$3M>zb0z283YkDh9Pzo+97Ns_026lHaVj)4HW5cjdO7lVBQqx@<_3B-
zhp)U?yG6(G3^uDTeK0Ii`7<93i6mQ>&IFoF5sZ;=abU^9aKJ~W22C0OmW_pG@m<N=
zpdbcz*=h(_9C&cGI&gBPNdLt`y=H`Z^on*+8tO-<YCV_w)FbI>B9p)GquuW>#49lK
zInRRUjPo*38`HBB=h@ZcFS4X}1#0ki#=G15iuC^bLkq=T_0$8i>64e2ANZhXE=GCs
z0(3`*I!$guYO{LlBNvRtC=~Fm;()KZkIpjDels?dPFQCJeG=iy#Gvne#VyyW*KXhS
zmkRM@Ay6S|ETvHO#XHF7)xg8^gq5#Qfl`Ch35S=#pKMpU^vpuNd^BR?Jr0#mFy2}5
z39;CmL3R6`?K5ul5s711B60fMkxua5CPKeQ4(DIGGyexIdjH+$l~^>#Q6~t$gb&AA
z&@Q7Jh0@F4o^YKrZ}v{mx7-alXF%x^9*36W0?J(B<3rwr-MzNLiO#V1Id^HSD%Mgl
zD0i;7o&Bx5c>Wr*BN}pR0Mp9c!h2<0kR(#sq_K<EwEFoywc;-IyL-aoXny{^PZw_g
zu7N@<fE!7%h{MP`N<ft7+K}ZuRTS10F^oK3nQUe@DbJ#9ylhj<F^NJ%Gp0#b@t;+Z
zkFFP^e*u;$nOK2YmeDLGc;}>tNSc&Am*`SuA8J%zI$bV)#58l_Po$afWKKysr@Ljs
zKZ{#D>cB(Wawat|i5dUDPm)bE{N(<rIv-vn-eH=#Hh=2j{t}vz_tV6CoRDlWbbuIa
z?8~jmsE2nGY#Ma6;JT<wGrTJot*IBjbXY!?SV570RI1eJU<ZA`#h)>+K$K4p2y+Y3
z|3q?951$T;fcnPi+2XqV&rX+lyB>k8A0y~<US#vG!U4hqDB-Z5w;Kvb$F>{NkL}|?
zO+9#b=}kikO=0@BIBNUS{N?G-(VF!#4cZ;=Acv8Kz^%mSboEdaAly1BlaaggS3-ED
zP93~~PYxts6|Kj!8`<^Z@eu%rci%&bX@a@14x_(|yqQShrq8aXZxx~zy^f|&`J*Yb
zBp5uK8w=D3R?Io~eRU2kvIYM*H_9UzHb<aWr-xJYPXtyX%5MktN_<62pY-t?F}aFH
z&?`uyv2wW&QeThe&bC;C=e8oWtH7g+#B8<V(M!b5`D-7Ih|mZsA16=`-nZU4)SQz<
zuS4CN3C4M-OQJl#bDrS;Fq2Nj%xaRxQLuY~N_*84k8KvWs9!!dT03s{Aq}hJpP9Ax
zBFxNDs5?hW*_C*)k58xZF90|CY1gG+{uhH<Ay2Arf95B-6I?G%cf6@AeUwih6}W4)
z@Q&TN%{xUCrte@AmL0NWG;JcrF4<n-bnII6Q2@CJ#Z`2Oa^w8TqWfA}p6iHiw<btD
z>=~x+&BmgR_1e%MgL4injp}!8+5B;J=!qu!R`I4Mez4mO=5L2lg3e)#tuV>3&bfmn
zn=nQIfjjm6S@raj^|?FH*Pe;rz4kz?y?{A@4nQMdA>cef7hpeN2yhL+1dIS~0o(?-
zoq%XyIF&u-+}z%W+@}Cv;3fMUlFtLa0{AxI6~Oli>?x_Jh1;HFqJL@j7*e+1`RlCk
zU!x{}{^2Kc#WN}SI&$6wyao6zK+r6mm-ppJ)dJ=Ka`S-;0E+;4ZzvZN;Pd)ue@3R|
zdSu~!n_LOl0O$np{qFdCF4;o`dX*VVWRES4ImdbWn!-uQ{b<z-*aL_W<m3R7#WR{r
z<B?-FcVQO88A-3r=ne~TEugqW<t=!;6>yS3uQw0TN9!`Zl&<OE@n!<w4M;e*g!f38
V9e!6eo-khi><KYTUHWv_{{b};*nI#1

diff --git a/tmw.py b/tmw.py
index aa15584..29bcd25 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1013,7 +1013,7 @@ def plot_topItems(averageDatasets,
 # TODO: This next function could be merged with above.
 def get_heatmap_firstWords(firstWordsFile):
     """Function to load list of top topic words into dataframe."""
-    #print("  Getting firstWords.")
+    print("- getting firstWords...")
     with open(firstWordsFile, "r") as infile: 
         firstWords = pd.read_csv(infile, header=None)
         firstWords.drop(0, axis=1, inplace=True)
@@ -1024,14 +1024,20 @@ def get_heatmap_firstWords(firstWordsFile):
 def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, 
                            numOfTopics):
     """From average topic score data, select data to be plotted."""
-    #print("  Getting dataToPlot.")
+    print("- getting dataToPlot...")
     with open(average, "r") as infile:
         ## Read the average topic score data
         allScores = pd.DataFrame.from_csv(infile, sep=",")
         allScores = allScores.T
-        ## Create subset of data based on target.
+        ## Add top topic words to table for display later
+        firstWords = get_heatmap_firstWords(firstWordsFile)
+        allScores.index = allScores.index.astype(np.int64)        
+        allScores = pd.concat([allScores, firstWords], axis=1, join="inner")
+        #print(allScores)
+        ## Sort by standard deviation
         standardDeviations = allScores.std(axis=1)
         standardDeviations.name = "std"
+        allScores.index = allScores.index.astype(np.int64)        
         allScores = pd.concat([allScores, standardDeviations], axis=1)
         allScores = allScores.sort(columns="std", axis=0, ascending=False)
         allScores = allScores.drop("std", axis=1)
@@ -1041,11 +1047,7 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown,
         #print("dtype firstWords: ", type(firstWords.index))
         #print("dtype someScores: ", type(someScores.index))
         #print("\n==intersection==\n",someScores.index.intersection(firstWords.index))
-        ## Add top topic words to table for display later
-        firstWords = get_heatmap_firstWords(firstWordsFile)
-        dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner")
-        dataToPlot = dataToPlot.set_index("topicwords")
-        #print(dataToPlot)
+        dataToPlot = someScores.set_index("topicwords")
         ## Optionally, limit display to part of the columns
         #dataToPlot = dataToPlot.iloc[:,0:40]
         #print(dataToPlot)
@@ -1057,7 +1059,7 @@ def create_distinctiveness_heatmap(dataToPlot,
                                    fontscale,
                                    dpi, 
                                    outfolder):
-
+    print("- doing the plotting...")
     sns.set_context("poster", font_scale=fontscale)
     sns.heatmap(dataToPlot, annot=False, cmap="YlOrRd", square=False)
     # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd
@@ -1089,7 +1091,7 @@ def plot_distinctiveness_heatmap(averageDatasets,
     for average in glob.glob(averageDatasets):
         for targetCategory in targetCategories: 
             if targetCategory in average and targetCategory != "segmentID":
-                print(" Plotting for: "+targetCategory)
+                print("- working on: "+targetCategory)
                 dataToPlot = get_heatmap_dataToPlot(average, 
                                                     firstWordsFile, 
                                                     topTopicsShown,
@@ -1100,7 +1102,7 @@ def plot_distinctiveness_heatmap(averageDatasets,
                                                fontscale,
                                                dpi, 
                                                outfolder)
-
+    print("Done.")
 
 
 #################################
diff --git a/tmw_config.py b/tmw_config.py
index bbd06ef..e31bfc7 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -225,7 +225,7 @@
 averageDatasets = wdir+"7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/distinctiveness/"
-targetCategories = ["author", "subgenre", "binID"] 
+targetCategories = ["subgenre"] 
 numOfTopics = numOfTopics # actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0

From 3135f47be6b43944c140682cdfa73f50c72cfe1d Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Wed, 9 Sep 2015 21:28:12 +0200
Subject: [PATCH 48/56] Fixed double loop bug:
 https://github.com/cligs/tmw/issues/17

---
 tmw.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tmw.py b/tmw.py
index 29bcd25..d862c96 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3
+# -*- coding: utf-8 -*-
 # Filename: tmw.py
 # Author: #cf
 
@@ -347,8 +348,7 @@ def perform_multipleSubs(substitutionsFile, text):
     ## Load table and turn into dict
     with open(substitutionsFile, "r") as subsFile: 
         subs = csv.reader(subsFile)
-        for rows in subs:
-            subsDict = {rows[0]:rows[1] for rows in subs}
+        subsDict = {rows[0]:rows[1] for rows in subs}
         #print(subsDict)
         ## Create a regular expression  from the dictionary keys
         regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys())))

From 033804ca18755f9365ec37daeea6caeb65fa90f3 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Thu, 17 Sep 2015 16:46:47 +0200
Subject: [PATCH 49/56] complex progression: 10 bins for display

---
 __pycache__/tmw.cpython-34.pyc | Bin 45451 -> 45362 bytes
 tmw.py                         |   2 +-
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index 0e49b07b889ca56e94fa3393fe1c9f93175c0fb0..c7fc6dfe1b6c46b70f878fb8a6693237a831f651 100644
GIT binary patch
delta 681
zcmW-eO-NKx6vxm1zBfAa@?)Bk8FicwwNQ#6TUZOju%RMKq6DR|(B~jZW;$lbGs;QH
ztR${vso4}xn4&eFYO573lr}-oDudicN$#YeJF__d!~Oc5d;Vu=O)UHnj$av>M(p!u
zO<%&XWQUhm>>a=vSz)}v1>P{$Y-r`L&N>~7yf>#K9qV$+dnaTgqD&Q<iq2a~B19IE
z35p6D?xbEp5m1XDOHdTltC1<l)aa(VrqN-wHAo}}3U=!{SvtQh*aU1c;V+S^{yok*
zD5{~Tq`Q!fZR`s)gm6^eXv!0%kcVWkeAg6`&hSau9Ns76;gW32PAyI{vr^R@#OvW6
z>?}+oGy^W-fkR}#gP`VuL*;z<kOQai=-E1tHIE7exUBUHaDdY<J_Kqx*7gr*<jMAV
zfqvQE?Uj`gyBz7xkprFY(Z$CiSpxUC_KF)A=k{&@x4Gl$et{Uz_ACP*xXY>*_{1Nt
zozk$%?!GAShsUG+0tN>M9&5_&#lcCSoa={%A*yZiY1?+z?gj+T%9j&<t{NT_xX!Eh
z(|}7d9NWQ$d;#?H>yaZGM)~h(gTMlxA5(R{^UDeKRh{dfqyrlqjx`E6c;#6+P$<RJ
za`6_5l0d|u7B?l;qt;~RQLdSvnvRQ1ulzmbL4mZ#E316ujyTCe@+?;k%x(I~ZEaOK
zK<-{w3gIJ5y$X_*rhG?=?N3snMt!=pNUmS5kmnv0%b)RSe)L@V8sxI+9)TL3n{g!@
hS(=>z;@mnHf@&l&ze^xfetT7<6p7;q$P<Z!{{fYl#M1x(

delta 753
zcmYjOTS!z<6y590!<n0pX-LM&8Ao#hL5U>72$eoEp`z%8hEyikDaNQAO=Y?h$S7Ev
zq9@cOALJu5GHd5YKM@3xL1hp~KSBCi`jPe#1f4rye(c9$FV@=ooEOWYb455;lT);H
zteyGt-1)|VO#a}A0H<V$HjfMZTU)Uq%uBAbR=noI^hgrc<b<!Q)J{YxDl94*?yJ;D
zWDx0~xS*akiV122g#{Uc;-Hv?T5`y+(7|(Fidp4F|2AuV9#zO2{sOCw3>#1S{{nV7
z7Ra+jZYn+bKCsJO38kK_5Lw+wLkc`>&#oyGQON2cuc(6j)fbeHYs)m#-ex*MuGSvg
z42wg1Hp6S7yj+v)NRU+tyGTz>?C4A!;Y<v8Dip=;0@K=}!7Vc26v@ay4m9D0Q+1ip
zkcD)WU23XX(-e;nT^<m~<?rFIKr!=`L7<9{Ui}Az`0({vfez_w_Q{_O4(V@LLJMy+
zqzd%1waEjFu&)`wUCwIRCos+x(FNcg=NQKYK6Cx86Bhj7x2<ttgUjRX0xA5leb|Bk
z*WDfg_OsU61DxccuDt^1d9nMIK%DRN+JOl7_I(06xcXkX1w%YNP$MwUTZWYMvRv#A
zaP&csKoV=CNx&c3+qaX8#!3Z}x#?jMkSk}$7fQBMoCF#*3VX=R848o0NnYJYx**+6
zx?jDXfRSY4ItxY8gh}+0o+V#S9LNulr_n_Q%&}MfmPgMfkFizN9P-563Dqnzl+X?t
znDihhpADBtM}JTbPEMmxj`nAA#Z)U0<Zq9n0+oDv#%<np>B$T*&F-gVP<K_%=Bw3Z
R+gvVE<@DSUwg38D>3?Rs&D;P0

diff --git a/tmw.py b/tmw.py
index d862c96..732cbc7 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1603,7 +1603,7 @@ def create_allComplexProgression_lineplot(dataToPlot, targetCategories,
     plt.ylabel("Topic scores (absolut)", fontsize=16)
     plt.xlabel("Textabschnitte", fontsize=16)
     plt.legend()
-    plt.locator_params(axis = 'x', nbins = 5)
+    plt.locator_params(axis = 'x', nbins = 10)
     plt.setp(plt.xticks()[1], rotation=0, fontsize = 14)   
     if height != 0:
         plt.ylim((0.000,height))

From 52270fcd926171d9513e959cdb0988f05fcb2573 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Sun, 18 Oct 2015 18:26:35 +0200
Subject: [PATCH 50/56] Add normalized and zscored topTopics. Fix
 https://github.com/cligs/tmw/issues/18

---
 __pycache__/tmw.cpython-34.pyc | Bin 45362 -> 45567 bytes
 tmw.py                         | 207 +++++++++++++++++++++++++++++----
 tmw_config.py                  |  72 +++++++-----
 3 files changed, 227 insertions(+), 52 deletions(-)

diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc
index c7fc6dfe1b6c46b70f878fb8a6693237a831f651..229b7dfa66541bcdc1717a7162d6bd5735272263 100644
GIT binary patch
delta 8604
zcmcIp33OCdntr#cQb|=-Qb|Y=LMmjVz#xl2h|qx$2|-VwO$eY6sH7@Mg`_InDo6+c
zM8y37`Xg@GF4HO_ve@N}xQ>pVnRZ7VY<SwX+BhvU(;jJ0(`uv4|J@g=$}}p@5~%Oo
zcenq)|Nid#)yJkYzc<-GOG&ktx282^41dgSOBU8hR`N5B>aeDD3Jqw|wrHwhMq{f<
zb2=N`qTX%MP2EA=8VXB)pj}T0g*`H?n|%SVTj-W<ZyTi)yehWP(Sk{0dnDO)yAa#y
z?+V9@o9OLAzqmCrWAryd?4-)#khqKX7duTY7CKX&L%%C-FM7p1VAVR!+JL5Q5n9di
zC(RpdS~yV~uxVb4w#B4ve}McYlar2_29kOBE&64NEP2rEP0<E2v{hD3aywZYNY_>+
zs&;C$O|#3gEf1Pk@1e%h@`-UvLZ1q6`cGK?YH7K6kiIIt(D{S3GC<2bVA8w}ZIwxr
ziPTnB4py_yi`J~@@@%zync15&xG*%w>>Ykyn_Lv#Bv);+C$`BC3#wTyT>Tr3|1b7B
zwa!%SI!$X6ns<ce&C{;cw1E_@hBd_hH?Zt2T4$Q(9jWnD9@t`5oe5ee_p@kinC3NW
z-qFg6yrXE`m^@KHwPPkaUD`kr^eWimyoH!3cxHpUh;AA4m_xVt0`*P0slHw}HC4mN
z>Z#h@FW#bO-8;k%svrA`c$A#u@<kU-8@FhrQ>2Iq0$40urqLpsu}~C=R3Q&Vc8|MJ
z<dCLmjs4b|V8A!t?oN~`{496U>Y2IOw{g>L)04YBU0uHLiXKn6L+&Dfxhr=kCZ#dx
z$>HECUtopwwfp+yopgVBY5obJX`xIW)J%3JoFXzshRJFwH93TQCUU&|4<<qM+5{(?
z{Ooc^lA@9!pCmD{gS$E>_6Y3bpwA{|i*Ly?sYv{q#!NaqtklH&TCuONn?{-B;fQT=
zi7D@8?x@$3LaY2eGT0lEOkI<&0Pd!`DMjKQ>Yfr3uhB<SW{Kx$^wg(BIsJBOmG~@@
zKW(DOIF6a`0U7~EsJX(Gu^-J#0BwNRXj_FV|06UD8FX{V*Uqx<3rnolZQfujZLZ8o
zPi<#0@k6(VLqXpn`n;l0oQh;lPY}};8BlOr4G|y4Zl?gB0YFDj==RBWpW7x+qU#O7
z2ehy<C*wIZ-vs;_@JCu(c|l$q<dMP8n%QoGoYNCnE>@PzH}q^}-!vtIS26!DfG+`G
z0Tde+gAH}QchUPbgWD`mQ`f~)92>EB68AvX*=F8TqvxBQk=HM76joznDx2AI$v0T3
zvB;&DTO9j3JU!6^xG?xvfN>PGa1uRnO~_a64}|2m=sK;m{n5;<g6lywi~IOQ!VbGp
z_5%yb7t%s2XFI9<XZvQ^wRZNOB#k5679lnyY614Y1dW5<DznBq8AybK^5fr5^Kt-8
zpw-n`G`=dGc2;MMNKoFk+OI7Rv&Ur`9+$`-=b*=`=h1<x@g*>^IJ<&O>tz$8ZdvE^
z$o{0_o;5A>WtH0`1QpF%YJCk0UZ<;Pxt%z_p2#VIZ6hRI7?=k*O)t#~irsY4>~SI^
zvU2ttLRs=T$;bq%n==`kZ=ADuld(DWutM;OfFytoU<ZuA==Fen0CeNF8>&dgxC^}o
z8)&7VFO@;h@OyoMu)odklSVp|X&6ffjK$Qk8l7%WqGdJ3;z`<E;}(v{-kOU|d4|Jb
zhVn-_1x<)Z-b14H@?>RVinpWGT$`7@A47^c<z_4Jc#O8yE-cIeANFvt$KSd*D7_*1
z3C89FP5^#SXKE|O5vrUwO^hNr?@c2D47xmKC~xLnKzmu;g~CcV*DZIL&1X&8lqbfD
zbdxNNoUTg~Gw;F7%K<h9cY-Vh2R8U}y9}=Bkw-D+ga~pO+{p$rMd&y|wU=I;UyKFG
zz07Tw4;Yevz>>Xy|A2K5OUG_;6O!r0OAEyX^vR_jv6pHuYe*dd_SpV8YRZX`hc8Ps
zSto$PZhE|azV!k$3&^ryvY1E}3#Jz+>MATK$+$pj5+=<6oS+>GI)#h=x?r4`7b#m9
z5TcHD{k+b40Q2hTlb<gXl{EG8BGEu^wmK<v`D5Z`dcHd=@d!_7l<(23hLL44oqf)|
zZnK`&=4Z1|TAAzb@);JCLnqrEw5uUK`*Iw?&QmpWCTAL_$T!I*daB{gvk_x^W6oLE
z*xT5fy(~K4l*TTeYsxgq<&kNNvP8=(yoogYjL)M<7=@u%7q{lJF+0$0L}#>K2fPnx
z!c?WQC1{Oi&}|_Wzdzvh^|_PeQgkn(eNAIH`@P#VzMu_!7-1*Vlq+l;OkJi7k;~zS
zt)w+lwD?gWD(LW%0`U%gzGTWc$k;v12%xZG-O|$)49hBzvM{M0-P#xSx2_7wYFfB-
z#`HO$cp*<@>Mdwr0a%IYwzgm(9BTD+`E+ZC&)?n=HZ<tMQEHWFk1ib{`sj<LWuiBd
zcg01bYdKc80(u$TX?nV6txtN|eKnr2hslTJA`Gffn_)zrIi9euJt+OYkX(VWHGs7M
zWiN`Fl8fSO83w?fY*8G%ab>#LMju?6C;I8@E1#-1TwLjdCn2SsOB>1!p$NCdU`V;I
ziaLhPY;R60j9JtJ&;qItjjbnFq?l4U1<3VOvHbm%ICF53t2t+GECwo_gNmNOp68QX
z!War>&h%6cBXwb4cc`Hw*c&k7hho4JJKXl>T<1Q}PBUmL3vjBUMQ)5d-+YIt^<uUU
zz!jfX-*0IQ_PExh@_bj@i_b`x3RaaaS%E3<aJTfM#jg#T5ePXCX|jXvT{*m|i<|w{
zaNUZQb>;nu<?AaaO!jW*w~X%zv@2b>Gh=oo16T*MlGT9K0AvUm1oQyHfDlz}&aE5|
z$}wBB8Mda>q|~b9e=g0lo%QC?I?piiG~MJW6PqFtPm5?-jICE<)AeX=1Z-k(r^`OH
zT^Nd*np}sz4S*puVuqzcxne_cWmwmZt!d(Iy0dkPxQc_(AB{jXb1)ErL?dVpmQ683
z8ycMqPNL@JqVYca@Y7MF;#xEslr~2e_+Aj3qE=-jDwVQS=KPYWa*-GI#-$P!cN8o2
zXI($W%Mf$i3?VGWCRR@<yspQm^0S`c4}_;)BwKN&tA#dGlMQGhLK$hh83(n;?AP#B
z73Fa*-NuD@Lu*j77w6Dx?ZpBD9LxD_WXCvhePmw8wTM%1`pc|?26x(DC}z;8&eDRI
z5h?pomJ~B0Wj|DKd&0Pw`6L?Vqr!zUAEnh;xG;j#bDagEls@Pz%G}A!w<a;0_)F%W
zE#g=;K~!Cj<6)2!r!pI5F)p59Sd2QJq7owzvlzofZtKbyRTSxRi`!JbI@R^EVJ4Ri
zH4|H0DEh92D22!^;6_zT*MKGS5UX*}_T~ht2snl}TjIizN@kp(`1R-D&CbaBKvJ{{
zzUA($O|TckN>k&=4>1ZSD8nd*@E~_#KpBP7zb&SJJ%x)g1Z^XHDeqARz7<QZ1Hg|=
zgJBx1PwjCCKh5vS7k5NH-?m!NvDIVKRs5=B(z>aumn(`)S#DLaYCCP%HS)q8yoYX|
z@9B{JzK%fH7Z<a5L_2}N*v>!oC$ngWoG(t$dorJIFh=%{r2h&fQGTdoB5b4HongeU
z^9fyuS~Hc$#msInL%Nul5l7UimD#NYpZvnfm0{>IPm8Y$39`2s^u>DmbErhz7s(4B
z7YWz!;+<q!TV=<|b&E&(JmiuEXAdpcjw!&Q<!%7~G<iQ-%4_aHOC8dwBGp(7D;S5?
zFL?NVK6YJ=aa>bu{a{?s6FfaVUF*I-c&m_YI3FZ1VTc3<9nS@7$|YhFxEH+P&=>H#
z{;0kZ@ld3u?*<`$NpGynt-AxG_W>SaaHkA)H|1$DCxbZUBv4TVl+WFcKEvm#`?JJD
zw79=q?4hmwCkK<hikWv~QCyXc$0h)RWaHekv07DUh1gMrI4d1m-@$HMIN%WbBU1)e
zi%L~`t00KwC+rOFG{b(>yG<+uDeuX}UW2a+A8yD^{Viys0N_{VeUz}VC<hwTEncpJ
zj3WKtG2>DC*~UqnVpeY~5sy<|*cmyvaf8S{fZbHkH=-pTE2#Ravh?lT)Dvs`e3KXG
zmpiEMs`;a{Fg%~Pi_6viC0ww(P4W=N4$$$diYgFib!(fyeU0?#c7B=i%3!<nbjM}v
zA*mb%q>;df$Gjj>MzH5U0RE8zo5u(v+&;27R~)AoH<ySP=)~qPYWHKp<A4_!ICngP
z_CEn!ysw_v?F)AVy&=_*>^Jft#`Xc?DWHU(x+TN;C8C~D2-!{P{4bTSibe_fQe?%J
z^<vY@;FZ^E@@;@J22|-tXM-jHf$E8&pbYb6sPRgwEJfwddO3}USzD;{B<_fe5?&dJ
zs^rv0>P1%Bj4B@!4MV;{(9*3TkwLF-EfTLrPHx>VrWyhQ@h2E`YZt;oNM4AZ1%RRY
zMc)BBa$Qk|QgV`^<W?VRIqCY=4y9$d18X_7)~Lt~E$0|KKZOmR20R0Ry5+L~71@`d
z^&B7wPzBoesyi;%Cd*eaU!~dDk^TjJ{|b15a&B-d6Ud^*8*;>GTE)+AMQ*zxP1L-K
z8E-M1^>~Ovsb8uc*5UD=0WSj7hKiD+IcQ=qfUj>%7c<FzQ$G6&-U|O8v+Ty3^M)@<
zI9G;!Cvx!Sx$)VxK9*evLxUq~H|LvT-@(C+6idz9a*7w>S;aaP`p%J42ZP|=ZN<Vv
zhqo1oBb-ie5F?_gRNjd_c7v6>(7KyO{i5n@g5j(>S!LC<M!qHVw1iqa0{-ydWLAk>
zGbFQqZ_>GzZoFmT|18H&8IoasB*)6LX@+fL)vYCB4c)?cH1fo)iJ~*=^YvWMDL4Mf
z>fpEFe}l+kT2P60h!%`2bn(t&t}<5cbc$n<{+*E@R6y;n32_R@RvG?X5Q+tXduabI
z=PV<0I}G-fZ6MCbX2AK@EvS|+jRovvqgr-hi#5^1sdIsv@ve<xO;1+^%-rZrO1!kM
zzP*I+tM}aQ&U>Gyw%T|%{6WlU&*5L_1U2{NL`-*FE0!O@&PM^UM5)rrG4x>z4hb<)
zyLh>}6)-rA@f9lqluxu^^$B$=`{kXP;t(a@HC=o-)Gy<z-v0}~1hcj{W`D#xg+^br
z52J+-Bv9pUm-s~a=tn=%N5`DYM3@o2IAlCu$F!6E-b-fuhF8Ru20ytD{$Kd$f8JXv
zzEVE=airqD7LgqFQTaAKcz?;1SeR606at|;b*QYBt*TsMI3k_7f2MlDbg-05L^`c~
zpg?>Y+4X?Me38oNM|hh?sY*B%2A^OF7`Gn>&{FY{%ZX@weB;4saqej~WKZP}<!*}B
z-7%5t*aF!&PNrOGy@UO*z1ww8@2%i*e4Ys#cjIN)1(%n`$DCNUUWq<+3QpOxjEi04
zI$HUE8`ga{aqDSa!B$T=DET`W*K?t`3?D`nUlof54kH#QS-L>z2Lpn955zv9D<7Vo
zj8A9r@!(evmx?kv_VA_RWTa%z3Ss>K6MjcG{{8Hh?{)r9G3O-1cXX^6QEvc>FU8W2
zM}X|iV0TZKuWuo$SYIgQ4+gHJ#z$t0Khds74iCG@$^nUg{Fw+JYvd{F-FuuP*s@29
z>Qy=S7iL2@xB0`mEi65OP+L%T#|0~>%eXd8<U3F#NR9!qQYgLvdi~LwyuBc&zIgF%
zj!jraI{ukTh5VK-*!STCRSA8IlP4J^qdIEbE|828gi-&f%gqJ*lZGF{#)z``@RVw{
z@$Dmja5_VQ{nbmhg2(NETLE_g4gsD4{F*^e2(0PuS!Y};97gYFfKz~P02=6802u%W
zU=$@CDA=SdvIfru47!zN<PCPKFA!4Q`A@{yJU|1W39uZ{3g`v&1HOAhC$Hi@uEKqu
zmY@vCTQFlYAbJ@iZ^L8uF5n%2`v4CC_R!M@M*5yY^SiGX@+I^|%T4(<9##5`+2S$u
vA@j%&0UrTA21Kn={t1s?08Y`2gLxUS5ecoyGk~!w@$?^@EEdrIgV+2wq*)$J

delta 8256
zcmc&(d3aP+mVdV@Rh6ocJs}B6NQD@o$igN<00m>P1u;YxgJ=kqRFM=(s=}>8LI{GO
zwn8`CaY3LzwH*;bTH%fD=%=){dy6dX+D}_(rDtRmMeRk~?$0^D`zom-gEsabGlBYj
z_ub{(<^0a_?)&yb%l;26u1^OfyY4;ux4CPw_qm)&B2mvudMa~LkESiwG@#Jdngt6A
zH(Rv4yxAU6>#-X4a6|@zEru-|@ydu{^#^?(VOTmSqv%z!hTbn4EjH-s?z@E8Ko1pH
zigol(aX@U=tB3qU2%;(Hg$wT1G_5&-0e@Ni%jKVyf8pn?bojiLCFRy0ySCVhm8=z7
zO+&tQrBjPIv>vDCOVHL@w2cWADjkzJ(bAK|&0}bHsVsF^eFL<fG_B39NiHX8J*iro
zLzR>J%5=&3*$EEo5*N)szhYF}knl&sm-+*SA3eW9IOyB+N929)Txpog+GEjtnOd7g
zlP+o>T7jizozq-}YWR}3HU3}k%hnbrYd_YsMMCop)O<O#VOVZLuI9_8-NOnK2P-xC
z2GQYR`68b_9yThkK<i1wafyhg`3iBC!e`ZHdx~hRXCHm-=@ze2WcVhrk!)qZ5f9Rl
zvO=+tJ}sM@=C%wG*}`op79}ECNL_C%Und4`VI9rsw%3G${z{j}AqViYBucN#&Ay$B
z9;cDi;cajCM;bf5kyd#Jl~=lR37t|HjHF1Y%^z%({uY0i+)A4-&ZK3Pnc_oQUzsl+
z*B`9>FN@eh8%O0aPtw<9CMua}^55zCQLS86F}hQjqxOx?7GKdzqf5kd^zrB;>23>i
z$Ieq|!etq3k+13p$CO&~pXZ8NBQe|-=#-&lVJY}oz5v)p11>BPJ80yEVeu0E+l5oa
z6ZFRmpA<vs-f@$~N&Wb^Q6lXadcFmi12{s(RqnKhP@D=_1bB(As&Z$)hhi~<VGa9R
z*e1Fn@)+GxRW4?dUR7-WoST|$RQGTeeOmQNqY@069=myjuVOqp$&&y`Hf$Y!+2Z#&
z<tM0m1@Jr6r*(M45x?y4vKe+Q3bgy>6KMMl;56VPs+w4yzYs@~#?PA7Wf>p_h*TRN
zjhzqYD_T9VYn(cU7t#L<z?XnO0o1w@%yp?P9!2e+89Y|`R~j*CY*rU!CvpvrJKM@K
zztLzzW3Ikw(i~woCDVUj)HL-g4E+Y+V_=uj>h0{?k{k7Z0rYK7ZN+db4g0GD!La-q
zjeljH3}wgi#Knb{Z#Al{&ZW~A@1_5&P95veg6yPh8ryoCRg)*Q9tSLhA6wW$5;dQT
zDKA#4Ztx8@EytHkf0$h4PSM!d*9wa=j5M^sN|pq)`UqP}x<-*pXOWyzsVEf38yr}z
zyqeKSpckfiEPvPN^ptDtFJYxG)67dedDxNRVBgr{kA$Tg4VMFWAD4v0og}KuM3P=n
zeMo$_aH-M2nlTW5MNR!GQ#fP@X0WsXtbhc74KN6;>=`s!2*7E3T;}20(e6gAxeSyX
zs7qup(gHqzFcMf4@JrLLODEb~fZ^ymT%+Qt<wB?0sUDG{cTJsW$u~_1t;)jWSQK$6
zat9sy(X1rJ7`3j~sqFInY-KZQVajkC(en+u_VT*obgaP@33UdV>qF8PmdDXH9dH8h
zJ{`P#f;dc$Y2!p5O`dkxwCSi3qsLnSmV&uG(?@WKxn_Dp=0ExvYRMPFMXE&>>#s~t
z5f|-1&shK`gU2R|u>vRSs<B0emUhb5(UyhnXEJz_%*6~w#R<xqF|n`+1Co~UYTNz5
zu>1f+b^`vWeA=oRX~IUEW)!nK*+0W8c2d@rvy-zpz=e5Pnr4yZdgMxn#a;%f+v&!c
z)9pi197IQFjuGW#tE(zfGb+ngr{RY93UnF=I6)0{i^U*%xvorHs{g4jC`1i4&zff6
zjea$>e^#BifUL7iL@n*`<<XSc`wC25@#v_ol-7>$;~e=GCC@1s8dKZHT+5+8Wl?}x
zp%il&9E~DSI@Qe?l05^vad9`zn!$l0PZV0@RrKRIhllhzmRe7u=|wWRSJYdg^fGnL
zU6y@yp8!jW$Pl@f42ztnCtRH+nqm@6#eyWJp7XhM856s|M2LMGa22{L70yPfkil?<
zS-F9r&)>xhnuF>(>aHIq(&_&C%A$o>M;4FOtQnSE;bdRwwxo$%kp|W*&`;MtEW{Z4
z*)>Jt5Iud(*fN~IC*9l`IBg_!wud5e0;WpfsX7gNS0vEf7M2%L;rvNC7h}c|+>vK*
zKzT0UTI!silcaR%<~CVPTjo~?Nzcq5%F*_t`4@=xdFZzQfMf8a7^&Xt{nFdwukl8_
zJW*KIp%EN0(oBDIsW;+p3CVyzEa#)G6R-pjn@OF3T0=b=t_3u(PP1sobt&Rj+H+mL
zSV}Kl_hhw67<KyG3Bl@Y=dKJ8NA5{57o-d~N|jT=Tza=5lOyeC4cX#)vMqS8HBP-s
zpGwJ`Oq@-)QYwTSHa0cC<Zh-0c|k@p`;BQ4e@A$BYiL=}bQo&h>&!iwlRIf~qbu)0
zs2NF!7FBZ|lP=97m+L=m+$xe9F=8RWOC!C7tHz*sh<i&PN`8Q{mnM{&0d;8Qg4V)6
z^(Q&%Xx;Y6w8p0872S@C6%$5|@vZDmsO${3C<S;jVziL~2mx#4MSyld2Y_?$WKFiA
z7y<<8=!3ZvDtQVS<C4?ll2VRRraJYr>ARW^bmh}!P3ht>n%6W`bnB~|nncqqh+YVx
z*Q2x?(9PgUm0^_KXzC-FjG(Ry&~J_yosMIKT83Ix1J(G7#X(x+8!J|@6FN1^<P*y{
zivvRCOm6DUh+<^u@0H9&Mfr80)lJ{{T{(Y&dXbH478ffv>X3T$f<@1Y3;GCW#wevn
z6)?Z#{+zKzd~s34q~uIWZygvHcoh$Ou>E?Bv2zNxywk6uuwe@XBNtvEo4G-*M3>np
z!S$H2`5J7?72~aGn##3f7lxBF=Go05$wrHVGcx$aD(Le{AMI?-5e;;pb-2*zWNVpN
zt7ivpfj8Q*c&OcEBE7P>Sd62O7oT4g<C3D0A{28eN+rdjktztsSmZESq<n>9ky38V
zS5#A1+i2mYJ#8f!8(CCyA}<(!N&Iu^$J<7V$^25GNi=e-RLoJNadQWgG-{)2hM4de
zX&Pu|$4F5{t2;bmlL}QSKY7J_PE2ved5kkjqLd_Wz`|6Xv<j=Uo<W>b!Lpor3B9E0
zjbR*y<Ys+Ds93D(BaNBvnbzOO8k4KZaVv-=X^JaZm>gW`w+y)%4T>v@7tR<ji~*d9
z!FR)*6$H&xNb!6%#;gHEE$dXUm<B9yiDt@KQYc7&dgBs7JLT|H<;SM+w1%a98K)M<
zWQAS%vUN1_o&vF+MufBT6mwXbb`lOT=Dt#?Av|52pk3iYaWkD<RzM$wD@K70wVpK7
zbA6xF>h)Q0S9E%dZG4Pp(a7iK=}iA|93{oGB`h8<hMWP0XLy_Z?HmdZbe-pH4+Q-P
ztV`+Fky5c;|0r@o*w*r}4RrMS$)b_cmksOP+T>-!iq_+J?gH?SlR=c!Mz^BWyS1as
zGHrKr#dhA_v>J13me`1BTsLgq&d&Db-|eE6Up8&4ceiD+-A1j-jddu_A4f^;b{iIh
z4VLr0$j4o!;yyin`E8=<9<J}UsrTwyPD<5#bp}Qe25bl1$KV;z-^7%4#S9ClDMw?V
znp4@?7SuJ+-@Av3d+EFt72<xHwc_(=bXPRrh5>PjG#*tt4`(>%vR<F8-UbUGO*uC~
zH}|w=KL;aHi6DDk<l1aP59yYbOT+|~Yb$36VJFealVa{oy~xBukg}FcNRGZ6?7bm3
z`5i0+#>DTyyXdz!lngw<4@QEI(;qV{{{wnFNJ*<kbDX(&RjGJH_pQ29WbcNge&Koy
zrQSF+bse`Dj+y{mY`0ubQ*WF;Bo)1<v#7X89hk?Ny2m1QwC$#SH<pZtdo}Ef0xe6W
z-*E9;lTU_Pq_+d{I!gV1fm}IT<#*$fK!^MV`acTTM-$f!6V>EfQzqW0O>0WUQ}oL<
zU*<f7#zz28F>vg70Ocp?nYH=Zdr;g3h?jGEZAso2(1e-dxGbssFPU#{1`GKsz5J#X
zV$kzAEFP!H*8qL~O|GP8Z_c?=y-O+%s0dgq$8j${`4|;r>Q^34g*r7#vEg-ecbN2?
zOQp9&gp0P_QX*c|58Uz#F~!_3mc5_Bu(!k1!*V!krUUw`3w660x(kuwxvA*JfteHB
zavfJ0iKYUZ{j9+^Z*6W>8a&PmU=4<^H*+mhg9FX=KL%;P1UwFaD&>Cwl!u;;(i4Ei
z0F`-trz+#@JW0NQ{wi?B_V+C6eht`9AKvC+^(L<yE(&P;I*&N0H?2z%H7}y|AVaS`
z<4mNRHz3vM2p*pX8~`X`Y8EwLlsd@RUtXWW!T;^`h2pjU!`M0gr<FM~ZD)&}hiF77
zPcPeEac=Q5>I^cmamB844Wfr|EO9~9NICP>NY<60#}>eGNI!cR^=}*}uBD%CED}dJ
zgx)3w^@UG)JBDt@&^u7NlNN5C+#7B<l8!;RF;eF6MW44R+}s)rMEFiF#$j&NBqGoM
zj;_5SXf@Susr%QX>&RHV`6tm;MuX0zt@$*C3Pp&H-CHVN*KZ<6-(}tA+XvR3<Kgpb
z>|q>99YCEh+;aaDZlFKkK8~}9iaYYe8~Ts#(7*R^v+f!hcevRqmLG-u*okhX6?f%L
zF(Y=SY1N7a@H=t=;9PT)vHg<ZBr^$iLq=!c&eWz<%vLcqor<?++xt#(yts>&ZuMlm
z&HbBklY~FGN|*lV)(1qx5y*KRFblAdTg;oeH}Hs2Z2GZdbff+IL%BNX28=$TE}34~
zmLc}jJKL(nyZ=u{7q|YXq48&&{ya#>iW}&&9pl9@WohsIKua5THZri#e`a0@^!7bd
zC;gUt#)b8NDUbc(me#&2pPspQu=rA0+V6GSeN7@M7I0r9`Sa4TF>_Pd54@Qtx&N6e
z7OHSydN4Zp^NaYE@UtDcG~|AFvD408cp#7DOMZUs9n9$ByEy)rz@LCD@<YA({sij<
zDy*9~eJUqYer-QAqpmm*W=)5efOCyHyzM*3#o6Slm>Q7G8KTRu1)Fe5A7`R0C+6?o
zgj7W4xbx?Xs&7V*`>a+x9@6<MO2oYE9s((R9l0R-(IpnT7h<hyXDWVYqNB2G<p;cY
zJTr(JDea+VZzLr7;}qv@;kalYLKZ(c=20_Em!J;bjd{M?FJPXx0LN+AuBxOHC`R2x
z`>rZcOgnd7AwJQ+*wrZPzeA%z^}8==`c4P&A^Lp6a0VIFB9wzrYf{UJIj2~{-``6l
z@5vN%sd&#N;uJOSIg)-e2*W@Aa>LhOLViv)51-)MInUmbT9ws(!804yMS+Onj7V=V
zyeK3);(QpiV_tPS!hDt-^EgF}WkK<|&$hia;sH9gH&1*`U++CWQYAfKV`GVCmZ)|T
zHv$`1`!rJ>az2uuvmfhK_c~1NWUEsQuoMdUH=6iJ^}L&~mJNW-fV%-Y;Bmln3{26b
z9i7X~O91ta?IfB$2YdxkpQgS=DG9Tu0P^U~M~YS{o=wK{AO^$EXW<KVsILW5-PM<)
z?NUH3;3~j8zyg2-ECrl-yCzp~o#6=ky-gt*ls`cab*;1(rHz11fb7jc0`$H7$@}Tn
zM+^Lqq4LZ(1o<n}^<|avH9V^57^B%6s6*h9?*iTf{2tIpIC%<>p8`H7=f3<Ta6v+2
QeBdMYT_L8?ihVcz51*it)Bpeg

diff --git a/tmw.py b/tmw.py
index 732cbc7..feaf48f 100644
--- a/tmw.py
+++ b/tmw.py
@@ -349,11 +349,16 @@ def perform_multipleSubs(substitutionsFile, text):
     with open(substitutionsFile, "r") as subsFile: 
         subs = csv.reader(subsFile)
         subsDict = {rows[0]:rows[1] for rows in subs}
-        #print(subsDict)
+        for key, value in subsDict.items(): 
+            text = re.sub(key, value, text)
+            #print(text)
+        return text
+
         ## Create a regular expression  from the dictionary keys
-        regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys())))
+        #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys())))
         ## For each match, look-up corresponding value in dictionary
-        return regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text) 
+        #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text)
+        #print(result)
 
 def pretokenize(inpath, substitutionsFile, outfolder):
     """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional."""
@@ -363,6 +368,8 @@ def pretokenize(inpath, substitutionsFile, outfolder):
             text = text.read()
             text = perform_multipleSubs(substitutionsFile, text)
             basename = os.path.basename(file)
+            if "truc" in text or "type" in text or "flic" in text: 
+                print("Found bad word in", basename)
             cleanfilename = basename
             if not os.path.exists(outfolder):
                 os.makedirs(outfolder)
@@ -455,6 +462,12 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors):
                             lemmata.append(token.lower())
                         elif "NC" in pos and "|" not in lemma and "<unknown>" not in lemma:
                             lemmata.append(lemma.lower())
+                    elif mode == "enNV":
+                        if "NN" in pos or "VB" in pos and "|" not in lemma and "<unknown>" not in lemma:
+                            lemmata.append(lemma.lower())
+                    elif mode == "enN":
+                        if "NN" in pos and "|" not in lemma and "<unknown>" not in lemma:
+                            lemmata.append(lemma.lower())
             ## Continue with list of lemmata, but remove undesired leftover words         
             lemmata = ' '.join([word for word in lemmata if word not in stoplist])
             lemmata = re.sub("[ ]{1,4}"," ", lemmata)
@@ -468,19 +481,67 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors):
 
 
 
+
+
+#################################
+# substitute                    #
+#################################
+
+import csv
+
+def multipleSubs(substitutionsFile, text):
+    """Search and replace from a table of string pairs."""
+    ## With code from http://stackoverflow.com/users/735204/emmett-j-butler
+    ## Load table and turn into dict
+    with open(substitutionsFile, "r") as subsFile: 
+        subs = csv.reader(subsFile)
+        subsDict = {rows[0]:rows[1] for rows in subs}
+        for key, value in subsDict.items(): 
+            text = re.sub(key, value, text)
+            #print(text)
+        return text
+
+        ## Create a regular expression  from the dictionary keys
+        #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys())))
+        ## For each match, look-up corresponding value in dictionary
+        #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text)
+        #print(result)
+
+def substitute(inpath, substitutionsFile, outfolder):
+    """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional."""
+    print("\nLaunched substitute.")
+    for file in glob.glob(inpath):
+        with open(file,"r") as text:
+            text = text.read()
+            text = multipleSubs(substitutionsFile, text)
+            basename = os.path.basename(file)
+            counter = 0
+            if " truc " in text or " type " in text or " flic " in text: 
+                counter +=1
+            print(counter)
+            cleanfilename = basename
+            if not os.path.exists(outfolder):
+                os.makedirs(outfolder)
+        with open(os.path.join(outfolder, cleanfilename),"w") as output:
+            output.write(text)
+    print("Done.")
+
+
+
+
+
+
 ##################################################################
 ### TOPIC MODELLING WITH MALLET                                ###
 ##################################################################
 
-# TODO: Concatenate two stoplists first, one for errors, one for deliberate ommissions.
-
 
 #################################
 # call_mallet_import            #
 #################################
 
 
-def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_project):
+def call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project):
     """Function to import text data into Mallet."""
     print("\nLaunched call_mallet_import.")    
     import subprocess
@@ -861,12 +922,21 @@ def get_targetItems(average, targetCategory):
         #print(targetItems)
         return(targetItems)    
      
-def get_dataToPlot(average, firstWordsFile, topTopicsShown, item):
+def get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item):
     """From average topic score data, select data to be plotted."""
     #print("  Getting dataToPlot.")
     with open(average, "r") as infile:
         ## Read the average topic score data
         allData = pd.DataFrame.from_csv(infile, sep=",")
+        if mode == "normalized": # mean normalization
+            colmeans = allData.mean(axis=0)
+            allData = allData / colmeans
+        elif mode == "zscores": # zscore transformation
+            colmeans = allData.mean(axis=0) # mean for each topic
+            allstd = allData.stack().std() #std for entire df
+            allData = (allData - colmeans) / allstd # = zscore transf.
+        elif mode == "absolute": # absolute values
+            allData = allData
         allData = allData.T
         ## Add top topic words to table for display later
         firstWords = get_firstWords(firstWordsFile)
@@ -879,15 +949,19 @@ def get_dataToPlot(average, firstWordsFile, topTopicsShown, item):
         #print(dataToPlot)         
         return dataToPlot
 
-def create_barchart_topTopics(dataToPlot, targetCategory, item, 
+def create_barchart_topTopics(dataToPlot, targetCategory, mode, item, 
                               fontscale, height, dpi, outfolder):
     """Function to make a topTopics barchart."""
     print("  Creating plot for: "+str(item))
     ## Doing the plotting.
     dataToPlot.plot(kind="bar", legend=None) 
     plt.setp(plt.xticks()[1], rotation=90, fontsize = 11)   
-    plt.title("Top-Topics für: "+str(item), fontsize=15)
-    plt.ylabel("Scores", fontsize=13)
+    if mode == "normalized": 
+        plt.title("Top-distinctive Topics für: "+str(item), fontsize=15)
+        plt.ylabel("normalized scores", fontsize=13)
+    elif mode == "absolute":
+        plt.title("Top-wichtigste Topics für: "+str(item), fontsize=15)
+        plt.ylabel("absolute scores", fontsize=13)
     plt.xlabel("Topics", fontsize=13)
     plt.tight_layout() 
     if height != 0:
@@ -897,12 +971,12 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item,
     outfolder = outfolder+targetCategory+"/"
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = outfolder+"tT_"+str(item)+".png"
+    figure_filename = outfolder+"tT_"+mode+"-"+str(item)+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
 def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, 
-                   targetCategories, topTopicsShown, fontscale, 
+                   targetCategories, mode, topTopicsShown, fontscale, 
                    height, dpi, outfolder): 
     """For each item in a category, plot the top n topics as a barchart."""
     print("Launched plot_topTopics.")
@@ -911,8 +985,8 @@ def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics,
             if targetCategory in average:
                 targetItems = get_targetItems(average, targetCategory)
                 for item in targetItems:
-                    dataToPlot = get_dataToPlot(average, firstWordsFile, topTopicsShown, item)
-                    create_barchart_topTopics(dataToPlot, targetCategory, item, fontscale, height, dpi, outfolder)
+                    dataToPlot = get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item)
+                    create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder)
     print("Done.")
 
 
@@ -1034,6 +1108,30 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown,
         allScores.index = allScores.index.astype(np.int64)        
         allScores = pd.concat([allScores, firstWords], axis=1, join="inner")
         #print(allScores)
+        ## Remove undesired columns: subsubgenre
+        #allScores = allScores.drop("adventure", axis=1)
+        #allScores = allScores.drop("autobiographical", axis=1)
+        #allScores = allScores.drop("blanche", axis=1)
+        #allScores = allScores.drop("education", axis=1)
+        #allScores = allScores.drop("fantastic", axis=1)
+        #allScores = allScores.drop("fantastique", axis=1)
+        #allScores = allScores.drop("historical", axis=1)
+        #allScores = allScores.drop("n.av.", axis=1)
+        #allScores = allScores.drop("nouveau-roman", axis=1)
+        #allScores = allScores.drop("sciencefiction", axis=1)
+        #allScores = allScores.drop("social", axis=1)
+        #allScores = allScores.drop("other", axis=1)
+        #allScores = allScores.drop("espionnage", axis=1)
+        #allScores = allScores.drop("thriller", axis=1)
+        #allScores = allScores.drop("neopolar", axis=1)
+        ## Remove undesired columns: protagonist-policier
+        #allScores = allScores.drop("crminal", axis=1)
+        #allScores = allScores.drop("mixed", axis=1)
+        #allScores = allScores.drop("witness", axis=1)
+        #allScores = allScores.drop("criminel", axis=1)
+        #allScores = allScores.drop("detection", axis=1)
+        #allScores = allScores.drop("victime", axis=1)
+        #allScores = allScores.drop("n.av.", axis=1)
         ## Sort by standard deviation
         standardDeviations = allScores.std(axis=1)
         standardDeviations.name = "std"
@@ -1065,14 +1163,14 @@ def create_distinctiveness_heatmap(dataToPlot,
     # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd
     plt.title("Verteilung der Topic Scores", fontsize=20)
     plt.xlabel(targetCategory, fontsize=16)
-    plt.ylabel("Top topics (stdev)", fontsize=16)
-    plt.setp(plt.xticks()[1], rotation=90, fontsize = 12)   
+    plt.ylabel("Top topics (stdev)", fontsize=14)
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 14)   
     plt.tight_layout() 
 
     ## Saving the plot to disk.
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".png"
+    figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".jpg"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
@@ -1302,6 +1400,75 @@ def build_itemScoreMatrix(averageDatasets, targetCategory,
             itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False)
             itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1]
             itemScoreMatrix = itemScoreMatrix.T
+            itemScoreMatrix = itemScoreMatrix.drop("Allais", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Audoux", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Barbara", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Barjavel", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Beckett", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Bernanos", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Bosco", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Bourget", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Butor", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Camus", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Carco", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Celine", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Colette", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Darien", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Daudet", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Delly", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Dombre", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Duras", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("ErckChat", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("FevalPP", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("MduGard", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Mirbeau", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Ohnet", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Perec", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Proust", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Queneau", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Rodenbach", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Rolland", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Roussel", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("SaintExupery", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Sand", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Aimard", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("AimardAuriac", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Balzac", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Bon", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Echenoz", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Flaubert", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Fleuriot", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("France", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Galopin", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Gary", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("GaryAjar", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("GaryBogat", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("GarySinibaldi", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Gautier", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Giono", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Gouraud", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Huysmans", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Hugo", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("LeClezio", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Loti", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Malot", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Mary", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Maupassant", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Modiano", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("RobbeGrillet", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Stolz", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Sue", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Tournier", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Verne", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Vian", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("VianSullivan", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Zola", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Malraux", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Simon", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("LeRouge", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("LeRougeGuitton", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Toussaint", axis=0)
+            itemScoreMatrix = itemScoreMatrix.drop("Khadra", axis=0)
             #print(itemScoreMatrix)
             return itemScoreMatrix
 
@@ -1315,10 +1482,10 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric,
     ## Plot the distance matrix as a dendrogram
     plt.figure(figsize=figsize) # TODO: this could be a a parameter.
     itemLabels = itemScoreMatrix.index.values
-    sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="right")
+    sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="top")
 
     ## Format items labels to x-axis tick labels
-    plt.setp(plt.xticks()[1], rotation=90, fontsize = 10)
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 14)
     plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20)
     plt.ylabel("Distance", fontsize=16)
     plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16)
@@ -1328,7 +1495,7 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric,
     print("- saving image file.")
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".svg"
+    figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+sortingCriterium+"-"+str(topicsPerItem)+"topics"+".jpg"
     plt.savefig(outfolder + figure_filename, dpi=600)
     plt.close()
     
diff --git a/tmw_config.py b/tmw_config.py
index e31bfc7..bc557bf 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -31,13 +31,13 @@
 
 ### The following settings depend on the system used.
 ### Path to the working directory.
-wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash.
+wdir = "/home/" # end with slash.
 ### Path to the TreeTagger file (language-dependent!)
-tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french"
+tagger = "/home/[USER]/Programs/TreeTagger/cmd/tree-tagger-french"
 ### Path to Mallet installation directory
-mallet_path = "/home/christof/Programs/Mallet/bin/mallet"
+mallet_path = "/home/[USER]/Programs/Mallet/bin/mallet"
 ### Path to the font for wordle generation
-font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf"
+font_path = "/home/[USER]/.fonts/AlegreyaSans-Regular.otf"
 
 import tmw
 #print(help(topmod))
@@ -58,7 +58,7 @@
 ### Split entire texts into smaller segments.
 inpath = wdir + "1_txt/*.txt"
 outfolder = wdir + "2_segs/"
-target = 2000
+target = 600
 sizetolerancefactor = 1.1
 preserveparagraphs = True
 #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs)
@@ -67,7 +67,7 @@
 ### Assign each segment to one bin over textual progression.
 inpath = wdir + "2_segs/*.txt"
 outfolder = wdir + "7_aggregates/"
-binsnb = 5 # number of bins
+binsnb = 3 # number of bins
 #tmw.segments_to_bins(inpath,outfolder, binsnb)
 
 ### pretokenize
@@ -79,7 +79,7 @@
 
 ### call_treetagger
 ### Perform lemmatization and POS tagging.
-infolder = wdir + "3_tokens/"
+infolder = wdir + "2_segs/"
 outfolder = wdir + "4_tagged/"
 tagger = tagger
 #tmw.call_treetagger(infolder, outfolder, tagger) 
@@ -89,9 +89,15 @@
 inpath = wdir + "4_tagged/*.trt"
 outfolder = wdir + "5_lemmata/"
 mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs 
-stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # in tmw folder
+stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # wdir
 #tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors)
 
+### substitute
+### Apply the substitutions listed in substitutionsFile to the lemmatized texts.
+inpath = wdir + "5_lemmata/*.txt"
+outfolder = wdir + "5_substituted/"
+substitutionsFile = wdir+"extras/fr_argot-substitutions.csv"
+#tmw.substitute(inpath, substitutionsFile, outfolder)
 
 
 ################################
@@ -101,7 +107,7 @@
 ### call_mallet_import
 ### Imports text data into the Mallet corpus format.
 mallet_path = mallet_path
-infolder = wdir + "5_lemmata/"
+infolder = wdir + "5_substituted/"
 outfolder = wdir + "6_mallet/" 
 outfile = outfolder + "corpus.mallet"
 stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder
@@ -112,9 +118,9 @@
 mallet_path = mallet_path
 inputfile = wdir + "6_mallet/corpus.mallet"
 outfolder = wdir + "6_mallet/"
-numOfTopics = "50" # string
+numOfTopics = "250" # string
 optimize_interval = "100" # string
-num_iterations = "1000" # string
+num_iterations = "5000" # string
 num_top_words = "100" # string
 doc_topics_max = numOfTopics
 num_threads = "4" # string
@@ -129,21 +135,22 @@
 ### create_mastermatrix
 ### Creates a matrix with all information (metadata and topic scores for 
 ### each segment) in one place.
-corpuspath = wdir+"/2_segs/*.txt"
+corpuspath = wdir+"2_segs/*.txt"
 outfolder = wdir+"7_aggregates/"
 mastermatrixfile = "mastermatrix.csv"
-metadatafile = wdir+"/metadata.csv"
-topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv"
+metadatafile = wdir+"metadata.csv"
+topics_in_texts = wdir+"6_mallet/topics-in-texts.csv"
 numOfTopics = int(numOfTopics)
 useBins = True # True|False
 binDataFile = wdir+"7_aggregates/segs-and-bins.csv"
-#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile)
+###tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile)
 
 ### calculate_averageTopicScores
 ### Based on the mastermatrix, calculates various average topic score datasets.
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
-targets = ["author", "subgenre", "binID", "decade"] 
+targets = ["segmentID"] 
+#targets = ["subgenre", "author-name", "subsubgenre","decade", "narration", "setting", "author-gender", "title", "protagonist-policier"] 
 #targets = ["author", "author-gender", "title", "decade", "subgenre", 
 #           "idno", "segmentID", "narration", "protagonist-policier", "binID"] 
 #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder)
@@ -153,7 +160,7 @@
 mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv"
 outfolder = wdir+"7_aggregates/"
 targets = ["decade", "binID"] # 2 targets to combine
-tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
+#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder)
 
 ### save_firstWords
 ### Saves the first words of each topic to a separate file.
@@ -172,11 +179,11 @@
 ### Creates a wordle for each topic.
 word_weights_file = wdir+"6_mallet/" + "word-weights.txt"
 numOfTopics = numOfTopics
-words = 40
+words = 30
 outfolder = wdir+"8_visuals/wordles/"
 font_path = font_path
 dpi = 300
-tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
+#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
 
 ### crop_images
 ### Optional. Crops the wordle image files.
@@ -192,14 +199,15 @@
 ### For each item from a category, creates a barchart of the top topics.
 averageDatasets = wdir+"7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
-targetCategories = ["author", "subgenre", "binID"] 
-topTopicsShown = 30 
+targetCategories = ["title"]
+topTopicsShown = 16 
 numOfTopics = numOfTopics 
 fontscale = 1.0
 height = 0 # 0=automatic and variable
 dpi = 300
+mode = "normalized" #normalized|zscores|absolute
 outfolder = wdir+"/8_visuals/topTopics/"
-#tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder)
+tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder)
 
 ### plot_topItems ###
 ### For each topic, creates a barchart with top items from a category. 
@@ -207,8 +215,8 @@
 outfolder = wdir+"8_visuals/topItems/"
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 numOfTopics = numOfTopics # must be actual number of topics modeled. 
-targetCategories = ["author", "subgenre", "binID"] 
-topItemsShown = 30 
+targetCategories = ["segmentID"] 
+topItemsShown = 20 
 fontscale = 0.8
 height = 0 # 0=automatic and flexible
 dpi = 300
@@ -225,7 +233,7 @@
 averageDatasets = wdir+"7_aggregates/avg*.csv" 
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/distinctiveness/"
-targetCategories = ["subgenre"] 
+targetCategories = ["protagonist-policier"] 
 numOfTopics = numOfTopics # actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0
@@ -242,7 +250,7 @@
 dpi = 300
 height = 0 # for lineplot; 0=automatic
 mode = "line" # area|line for areaplot or lineplot
-topics = ["25", "44"] # list of one or several topics
+topics = ["190", "6"] # list of one or several topics
 #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics)
 
 ### topicClustering ###
@@ -257,12 +265,12 @@
 
 ### itemClustering ###
 # This function creates a dendrogram of items in a category (authors, titles).
-averageDatasets = wdir+"7_aggregates/avg*author.csv" 
-figsize = (10,80) # width,height
+averageDatasets = wdir+"7_aggregates/avg*.csv" 
+figsize = (15,10) # width,height
 outfolder = wdir+"8_visuals/clustering/"
-topicsPerItem = 40 # can be set
-sortingCriterium = "std" # std|mean
-targetCategories = ["author"] # list
+topicsPerItem = 50 # can be set
+sortingCriterium = "mean" # std|mean
+targetCategories = ["author-name"] # list
 methods=["weighted"] # list
 metrics=["cosine"] # list
 #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium)
@@ -302,7 +310,7 @@
 
 ### 5c show segment
 ## To read a specific segment, better than looking in the folder.
-segmentID = "rf0166§0118" # indicate here, manually
+segmentID = "rf1246§0048" # indicate here, manually
 outfolder = wdir+"/9_sel-segs/"
 #tmw.show_segment(wdir,segmentID, outfolder)
 

From cf3c02cb30c264f85a2dbc618670221bef2df9f5 Mon Sep 17 00:00:00 2001
From: christofs <c.schoech@gmail.com>
Date: Mon, 19 Oct 2015 15:30:57 +0200
Subject: [PATCH 51/56] Add normalization to heatmap. Fix
 https://github.com/cligs/tmw/issues/18.

---
 tmw.py        | 25 ++++++++++++++++++-------
 tmw_config.py |  3 ++-
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/tmw.py b/tmw.py
index feaf48f..34e72f4 100644
--- a/tmw.py
+++ b/tmw.py
@@ -1095,13 +1095,22 @@ def get_heatmap_firstWords(firstWordsFile):
         #print(firstWords)
         return(firstWords)
 
-def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, 
+def get_heatmap_dataToPlot(average, mode, firstWordsFile, topTopicsShown, 
                            numOfTopics):
     """From average topic score data, select data to be plotted."""
     print("- getting dataToPlot...")
     with open(average, "r") as infile:
         ## Read the average topic score data
         allScores = pd.DataFrame.from_csv(infile, sep=",")
+        if mode == "normalized": # mean normalization
+            colmeans = allScores.mean(axis=0)
+            allScores = allScores / colmeans
+        elif mode == "zscores": # zscore transformation
+            colmeans = allScores.mean(axis=0) # mean for each topic
+            allstd = allScores.stack().std() #std for entire df
+            allScores = (allScores - colmeans) / allstd # = zscore transf.
+        elif mode == "absolute": # absolute values
+            allScores = allScores
         allScores = allScores.T
         ## Add top topic words to table for display later
         firstWords = get_heatmap_firstWords(firstWordsFile)
@@ -1154,6 +1163,7 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown,
 def create_distinctiveness_heatmap(dataToPlot, 
                                    topTopicsShown,
                                    targetCategory, 
+                                   mode,
                                    fontscale,
                                    dpi, 
                                    outfolder):
@@ -1163,21 +1173,20 @@ def create_distinctiveness_heatmap(dataToPlot,
     # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd
     plt.title("Verteilung der Topic Scores", fontsize=20)
     plt.xlabel(targetCategory, fontsize=16)
-    plt.ylabel("Top topics (stdev)", fontsize=14)
-    plt.setp(plt.xticks()[1], rotation=90, fontsize = 14)   
+    plt.ylabel("Top topics (stdev)", fontsize=16)
+    plt.setp(plt.xticks()[1], rotation=90, fontsize = 12)   
     plt.tight_layout() 
 
     ## Saving the plot to disk.
     if not os.path.exists(outfolder):
         os.makedirs(outfolder)
-    figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".jpg"
+    figure_filename = outfolder+"dist-heatmap_"+mode+"-by-"+str(targetCategory)+".png"
     plt.savefig(figure_filename, dpi=dpi)
     plt.close()
 
-
-
 def plot_distinctiveness_heatmap(averageDatasets, 
                                  firstWordsFile, 
+                                 mode,
                                  outfolder, 
                                  targetCategories, 
                                  numOfTopics, 
@@ -1190,13 +1199,15 @@ def plot_distinctiveness_heatmap(averageDatasets,
         for targetCategory in targetCategories: 
             if targetCategory in average and targetCategory != "segmentID":
                 print("- working on: "+targetCategory)
-                dataToPlot = get_heatmap_dataToPlot(average, 
+                dataToPlot = get_heatmap_dataToPlot(average,
+                                                    mode,
                                                     firstWordsFile, 
                                                     topTopicsShown,
                                                     numOfTopics)
                 create_distinctiveness_heatmap(dataToPlot, 
                                                topTopicsShown,
                                                targetCategory, 
+                                               mode,
                                                fontscale,
                                                dpi, 
                                                outfolder)
diff --git a/tmw_config.py b/tmw_config.py
index bc557bf..1cc4cc1 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -234,11 +234,12 @@
 firstWordsFile = wdir+"7_aggregates/firstWords.csv"
 outfolder = wdir+"8_visuals/distinctiveness/"
 targetCategories = ["protagonist-policier"] 
+mode = "zscores" #normalized|zscores|absolute
 numOfTopics = numOfTopics # actual number of topics modeled.
 topTopicsShown = 20 
 fontscale = 1.0
 dpi = 300
-#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi)
+#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, mode, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi)
 
 ### plot_topicsOverTime ###
 ###     

From 86a400f048401b933efa32f2272783b0648d0f97 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= <c.schoech@gmail.com>
Date: Wed, 21 Oct 2015 15:42:28 +0200
Subject: [PATCH 52/56] make_wordle: add font_path; Fixes
 https://github.com/cligs/tmw/issues/14

---
 tmw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index 34e72f4..e1e0823 100644
--- a/tmw.py
+++ b/tmw.py
@@ -846,7 +846,7 @@ def get_wordlewords(words, word_weights_file, topic):
         wordlewords = wordlewords + ((word + " ") * score)
     return wordlewords
         
-def get_color_scale(word, font_size, position, orientation, random_state=None):
+def get_color_scale(word, font_size, position, orientation, font_path, random_state=None):
     """ Create color scheme for wordle."""
     return "hsl(245, 58%, 25%)" # Default. Uniform dark blue.
     #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background.

From d6db645490117ea729eb7f5689d8e2065d52c7cb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= <c.schoech@gmail.com>
Date: Wed, 21 Oct 2015 16:41:59 +0200
Subject: [PATCH 53/56] Added function save_topicRanks

---
 tmw.py        | 24 ++++++++++++++++++++++++
 tmw_config.py |  9 ++++++++-
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index e1e0823..3d5ef97 100644
--- a/tmw.py
+++ b/tmw.py
@@ -808,6 +808,30 @@ def save_firstWords(topicWordFile, outfolder, filename):
         print("Done.")
 
 
+#################################
+# save_topicRanks               #
+#################################
+
+def save_topicRanks(topicWordFile, outfolder, filename):
+    """Save a list of topics with their rank by topic score."""
+    print("Launched save_topicRanks.")
+    with open(topicWordFile, "r") as infile:
+        topicRanks = pd.read_csv(infile, sep="\t", header=None)
+        topicRanks = topicRanks.drop(2, axis=1)
+        topicRanks.rename(columns={0:"Number"}, inplace=True)
+        topicRanks.rename(columns={1:"Score"}, inplace=True)
+        #topicRanks.sort(columns=["Score"], ascending=False, inplace=True)
+        topicRanks["Rank"] = topicRanks["Score"].rank(ascending=False)
+        #print(topicRanks.head())
+        ## Saving the file.
+        if not os.path.exists(outfolder):
+            os.makedirs(outfolder)
+        outfile = outfolder + filename
+        with open(outfile, "w") as outfile: 
+            topicRanks.to_csv(outfile)
+        print("Done.")
+
+
 
 ##################################################################
 ###    VISUALIZATION                                           ###
diff --git a/tmw_config.py b/tmw_config.py
index 1cc4cc1..86ab0b5 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -169,6 +169,13 @@
 filename = "firstWords.csv"
 #tmw.save_firstWords(topicWordFile, outfolder, filename)
 
+### save_topicRanks
+### Saves the rank (in the overall scores) of each topic to a separate file.
+topicWordFile = wdir+"6_mallet/topics-with-words.csv"
+outfolder = wdir+"7_aggregates/"
+filename = "topicRanks.csv"
+tmw.save_topicRanks(topicWordFile, outfolder, filename)
+
 
 
 ################################
@@ -324,4 +331,4 @@
 targetCategories = ["subgenre"] # list
 methods=["weighted"] # list
 metrics=["cosine"] # list
-#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)
\ No newline at end of file
+#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)

From d50afaefabe280fbd57b8fd5d85e8dcb9bc5d0b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= <c.schoech@gmail.com>
Date: Wed, 21 Oct 2015 16:44:34 +0200
Subject: [PATCH 54/56] make_wordle: added topic rank info to figure title

---
 tmw.py        | 18 ++++++++++++++----
 tmw_config.py |  5 +++--
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/tmw.py b/tmw.py
index 3d5ef97..ba33d45 100644
--- a/tmw.py
+++ b/tmw.py
@@ -876,21 +876,31 @@ def get_color_scale(word, font_size, position, orientation, font_path, random_st
     #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background.
     #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background
 
+def get_topicRank(topic, topicRanksFile):
+    #print("getting topic rank.")
+    with open(topicRanksFile, "r") as infile:
+        topicRanks = pd.read_csv(infile, sep=",", index_col=0)
+        rank = int(topicRanks.iloc[topic]["Rank"])
+        return rank
+
+
 def make_wordle_from_mallet(word_weights_file, 
-                            numOfTopics,words,outfolder, 
+                            numOfTopics,words,outfolder,
+                            topicRanksFile,
                             font_path, dpi):
     """Generate wordles from Mallet output, using the wordcloud module."""
     print("\nLaunched make_wordle_from_mallet.")
     for topic in range(0,numOfTopics):
         ## Gets the text for one topic.
         text = get_wordlewords(words, word_weights_file, topic)
-        wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text)
+        wordcloud = WordCloud(font_path=font_path, width=600, height=400, background_color="white", margin=4).generate(text)
         default_colors = wordcloud.to_array()
-        figure_title = "topic "+ str(topic)        
+        rank = get_topicRank(topic, topicRanksFile)
+        figure_title = "topic "+ str(topic) + " ("+str(rank)+"/"+str(numOfTopics)+")"       
         plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3))
         plt.imshow(default_colors)
         plt.imshow(wordcloud)
-        plt.title(figure_title, fontsize=24)
+        plt.title(figure_title, fontsize=28)
         plt.axis("off")
         
         ## Saving the image file.
diff --git a/tmw_config.py b/tmw_config.py
index 86ab0b5..d581ab8 100644
--- a/tmw_config.py
+++ b/tmw_config.py
@@ -185,12 +185,13 @@
 ### make_wordle_from_mallet
 ### Creates a wordle for each topic.
 word_weights_file = wdir+"6_mallet/" + "word-weights.txt"
+topicRanksFile = wdir + "7_aggregates/" + "topicRanks.csv"
 numOfTopics = numOfTopics
-words = 30
+words = 40
 outfolder = wdir+"8_visuals/wordles/"
 font_path = font_path
 dpi = 300
-#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi)
+#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics, words,outfolder, topicRanksFile, font_path,dpi)
 
 ### crop_images
 ### Optional. Crops the wordle image files.

From f88f68a362d57e9dce2ce7d126572d39dd21be36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= <c.schoech@gmail.com>
Date: Wed, 21 Oct 2015 16:45:45 +0200
Subject: [PATCH 55/56] make_wordle: increase title font size; Fixes
 https://github.com/cligs/tmw/issues/19

---
 tmw.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tmw.py b/tmw.py
index ba33d45..991e79b 100644
--- a/tmw.py
+++ b/tmw.py
@@ -900,7 +900,7 @@ def make_wordle_from_mallet(word_weights_file,
         plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3))
         plt.imshow(default_colors)
         plt.imshow(wordcloud)
-        plt.title(figure_title, fontsize=28)
+        plt.title(figure_title, fontsize=30)
         plt.axis("off")
         
         ## Saving the image file.

From 90020188d2dd97dc57f2ba57bcb082d098651e06 Mon Sep 17 00:00:00 2001
From: Ulrike Henny <ulrike.henny@uni-koeln.de>
Date: Thu, 3 Mar 2016 11:42:43 +0100
Subject: [PATCH 56/56] Modul aktualisiert

---
 tmw.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tmw.py b/tmw.py
index 991e79b..d71a17d 100644
--- a/tmw.py
+++ b/tmw.py
@@ -966,9 +966,10 @@ def get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item):
             colmeans = allData.mean(axis=0)
             allData = allData / colmeans
         elif mode == "zscores": # zscore transformation
-            colmeans = allData.mean(axis=0) # mean for each topic
-            allstd = allData.stack().std() #std for entire df
-            allData = (allData - colmeans) / allstd # = zscore transf.
+            colmeans = allData.mean(axis=0) # mean for each topic
+            colstd = allData.std(axis=0) #std for each topic
+            allData = (allData - colmeans) / colstd # = zscore transf.
+            
         elif mode == "absolute": # absolute values
             allData = allData
         allData = allData.T
@@ -1141,7 +1142,7 @@ def get_heatmap_dataToPlot(average, mode, firstWordsFile, topTopicsShown,
             allScores = allScores / colmeans
         elif mode == "zscores": # zscore transformation
             colmeans = allScores.mean(axis=0) # mean for each topic
-            allstd = allScores.stack().std() #std for entire df
+            allstd = allScores.std(axis=0) #std for each topic
             allScores = (allScores - colmeans) / allstd # = zscore transf.
         elif mode == "absolute": # absolute values
             allScores = allScores
@@ -1203,7 +1204,7 @@ def create_distinctiveness_heatmap(dataToPlot,
                                    outfolder):
     print("- doing the plotting...")
     sns.set_context("poster", font_scale=fontscale)
-    sns.heatmap(dataToPlot, annot=False, cmap="YlOrRd", square=False)
+    sns.heatmap(dataToPlot, annot=False, cmap="RdBu_r", square=False)
     # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd
     plt.title("Verteilung der Topic Scores", fontsize=20)
     plt.xlabel(targetCategory, fontsize=16)
@@ -1445,6 +1446,7 @@ def build_itemScoreMatrix(averageDatasets, targetCategory,
             itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False)
             itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1]
             itemScoreMatrix = itemScoreMatrix.T
+            """
             itemScoreMatrix = itemScoreMatrix.drop("Allais", axis=0)
             itemScoreMatrix = itemScoreMatrix.drop("Audoux", axis=0)
             itemScoreMatrix = itemScoreMatrix.drop("Barbara", axis=0)
@@ -1514,6 +1516,7 @@ def build_itemScoreMatrix(averageDatasets, targetCategory,
             itemScoreMatrix = itemScoreMatrix.drop("LeRougeGuitton", axis=0)
             itemScoreMatrix = itemScoreMatrix.drop("Toussaint", axis=0)
             itemScoreMatrix = itemScoreMatrix.drop("Khadra", axis=0)
+            """
             #print(itemScoreMatrix)
             return itemScoreMatrix