From 17cc866cc97a60991b00dd8f47d4bf5abbd9d725 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 17 Aug 2015 11:27:43 +0200 Subject: [PATCH 01/56] Kleinigkeiten --- my_tmw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/my_tmw.py b/my_tmw.py index 71076df..88ce0aa 100644 --- a/my_tmw.py +++ b/my_tmw.py @@ -97,10 +97,10 @@ #tmw.average_topicscores(corpuspath, mastermatrixfile, metadatafile, topics_in_texts, targets, mode, number_of_topics, outfolder) ### 5b make_topic_distribution_plot -aggregates = wdir+"/7_aggregates/avg*decade.csv" # if mode == lineplot, use only bydecade data! +aggregates = wdir+"/7_aggregates/avg*decade.csv" # if mode == lineplot / areaplot, use only bydecade data! outfolder = wdir+"/8_visuals/" topicwordfile = wdir+"/6_mallet/topics-with-words.csv" -rows_shown = 200 # if mode == lineplot, set to maximum number of topics +rows_shown = 200 # if mode == lineplot / areaplot, set to maximum number of topics font_scale = 1.0 dpi = 300 mode = "areaplot" # heatmap|lineplot|areaplot From 08fd0426894a30bfa05b1a25770ac048a9782466 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Wed, 19 Aug 2015 09:36:57 +0200 Subject: [PATCH 02/56] Flexible paragraph respecting segmenter function with tolerance factor added --- tmw.py | 87 +++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/tmw.py b/tmw.py index f25d8a6..fe0e5d3 100644 --- a/tmw.py +++ b/tmw.py @@ -68,53 +68,70 @@ def tei5reader_fulldocs(inpath, outfolder): output.write(outtext) print("Done.") - -def segmenter(inpath, outfolder, target): +# Utility function for writing segments +def writesegment(segment, outfolder, filename, counter): + from os.path import join + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + with open(segname,"w") as output: + output.write(' '.join(segment)) + output.close() + +# Parameters: +# - inpath: path to search documents in +# - outfolder: path to save segments in +# - target: number of words per segment +# - sizetolerancefactor: factor of which exceedance of target is tolerated before slicing paragraphs +# 1 for zero tolerance +# -1 for infinity tolerance +# - preserveparagraphs: if True, segments will contain linebreaks according to paragraphs +# +def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparagraphs = False): """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" print("\nLaunched segmenter.") - import os - import glob import re + from os import listdir + from os.path import join if not os.path.exists(outfolder): os.makedirs(outfolder) - - for file in glob.glob(inpath): + counter = 1 + for relfile in listdir(inpath): + file = join(inpath, relfile) with open(file, "r") as infile: filename = os.path.basename(file)[:-4] - #print("File name: ", filename) - text = infile.read() - text = re.sub("[,;\.!?—]", " ", text) - text = re.sub("-", " ", text) - text = re.sub("\n", " ", text) - text = re.sub("[ ]{1,9}", " ", text) - words = re.split("\W", text) - #print("Number of words: ", filename, len(words)) - #for word in words[0:31]: - # print(word) - - seg = "" - actual = 0 - counter = 0 - for i in range(len(words)-1): - if len(words[i]) > 1: - if actual < target: - seg = seg + words[i] + " " - #print(words[i]) - segsplit = re.split(" ", seg) - actual = len(segsplit) - else: - counter += 1 - actual = 0 - segname = outfolder + filename + "§{:04d}".format(counter) + ".txt" - with open(segname,"w") as output: - output.write(seg) - seg = "" + segment = [] + for line in infile: + text = line + text = re.sub("[,;\.!?—]", " ", text) + text = re.sub("-", " ", text) + text = re.sub("[ ]{1,9}", " ", text) + words = re.split("\W", text) + if preserveparagraphs: + words.append("\n") + if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: + print("Segment length extending size-constraints. Checking if segment length is sufficient yet.") + if len(segment) * sizetolerancefactor < target: + print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.") + # wortweise auffüllen + wordsliceindex = target - len(segment) + segment.extend(words[0:wordsliceindex]) + words = words[wordsliceindex:len(words)] + print("Segment length: \t", len(segment)) + writesegment(segment, outfolder, filename, counter) + counter = counter + 1 + segment = [] + segment.extend(words) + if len(segment) >= target: + print("Segment length: \t", len(segment)) + writesegment(segment, outfolder, filename, counter) + counter = counter + 1 + segment = [] + print("Segment length: \t", len(segment)) + writesegment(segment, outfolder, filename, counter) print("Done.") - def segments_to_bins(inpath, outfile): """Script for sorting text segments into bins.""" print("\nLaunched segments_to_bins.") From 14ff8ade67f87acdfeb978aa9ab0eeb73c12c5dc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Thu, 20 Aug 2015 11:50:29 +0200 Subject: [PATCH 03/56] Tokanization changed --- tmw.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index fe0e5d3..cae666b 100644 --- a/tmw.py +++ b/tmw.py @@ -89,9 +89,11 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" print("\nLaunched segmenter.") import os + import glob import re from os import listdir from os.path import join + from nltk.tokenize import word_tokenize if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -107,7 +109,9 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag text = re.sub("[,;\.!?—]", " ", text) text = re.sub("-", " ", text) text = re.sub("[ ]{1,9}", " ", text) - words = re.split("\W", text) + # words = re.split("\W", text) + words = word_tokenize(text) + print(words) if preserveparagraphs: words.append("\n") if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: From 3e7eca3bffe57ad089264e4ca24dfe08f3980a98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Thu, 20 Aug 2015 11:53:23 +0200 Subject: [PATCH 04/56] Code cleanup --- tmw.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/tmw.py b/tmw.py index cae666b..831dc84 100644 --- a/tmw.py +++ b/tmw.py @@ -89,7 +89,6 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" print("\nLaunched segmenter.") import os - import glob import re from os import listdir from os.path import join @@ -106,12 +105,10 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag segment = [] for line in infile: text = line - text = re.sub("[,;\.!?—]", " ", text) + text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text) text = re.sub("-", " ", text) text = re.sub("[ ]{1,9}", " ", text) - # words = re.split("\W", text) words = word_tokenize(text) - print(words) if preserveparagraphs: words.append("\n") if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: From c0e101b1ab8106e414b71604adb87dc03b58cbed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Thu, 20 Aug 2015 16:07:51 +0200 Subject: [PATCH 05/56] last segment added to previous if too small --- tmw.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tmw.py b/tmw.py index 831dc84..f1a7500 100644 --- a/tmw.py +++ b/tmw.py @@ -69,10 +69,10 @@ def tei5reader_fulldocs(inpath, outfolder): print("Done.") # Utility function for writing segments -def writesegment(segment, outfolder, filename, counter): +def writesegment(segment, outfolder, filename, counter, mode="w"): from os.path import join segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") - with open(segname,"w") as output: + with open(segname, mode) as output: output.write(' '.join(segment)) output.close() @@ -98,6 +98,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag os.makedirs(outfolder) counter = 1 for relfile in listdir(inpath): + counter = 1 file = join(inpath, relfile) with open(file, "r") as infile: filename = os.path.basename(file)[:-4] @@ -130,7 +131,13 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag counter = counter + 1 segment = [] print("Segment length: \t", len(segment)) - writesegment(segment, outfolder, filename, counter) + if sizetolerancefactor != -1 and len(segment) * sizetolerancefactor < target: + print("Segment length of last Segment too short. Adding text to previous segment.") + counter = counter - 1 + writesegment(segment, outfolder, filename, counter, "a") + else: + writesegment(segment, outfolder, filename, counter) + print("Done.") def segments_to_bins(inpath, outfile): From 5d9e06fbde5c916c39ed4f4a68e6c0c104cf629f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Fri, 21 Aug 2015 16:42:24 +0200 Subject: [PATCH 06/56] Single lines with length > target segment size will be sliced --- tmw.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/tmw.py b/tmw.py index f1a7500..2ad92ac 100644 --- a/tmw.py +++ b/tmw.py @@ -93,16 +93,18 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag from os import listdir from os.path import join from nltk.tokenize import word_tokenize + import glob if not os.path.exists(outfolder): os.makedirs(outfolder) counter = 1 - for relfile in listdir(inpath): + for relfile in glob.glob(inpath): counter = 1 file = join(inpath, relfile) with open(file, "r") as infile: filename = os.path.basename(file)[:-4] - + if filename == "rf0053": + print("now") segment = [] for line in infile: text = line @@ -112,7 +114,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag words = word_tokenize(text) if preserveparagraphs: words.append("\n") - if sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: + while sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: print("Segment length extending size-constraints. Checking if segment length is sufficient yet.") if len(segment) * sizetolerancefactor < target: print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.") @@ -125,17 +127,24 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag counter = counter + 1 segment = [] segment.extend(words) - if len(segment) >= target: + if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: + print("Segment length of last Segment too short. Adding text to previous segment.") + counter = counter - 1 + writesegment(segment, outfolder, filename, counter, "a") + counter = counter + 1 print("Segment length: \t", len(segment)) + segment = [] + elif len(segment) > 0: writesegment(segment, outfolder, filename, counter) + print("Segment length: \t", len(segment)) counter = counter + 1 segment = [] - print("Segment length: \t", len(segment)) - if sizetolerancefactor != -1 and len(segment) * sizetolerancefactor < target: + if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: print("Segment length of last Segment too short. Adding text to previous segment.") counter = counter - 1 writesegment(segment, outfolder, filename, counter, "a") - else: + counter = counter + 1 + elif len(segment) > 0: writesegment(segment, outfolder, filename, counter) print("Done.") @@ -319,7 +328,7 @@ def nltk_stanfordpos(inpath, outfolder): import os import glob - from nltk.tag.stanford import POSTagger + from nltk.tag.stanford import StanfordPOSTagger as POSTagger for file in glob.glob(inpath): st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8") @@ -522,7 +531,7 @@ def get_color_scale(word, font_size, position, orientation, random_state=None): #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf" wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) default_colors = wordcloud.to_array() - plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) + plt.imshow(wordcloud.recolor(random_state=3)) #color_func=get_color_scale plt.imshow(default_colors) plt.imshow(wordcloud) plt.title(figure_title, fontsize=24) From e1214dd2ee2e167d67501d3b9aff22b33dfaf423 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Fri, 21 Aug 2015 16:45:39 +0200 Subject: [PATCH 07/56] cleanup to avoid conflicts --- tmw.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tmw.py b/tmw.py index 2ad92ac..56e8bf8 100644 --- a/tmw.py +++ b/tmw.py @@ -103,8 +103,6 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag file = join(inpath, relfile) with open(file, "r") as infile: filename = os.path.basename(file)[:-4] - if filename == "rf0053": - print("now") segment = [] for line in infile: text = line @@ -328,7 +326,7 @@ def nltk_stanfordpos(inpath, outfolder): import os import glob - from nltk.tag.stanford import StanfordPOSTagger as POSTagger + from nltk.tag.stanford import POSTagger for file in glob.glob(inpath): st = POSTagger('/home/christof/Programs/stanfordpos/models/french.tagger', '/home/christof/Programs/stanfordpos/stanford-postagger.jar', encoding="utf8") @@ -531,7 +529,7 @@ def get_color_scale(word, font_size, position, orientation, random_state=None): #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf" wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) default_colors = wordcloud.to_array() - plt.imshow(wordcloud.recolor(random_state=3)) #color_func=get_color_scale + plt.imshow(color_func=get_color_scale, wordcloud.recolor(random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) plt.title(figure_title, fontsize=24) From d2cd4502a98f9ce092348227d1ac6baa3514b69a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Fri, 21 Aug 2015 16:46:33 +0200 Subject: [PATCH 08/56] cleanup to avoid conflicts --- tmw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index 56e8bf8..a8d08eb 100644 --- a/tmw.py +++ b/tmw.py @@ -529,7 +529,7 @@ def get_color_scale(word, font_size, position, orientation, random_state=None): #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf" wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) default_colors = wordcloud.to_array() - plt.imshow(color_func=get_color_scale, wordcloud.recolor(random_state=3)) + plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) plt.title(figure_title, fontsize=24) From 24e136dd38f3cd6bb62c754cab64e30ec455550f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Mon, 24 Aug 2015 13:06:11 +0200 Subject: [PATCH 09/56] keeping track of last segment size --- tmw.py | 148 ++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 136 insertions(+), 12 deletions(-) diff --git a/tmw.py b/tmw.py index a8d08eb..0c19e38 100644 --- a/tmw.py +++ b/tmw.py @@ -57,10 +57,10 @@ def tei5reader_fulldocs(inpath, outfolder): ### Some cleaning up text = re.sub(" ", "", text) #text = re.sub(" ", "", text) - text = re.sub("\n{1,6}", " ", text) + # text = re.sub("\n{1,6}", " ", text) #text = re.sub("\n{1,6}", "\n", text) - text = re.sub("\n \n", "\n", text) - text = re.sub("\t\n", "", text) + # text = re.sub("\n \n", "\n", text) + # text = re.sub("\t\n", "", text) outtext = str(text) outfile = outfolder + filename + ".txt" @@ -76,6 +76,71 @@ def writesegment(segment, outfolder, filename, counter, mode="w"): output.write(' '.join(segment)) output.close() +def write(segment, file, mode = "w"): + with open(file, mode) as output: + output.write(' '.join(segment)) + output.close() + + +counter = 0 +currentsegmentsize = 0 + +# Utility function for writing segments +def writesegment(segment, outfolder, filename, target, tolerancefactor): + from os.path import join + global currentsegmentsize + global counter + + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + + # case: last segment is too small => fill with (slice of) new segment + if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split + #split segment + wordsliceindex = target - len(currentsegmentsize) + + # if it's too big: slice! + if len(segment) > wordsliceindex: + write(segment[0:wordsliceindex], segname, "a") + currentsegmentsize += wordsliceindex + segment = segment[wordsliceindex:len(segment)] + else: + # segment fits so append + write(segment, segname, "a") + currentsegmentsize += len(segment) + # done + return + + + # # case: new segment is too big, last segment is too small + # while currentsegmentsize + len(segment) > target * tolerancefactor: # max size limit exceeded + # if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split + # #split segment + # wordsliceindex = target - len(currentsegmentsize) + # write(segment[0:wordsliceindex], segname, "a") + # segment = segment[wordsliceindex:len(segment)] + # currentsegmentsize += wordsliceindex + + + # case: new segment is too big + # if segment > targer: slice segment + while len(segment) > target * tolerancefactor: + counter += 1 + currentsegmentsize = 0 + write(segment[0:target], segname) + segment = segment[target:len(segment)] + + # now #segment is < target + if (len(segment) == 0): + #segment was perfectly sliced so we are done + return + + # there's some part of segment left, write this to file + counter += 1 + currentsegmentsize = len(segment) + write(segment, segname) + + + # Parameters: # - inpath: path to search documents in # - outfolder: path to save segments in @@ -98,52 +163,111 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag if not os.path.exists(outfolder): os.makedirs(outfolder) counter = 1 + # work on files in inpath for relfile in glob.glob(inpath): counter = 1 + + # get absolut filename file = join(inpath, relfile) + # track size of last segment in order to avoid exceeding size constraints while appending + + lastsegment = 0 with open(file, "r") as infile: filename = os.path.basename(file)[:-4] + # segment contains words assigned to the current segment segment = [] + + # go thru paragraphs one by one for line in infile: text = line + # remove special characters and space-chains text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text) text = re.sub("-", " ", text) text = re.sub("[ ]{1,9}", " ", text) + + # tokanize text words = word_tokenize(text) + if preserveparagraphs: words.append("\n") + + # while current #segment and #remaining-words exceed limitation (e.g. line too long) while sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: print("Segment length extending size-constraints. Checking if segment length is sufficient yet.") + + # if #segment is yet too small, extend to desired limit with part of words (e.g. slice line) if len(segment) * sizetolerancefactor < target: print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.") # wortweise auffüllen wordsliceindex = target - len(segment) segment.extend(words[0:wordsliceindex]) words = words[wordsliceindex:len(words)] + + # Possible states: Line: / + # Segment: => save segment + print("Segment length: \t", len(segment)) writesegment(segment, outfolder, filename, counter) counter = counter + 1 + lastsegment = len(segment) segment = [] + + # dealing with state: : + if len(words) > target * sizetolerancefactor: + segment.extend(words[0:target]) + words = words[target:len(words)] + + # line completely processed + if len(words) == 0: + continue + + # possible states: / + segment.extend(words) + + # dealing with state: : + # if words is too small for its own segment: append to previous segment ignoring further limitations if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: print("Segment length of last Segment too short. Adding text to previous segment.") - counter = counter - 1 - writesegment(segment, outfolder, filename, counter, "a") + + # avoid appending beyond size constraints: + if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor: + print("Segment length of last segment exceeded. Starting new segment") + lastsegment = len(segment) + writesegment(segment, outfolder, filename, counter) + else: + print("Segment length of last segment: " + str(lastsegment)) + counter = counter - 1 + lastsegment += len(segment) + writesegment(segment, outfolder, filename, counter, "a") counter = counter + 1 print("Segment length: \t", len(segment)) segment = [] + # otherways just save remaining words in own segment elif len(segment) > 0: writesegment(segment, outfolder, filename, counter) print("Segment length: \t", len(segment)) counter = counter + 1 + lastsegment = len(segment) segment = [] - if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: - print("Segment length of last Segment too short. Adding text to previous segment.") - counter = counter - 1 - writesegment(segment, outfolder, filename, counter, "a") - counter = counter + 1 - elif len(segment) > 0: - writesegment(segment, outfolder, filename, counter) + # # following code might be obsolete. TODO: check! + # if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: + # print("Segment length of last Segment too short. Adding text to previous segment.") + # # avoid appending beyond size constraints: + # if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor: + # print("Segment length of last segment exceeded. Starting new segment") + # lastsegment = 0 + # writesegment(segment, outfolder, filename, counter) + # else: + # counter = counter - 1 + # lastsegment += len(segment) + # writesegment(segment, outfolder, filename, counter, "a") + # counter = counter + 1 + # print("Segment length: \t", len(segment)) + # segment = [] + # elif len(segment) > 0: + # writesegment(segment, outfolder, filename, counter) + # lastsegment = segment print("Done.") From e1e6e695c92f2cde557f2c90347e64468bea8ba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Mon, 24 Aug 2015 16:20:58 +0200 Subject: [PATCH 10/56] complete refactoring of segmentation --- tmw.py | 207 +++++++++++++++++++++++++-------------------------------- 1 file changed, 89 insertions(+), 118 deletions(-) diff --git a/tmw.py b/tmw.py index 0c19e38..d9eea14 100644 --- a/tmw.py +++ b/tmw.py @@ -76,68 +76,110 @@ def writesegment(segment, outfolder, filename, counter, mode="w"): output.write(' '.join(segment)) output.close() +# Utility function for writing into files def write(segment, file, mode = "w"): with open(file, mode) as output: output.write(' '.join(segment)) output.close() +# global segment counter counter = 0 + +# global current segment size currentsegmentsize = 0 # Utility function for writing segments -def writesegment(segment, outfolder, filename, target, tolerancefactor): +def writesegment(segment, outfolder, filename, target, tolerancefactor, preserveparagraphs): from os.path import join global currentsegmentsize global counter + + + + # ignore empty segments + if segment == ["\n"] or len(segment) < 1: + return + + + # workaround for easy inter line-spacing in case of paragraph removal for lines combined into one segment + if not preserveparagraphs and segment[-1] == "\n": + segment = segment[0:len(segment) - 1] + segment[-1] += " " + + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + relname = filename + "§{:04d}".format(counter) + ".txt" + + # case: last segment is too small => fill with (slice of) new segment if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split - #split segment - wordsliceindex = target - len(currentsegmentsize) - - # if it's too big: slice! - if len(segment) > wordsliceindex: - write(segment[0:wordsliceindex], segname, "a") - currentsegmentsize += wordsliceindex - segment = segment[wordsliceindex:len(segment)] - else: - # segment fits so append - write(segment, segname, "a") - currentsegmentsize += len(segment) - # done - return - - - # # case: new segment is too big, last segment is too small - # while currentsegmentsize + len(segment) > target * tolerancefactor: # max size limit exceeded - # if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split - # #split segment - # wordsliceindex = target - len(currentsegmentsize) - # write(segment[0:wordsliceindex], segname, "a") - # segment = segment[wordsliceindex:len(segment)] - # currentsegmentsize += wordsliceindex - + #split segment + wordsliceindex = target - currentsegmentsize + + # if it's too big: slice! + if currentsegmentsize + len(segment) > target * tolerancefactor: + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(wordsliceindex) + "\t for a total of " + str((currentsegmentsize + wordsliceindex))) + write(segment[0:wordsliceindex], segname, "a") + currentsegmentsize += wordsliceindex + segment = segment[wordsliceindex:len(segment)] + + # segment is filled. continue with next one + counter += 1 + currentsegmentsize = 0 + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + relname = filename + "§{:04d}".format(counter) + ".txt" + if os.path.isfile(segname): + os.remove(segname) + # else just add text to current segment + else: + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) + # segment fits so append + write(segment, segname, "a") + currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account! + # done + return # case: new segment is too big - # if segment > targer: slice segment + # if segment > target: slice segment while len(segment) > target * tolerancefactor: - counter += 1 - currentsegmentsize = 0 + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(target) + "\t for a total of " + str((currentsegmentsize + target))) write(segment[0:target], segname) segment = segment[target:len(segment)] - # now #segment is < target + # segment is filled. continue with next one + counter += 1 + currentsegmentsize = 0 + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + relname = filename + "§{:04d}".format(counter) + ".txt" + if os.path.isfile(segname): + os.remove(segname) + print(relname + "\t New segment with size \t0") + + # now size of segment is < target if (len(segment) == 0): #segment was perfectly sliced so we are done return - # there's some part of segment left, write this to file - counter += 1 - currentsegmentsize = len(segment) - write(segment, segname) + # there's some part of segment left, write this into file + + + # if the remaining part is exceeding current segment's capacity start new segment + if currentsegmentsize + len(segment) > target * tolerancefactor: + # segment is filled. continue with next one + counter += 1 + currentsegmentsize = 0 + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + relname = filename + "§{:04d}".format(counter) + ".txt" + if os.path.isfile(segname): + os.remove(segname) + print(relname + "\t New segment with size \t0") + + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) + currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account! + write(segment, segname, "a") @@ -162,18 +204,25 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag if not os.path.exists(outfolder): os.makedirs(outfolder) - counter = 1 + global counter + global currentsegmentsize # work on files in inpath for relfile in glob.glob(inpath): - counter = 1 + # get absolut filename file = join(inpath, relfile) - # track size of last segment in order to avoid exceeding size constraints while appending - lastsegment = 0 with open(file, "r") as infile: filename = os.path.basename(file)[:-4] + + counter = 0 + currentsegmentsize = 0 + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + relname = filename + "§{:04d}".format(counter) + ".txt" + if os.path.isfile(segname): + os.remove(segname) + # segment contains words assigned to the current segment segment = [] @@ -188,86 +237,8 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparag # tokanize text words = word_tokenize(text) - if preserveparagraphs: - words.append("\n") - - # while current #segment and #remaining-words exceed limitation (e.g. line too long) - while sizetolerancefactor != -1 and len(segment) + len(words) > target * sizetolerancefactor: - print("Segment length extending size-constraints. Checking if segment length is sufficient yet.") - - # if #segment is yet too small, extend to desired limit with part of words (e.g. slice line) - if len(segment) * sizetolerancefactor < target: - print("Segment length isn't sufficient. Slicing paragraph to meet segment-legth-constraints.") - # wortweise auffüllen - wordsliceindex = target - len(segment) - segment.extend(words[0:wordsliceindex]) - words = words[wordsliceindex:len(words)] - - # Possible states: Line: / - # Segment: => save segment - - print("Segment length: \t", len(segment)) - writesegment(segment, outfolder, filename, counter) - counter = counter + 1 - lastsegment = len(segment) - segment = [] - - # dealing with state: : - if len(words) > target * sizetolerancefactor: - segment.extend(words[0:target]) - words = words[target:len(words)] - - # line completely processed - if len(words) == 0: - continue - - # possible states: / - - segment.extend(words) - - # dealing with state: : - # if words is too small for its own segment: append to previous segment ignoring further limitations - if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: - print("Segment length of last Segment too short. Adding text to previous segment.") - - # avoid appending beyond size constraints: - if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor: - print("Segment length of last segment exceeded. Starting new segment") - lastsegment = len(segment) - writesegment(segment, outfolder, filename, counter) - else: - print("Segment length of last segment: " + str(lastsegment)) - counter = counter - 1 - lastsegment += len(segment) - writesegment(segment, outfolder, filename, counter, "a") - counter = counter + 1 - print("Segment length: \t", len(segment)) - segment = [] - # otherways just save remaining words in own segment - elif len(segment) > 0: - writesegment(segment, outfolder, filename, counter) - print("Segment length: \t", len(segment)) - counter = counter + 1 - lastsegment = len(segment) - segment = [] - # # following code might be obsolete. TODO: check! - # if sizetolerancefactor != -1 and len(segment) > 0 and len(segment) * sizetolerancefactor < target: - # print("Segment length of last Segment too short. Adding text to previous segment.") - # # avoid appending beyond size constraints: - # if sizetolerancefactor != -1 and lastsegment + len(segment) > target * sizetolerancefactor: - # print("Segment length of last segment exceeded. Starting new segment") - # lastsegment = 0 - # writesegment(segment, outfolder, filename, counter) - # else: - # counter = counter - 1 - # lastsegment += len(segment) - # writesegment(segment, outfolder, filename, counter, "a") - # counter = counter + 1 - # print("Segment length: \t", len(segment)) - # segment = [] - # elif len(segment) > 0: - # writesegment(segment, outfolder, filename, counter) - # lastsegment = segment + words.append("\n") + writesegment(words, outfolder, filename, target, sizetolerancefactor, preserveparagraphs) print("Done.") From fc64ddffe6defb346ee8f4e7eb57e3b82964bd16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Mon, 24 Aug 2015 17:16:46 +0200 Subject: [PATCH 11/56] default value changed to 1 --- tmw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index d9eea14..645e239 100644 --- a/tmw.py +++ b/tmw.py @@ -192,7 +192,7 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve # -1 for infinity tolerance # - preserveparagraphs: if True, segments will contain linebreaks according to paragraphs # -def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparagraphs = False): +def segmenter(inpath, outfolder, target, sizetolerancefactor = 1, preserveparagraphs = False): """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" print("\nLaunched segmenter.") import os From 9f3e5b0b044a7542725ef155013424eced4b6929 Mon Sep 17 00:00:00 2001 From: christofs Date: Tue, 25 Aug 2015 16:27:59 +0200 Subject: [PATCH 12/56] Removed defaults from segmenter, added parameters instead --- my_tmw.py | 4 +++- tmw.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/my_tmw.py b/my_tmw.py index 88ce0aa..1afa717 100644 --- a/my_tmw.py +++ b/my_tmw.py @@ -25,7 +25,9 @@ inpath = wdir + "1_txt/*.txt" outpath = wdir + "2_segs/" segment_length = 1000 -#tmw.segmenter(inpath,outpath,segment_length) +sizetolerancefactor = 1 # 1 = zero tolerance; 1.1 = +/- 10% tolerance. +preserveparagraphs = False +#tmw.segmenter(inpath,outpath,segment_length, sizetolerancefactor, preserveparagraphs) ### 1c - segments_to_bins: inpath, outfile inpath = wdir + "2_segs/*.txt" diff --git a/tmw.py b/tmw.py index d9eea14..51663d7 100644 --- a/tmw.py +++ b/tmw.py @@ -192,7 +192,7 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve # -1 for infinity tolerance # - preserveparagraphs: if True, segments will contain linebreaks according to paragraphs # -def segmenter(inpath, outfolder, target, sizetolerancefactor = -1, preserveparagraphs = False): +def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs): """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" print("\nLaunched segmenter.") import os From e6ec126f0edac68f11b7ac1b603714717c41b709 Mon Sep 17 00:00:00 2001 From: christofs Date: Tue, 25 Aug 2015 17:55:16 +0200 Subject: [PATCH 13/56] make_lemmatext: with mode for POS to be chosen --- my_tmw.py | 4 +++- tmw.py | 28 ++++++++++++++++++++-------- 2 files changed, 23 insertions(+), 9 deletions(-) diff --git a/my_tmw.py b/my_tmw.py index 1afa717..d295979 100644 --- a/my_tmw.py +++ b/my_tmw.py @@ -50,7 +50,9 @@ ### 2c - make_lemmatext inpath = wdir + "4_tagged/*.trt" outfolder = wdir + "5_lemmata/" -#tmw.make_lemmatext(inpath,outfolder) +mode = "N" # N=nouns, NV=nouns+verbs, NVAA=nouns+verbs+adj+adverbs +stoplist = ["", "unknown"] +#tmw.make_lemmatext(inpath, outfolder, mode, stoplist) diff --git a/tmw.py b/tmw.py index 51663d7..bb1829c 100644 --- a/tmw.py +++ b/tmw.py @@ -472,7 +472,7 @@ def call_treetagger(infolder, outfolder, tagger): -def make_lemmatext(inpath,outfolder): +def make_lemmatext(inpath, outfolder, mode, stoplist): """Function to extract lemmas from TreeTagger output.""" print("\nLaunched make_lemmatext.") @@ -496,13 +496,24 @@ def make_lemmatext(inpath,outfolder): if len(splitline) == 3: lemma = splitline[2] pos = splitline[1] - word = splitline[0] - if "|" in lemma: - lemmata.append(word.lower()) - elif "NOM" in pos and "|" not in lemma and "" not in lemma: - #elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: - lemmata.append(lemma.lower()) - stoplist = ["les","suis","est","un", "pas", "abord", "rien", "fait", "ton", "moi","être"] + token = splitline[0] + ## Select subset of lemmas according to parameter "mode" + if mode == "N": + if "|" in lemma: + lemmata.append(token.lower()) + elif "NOM" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) + elif mode == "NV": + if "|" in lemma: + lemmata.append(token.lower()) + elif "NOM" in pos or "VER" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) + elif mode == "NVAA": + if "|" in lemma: + lemmata.append(token.lower()) + elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) + ## Continue with list of lemmata, but remove undesired leftover words lemmata = ' '.join([word for word in lemmata if word not in stoplist]) lemmata = re.sub("[ ]{1,4}"," ", lemmata) newfilename = os.path.basename(file)[:-4] + ".txt" @@ -514,6 +525,7 @@ def make_lemmatext(inpath,outfolder): + ################################################################## ### 3. Importing and modeling with Mallet ### ################################################################## From cae9b1b518fedbdf303c169c3ba335ec1e44ef5c Mon Sep 17 00:00:00 2001 From: christofs Date: Tue, 25 Aug 2015 17:57:10 +0200 Subject: [PATCH 14/56] make_lemmatext: add option esN with criterium NC to mode --- my_tmw.py | 2 +- tmw.py | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/my_tmw.py b/my_tmw.py index d295979..88f1e0f 100644 --- a/my_tmw.py +++ b/my_tmw.py @@ -50,7 +50,7 @@ ### 2c - make_lemmatext inpath = wdir + "4_tagged/*.trt" outfolder = wdir + "5_lemmata/" -mode = "N" # N=nouns, NV=nouns+verbs, NVAA=nouns+verbs+adj+adverbs +mode = "esN" # esN=nouns, frN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs stoplist = ["", "unknown"] #tmw.make_lemmatext(inpath, outfolder, mode, stoplist) diff --git a/tmw.py b/tmw.py index bb1829c..f14a7b4 100644 --- a/tmw.py +++ b/tmw.py @@ -498,21 +498,26 @@ def make_lemmatext(inpath, outfolder, mode, stoplist): pos = splitline[1] token = splitline[0] ## Select subset of lemmas according to parameter "mode" - if mode == "N": + if mode == "frN": if "|" in lemma: lemmata.append(token.lower()) elif "NOM" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) - elif mode == "NV": + elif mode == "frNV": if "|" in lemma: lemmata.append(token.lower()) elif "NOM" in pos or "VER" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) - elif mode == "NVAA": + elif mode == "frNVAA": if "|" in lemma: lemmata.append(token.lower()) elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) + if mode == "esN": + if "|" in lemma: + lemmata.append(token.lower()) + elif "NC" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) ## Continue with list of lemmata, but remove undesired leftover words lemmata = ' '.join([word for word in lemmata if word not in stoplist]) lemmata = re.sub("[ ]{1,4}"," ", lemmata) From e723daa748b820e7227e30f9aac27bd12a00ff7d Mon Sep 17 00:00:00 2001 From: christofs Date: Tue, 25 Aug 2015 18:02:10 +0200 Subject: [PATCH 15/56] make_lemmatext: bugfix elif --- tmw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index f14a7b4..48fd57d 100644 --- a/tmw.py +++ b/tmw.py @@ -513,7 +513,7 @@ def make_lemmatext(inpath, outfolder, mode, stoplist): lemmata.append(token.lower()) elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) - if mode == "esN": + elif mode == "esN": if "|" in lemma: lemmata.append(token.lower()) elif "NC" in pos and "|" not in lemma and "" not in lemma: From 32c6b6eba2a9d23dca25e4636f01390ae983ad60 Mon Sep 17 00:00:00 2001 From: christofs Date: Thu, 27 Aug 2015 19:04:04 +0200 Subject: [PATCH 16/56] all deactivated --- __pycache__/tmw.cpython-34.pyc | Bin 27799 -> 27799 bytes tmw_config.py | 8 ++++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 80f234cfab053dd7b20755136322a872ecafec74..526672e938f5e42d98e93819dc9fc0123f1d2f43 100644 GIT binary patch delta 8311 zcma)Bd6Zk#dDqo8l18)dc*e7NJdy1gjE(Ksn6beN9)kyL4;ViTLYCfW9wX_g_i4P0 z0pYZ17SfR9CeT0+w5MrGLvs?2x`aYoa?*1e=#oQ9n({&p{R`I#g`^F%7#jNhzN?Ys z@i@Ug`ptcJ{qA>v-*Vr3A^mO=rzc5rVu$piz@O@!?A-MfY zBYgXTn{G)+$CPd$kH1F5DM%|HFC@~A(WuL2#>|?zf@7$;;L+}Y$Kx^TPvzss?bPXl zd8*+0frffD*eeFryTM{`n7YW2y1(wuV|7%F`DHU|x&UniZXlVl9W&;4>l`zUx#bn8 z_Gj&sDF@JPumVFUVd(AjbAkjO(JDHH>?&;urF_kpP@?+0K&ehWRo~wdqhc)T*4fT@ zK2eyJZR$t$<2zV@di07CxIxRAu`;F`lxEgGVUkq#e6A3S$Psku0j%JDg@jC-w8MST zkqsjCY64gxor$mh*mN^YiySc~jN18;Q$TH73&ykKWcnaCy0nMAT+ zOSut!Sj||~z@DKueP%%`#}Cldn7kTYHvynS*PpR+rd)|)fE9F+UcTGBUrdzRTE@iY zQ4Hbp!@zDRPr*;H2U5IQ?S;<1u&gLi1sS~YVX=kQ@ByMPXCXqd!N=&?= z#@nvm3eLH{q;moXoFHPc7Th|Fh4_4uzHB1z22CfK$eXTz&OGhN4)s`D&s`AO2s!h| zoxR|QA0@dHQwF4&E}w8r$Q!6i^78F@Q_k2j8_&*X3RXU29;PAxPR$P3=JB_RCea|e zeB)kusMOkC5XBHxmrp$w)d${0RctM^dlFh6!gKYUTkzNbXaw+~fYq^(KFOO=%f|_W zV$Dm8F&dH?Gr=bXy2=5x_Oj}#`m4`NI(tp9H`QS0pqMW0>P!hSs-EoHT4Y{^wU{gs zWOXB?n7p);rsJ?x)kuehU4Jr>$&e|Ah$nUirvUNXAUT*cJya%=w?QP((WujuawOCF z8r1iDJZn9CblC)vcZuLlSR-DLrm1K3sRz4b;*QdvcW?0D&eVc`xFEvkef0Odm%#PS z$SHX}l?>n1%?B33+vjt0Is0U8!thheZHDi*J%@}Siq~Cd_^#W%A3)WnGi7+Ec9Eai z(YJGJMUa?R?!dHu&^#vGkW;Yp8H*ffMZZK{R4{c9t8ew~79sV2eZ5+*ty-^ar&XPR zI5i4oFM1fu1RR>YAlIUOD_3OflcY{21nitoU3=V4dW^N#So&%VXj&Ao7^iWzbhMAC zuFZPq6R1$h2=lV+?&~NhWE12Wx6^Fp23Wfh;0D*1w;eaYbTaC5=1Dk-D)pcgGM)te z>D1QKm`%)?aXl%8lOZ+MZ;9E`BmJKi2X3Wq(+fd%kC9lNRu%Jw#a*JMBbYnv)@O+n zW+Bf0AF)H*MZd}ocHU7LY}B)eac$EEx>e3p3)afyDoa%i|R%IE^Q;zTSf z>BKo$IdOC})Y*~lQKQA4FVIlkAr4BO>bY)mz4vs3%I1F`gyEOA#6V1d7oiI^tdFm1j!c(Gl-1hxRo^CNXD`ry!S3 zy*#o2I7VR5jwxi)hB6YydY*ltA0^)UUFf6kiLKg(sBZoIgAYF77|A8;ZF-C~Yf7A; zb~gmS87H@IgscLnO$%;S-;MPKPVz+OWBq|5mm1crYc?9RI;|X7O)1Xgnw6`JFpY|{ zRIZPT*`Or!YoCx!@#iX*w9DOg@YfA6I~)QQC}!<)f%N4fr&H+nh}} zbbMM=wuT4Mwg7k>@Fnz$*7jPW?Cr^grZ!epBQ#Lysd^sA-sjrWTn z({Wz?;>tZm)^t_L)o}Y&EJ*|ayT}F4QqDl$hy_PblV1T;wjf#HKSJGL1vpb^OA<#5 zX1rE-K~c#;Q1_^Bt#1@xEB*a?C5B)*HRdu;k5jHcamsRB|7kNJ<*2%M!%FeT>ah)5 zVwaVh&<~ya{+b~pol8Qtt$n4hX4A;>vl084mjhxV^OG|q`Z>Ef<#EEn*YmG+Y zIpsQwuD$k|z3!52_mYE2&S$Xm#0m9#8=FO|`ufHZQ7Zk*#y#F*#UV)c7wikI83==*L_$@#YP64Nj zAnp3dA-KMbos?`|o#??f_HmSco4^PjHl<}cMpWvYKaOLm$ zlpWWeE?y?fQp&gT;BpkzaVk~XaLtXPIJ&s9tIs&*H^T5lJdr+{XVB#@35c&1eHbmQ zs(8)fMh8`u!(VN+cE#M&kMiHQ}unmV}b$|-c&k~EOWm}gXCVmsZjAD>uD_~OJ!n-S5?GxNEmd=Zo0 zsk}Q@-vD!U!w`gS!}BTm9GdC-9iAK*$P)TQRMzma^SHUGo z{($E87>y~qE|N1q7)Le<4aIzVKUH;7TDP-T998>wZt}kBDV^Q9Q^Y}lg$_>c1}3wK zwCM~arA@cg)HZz=*0Im^-7>@3 zamm<9ZtI3@X{D`PBBSs7zT@^OqpmE&QXn?XEcCC4Ry_)!A;C;h9Pv)nF_!E zm@JrC=dUWOdK)S2?e?~M)2dW#%#-};8Iy_7FHP=sBUKTF&8j53-^6$o_1WUUS1paCJ zmW*A#g;lr3x~w+;ox1CWKJVL}(tS64TwDP{QW|L&5(Uf7oj0)bOMv@T@8lKYKDA{s zwv9ctmLtC4AJ6B~Y(?c%)b;1F>JKrKpY}oT7)}+e3I7P)YR*md?O!SWRc+rtDng~y{xxEpIlxx) z7W%#o2njOKMwfy*`H{2DOo*t<4E!6=o6$qvJSu=f>+~{V9DM@7l@~&t&xE0 z+(u{s&R8qy4G@R#O#TSvk_||~2s^}DA)~ZHlG^#7pRnGe?!M{CgE~?%ji?9G$mIyc z3ga*qE{~5SfzR-e;dG0rw}J9m*`Pji;MH0RGiRB|l;bV8E)y5) zx&bn3Q&P7oOA+hYV3?#eBKO3jqa)Fv?77=1l~T z^?)Y867pAh|2fRa!m+H&9cEL*BKw&H>($$bH&M2`cKXF*n$)y_M<=&@7H8>E#|e3k zikAxzX!2>2FVF9Tx31_gz#p8Dry3aAN@s61Ydmz#(x0G?+FxtNx8eB@0FY#g)O0!W z)77$&lAl6XIBcCu>nxh?Zl~#L{0*vh1C&Z{zFM{k&ut$0GpeCLP%#wAeug!V9Z}+$ z$sFc8vk|1(%@Yp2VYIWRu4BJ`iS?*wkE{?W^}{2>qNnuUkv7p`1nKP)-5%*OKz>sv zW`-I*dX4ub@+?OOqpb9@0jS?OI#7HWa<$Noa*k1tz>}nb_^eSkOII~|NnAs6<~K9F zO7hF{o2{hk+3U3QylW;ov+QLiA;Trq-eU}lv&u3C#bD_{qfrzeAW;{Z1`yT?l%NmL zJLhc!Q{?=rUcLwX9VCd=Ymzg+d4A?M>$~dVv3Y$YEYfm>?qxowC}U9F$J)fBYTdC> zF+_y!QQ2cJ6h9QH!!=C$7?yk*@MQp4BRe29C?h)|`mZ87b#mb_;nFL24%V2!RE-T>%=uVDji8l$Nhd1LTt-?s4B^=pnBw6o#oZ`AaM4W> zpyD+93a9KxEvq{=xZN~ELc%K+4y%v8=G`$0%wnHc0%t^Xkd@3;(dbXR3_exs9jqFd%$^~0;-hN)%*0x-wEPNNd$vIgXBeW=h46OVH%{f{ z#Oo@5e3H#&YVoz(hm8TtxqsGt)jZ0H4>Of5gMBUY6KTleOSR6M635NVrD0$y*++>_ zCOIO;N?%C+KQy|y4&Y1D#R)v8&hp~KMz+#k4EjW!IBv_F9H1uzU&?daG@ivG delta 8329 zcma)BdvH|OdEc`yX;-TUk`MwUv;qWPfdm!?%)n}=mDSG!kYt=+xrb5|e) z&eoZ@6W8fj5)vWV0${5X4_-}%n(dz^ExzA3K0DMJ4isSkhU-nUME>?@&P`h`}(z$(JN!0SSP zJh*+G4xK)9@Q5W{Tk1d#|Ld@um#JJlZ%w$m!IbT^ow2idSF6e3nf4lw$D`|}bMY}J zc`0vC=e3_veLvVCdezT^h2UE1BE#xKH4l%5s2KCh7St>QEFsW=MA~ufm|xep_5|jZ z-Kh3woTM#d=+-+2Yf!?_)AY{`5_m+jXcKZ-adU0b*MtdWEx`?eQb>KFu4};n6=M-y z_?XJ-BFIk^W=O_E~&ZM19&DipOG|X*9 zHsR+2Kr?{V-HOsCz-GWkz`cON7T{7q2Z0V+xtyI%%3bs$wm^2FAqv<=pj%v8FYlyn zX=M}kxRuB|Qr?GV);t7ZH9(c-7=x)S|A6x3AYcaoTGjq^D!bU0t56QG&TdiFt4;gG zfnv*oKCx*%#_*AX3o)0ERxM^Mwv@b+XXkxw|8+~5z@h34m5nLOZ)uHBxt zGGj?=@LyDa%RO757wt>97jWhlagDX0YcLk#v!;C+E2o3Do3L`W_D|ZET-l+nwJd&w zg|2m{#@xN2jUOd>2vY{6JyAOJn2;x^O7ilpIa`i9G84~CrSqv=+CEN0-lAp~r|a=I zi$+l|7Ww+U@?5dGH7^QbsxA>B7BL){J5ky_2`$f2mx`n3V=kXR9}0*bt2HM%itbSW zY>YK8F-F%X(zeAX1qK_l>$uhKbX`u;*&ELNtLkp+7K!5Swxkg2)#ny%DKJ$-MocCn zWF;3eHI|c3!ggIYvK2oP9ch6LGn2h^kZ%!X;U?CLf05dxrWdC zP=A-lvxb;QN0+gPyj6lXVvTrS+P0ZhsHfXwVzT%b?F0VPOf9&Kn<8|>NB`gO5@_GJ z9Fd2)G;;XROlbR5b~5W+%noWlwLGYO4-OsEK@@lG(!O1L_5-Nec1N^#WH-5<9i2P3 z%n1_n%7d8951Pk>u66THE}bHu%3VMZqAn_!x<}O4I`@dMdatv?=(XACmF={w6L_u) zg@V!2!ZHDuCeO=zP%*?6Y3CxTlL-Mk=TkRMxRZXyS}QDl9R@Vc3RsBKI7hn1N0irQ zy|YJ|qmmBsvK#?wC{AQ7a+brG%`}j?lVjxu?aMi?4ltc`UDmz`CsC#zltRXHiT-Od z*3*!&ChfSHl;TX6>g!60CyQ6ReoGumQnwQ`LH3y8SdCE?^M%E&($YBQ4(YlKk;2Z$ zIV4O_ow19%RJObAp}E1jjzx?cn|>7C=FZdz*39H8O_s{!9GRaWPvwIB9&cDg=x?)V z@&>)Dye&dLReY=a9q-G?!%lD|na>QE}1o656KjjBk`u@!Tp^dgWePQ*$j z+&ITCH;#^4J<-#?S}$;>@-)9wJT^14b+^WreRqBaQu#9KO@RCG^&q6UlC`N?6%e?ZrqIY$VclYm6#R!?iWQsfm zxpeB~kyim}0_>QgC~YV$VXQY;IewJja%2{L)T6PLTL-9=`p@^@f1kr9mr_5YpRp!G ziQh-hTKLU4+gnF4!#Yepzm(PJ`clD7IMQ}OESwYnsOD57lvdIHg8KK~ z9paVZx;1x;oqtWj$u9vu3;1J#6O^ZLiooH43BgL@aBvyDh5*>Vd{+J8+GydcD8g{H zkHjLkHJz83ScixTWVbSPOWJ~#dI&YwdL2jEPW8lbZ%_m&3G1ft)g%^B7z>L|ul##z zO9aq^{vgXivbdZ6xtBd^N8gVNyb8D*4Mnv50pRZd-$1WOWv?nNuhwCnI}T>PjqM!|in}Nd#ba1>EzLGmtl8!K2h9UkA)>L9)PapsvbhS`x(3 zydAF;{yhkYMLp_k>l?&(i{D(Y#Bx|pg}Ho!evW9rHJx&`|B`J9F40#K44neZNU|%5Lgg{vU z7-m!fj?H+Zcij{*?dOC%Jf4zn{uHuuZOSIw45O#Xv}0|CgJoU(2_{5-CO;1-uv4JZ zMUd7$atPX&b`p}!3o+E#*cVWGfj|e3+cHIl9+Bo65XZ4p?`RR8rLDqP0%fY0>FvAbfW`zbeJ7$-Wua{n8Kj^p1QuZXNzuu zvg5|n#Y<$VrhGFGE=5rjr_L1zwjB@!xP0T|%^A~0F$7P<6PcrV7hQf#KzyCkhhD&{ zidQVI7gALz{8d&PSIj+K`M=mp9h%cN;mV{d{|56<170SeRPH9q{{!d+tWnYXTE%&_ z@xIr^w5lB(TxzPRO|vVf(B(+dfJIX(Yj8VVyPq3eL-_5%o)#T8&Er$qgsde+%Z&2w zSoHwRRfix5U67}e^1rZD2<`Lvo5`VG*zfFUE8tSbt5C|7$`iTx z6l*|(@zjLuqam?Iv!gPrqL?y*1s4K%eSuOK0M6+~8a*XJC6m4BNU6fCvI;Y_2~W(U z8$`cVxRUM89peNL$q=fWNV%ah1!pSbXTKo({{TcT~tNyZ$EVf#j3HJP}fbM zYaN-INM)_Gc>?f_In#94FfvpVxeoR56wCRH`knh%h_!0XiEo=6$CwdxZ@d@BZjR+8 z!^Ua5v4FX=!>VN23w=|)yt{3XeF0oS^}-D>in2+6k!~?=nEH8q&0?WgjEKBWsMbCI z&}Tv!wT#fUkb?rNiI;m8GhZXX<(zIO&#T6v)d4K3)wZGTjZEw+cAMZZoVPRXU(T)S zZJ?~T-P`Pyn?&(oZKf#&R@$+tIfl)t$dzFj`)nf zKbM_gBPwO0+MmO!iy^R|_CZe!)A>|lk{%a?dUkmA=zJF5i8a_#YvneSHvqO^j?lP6 zPjYZCI(XhrO`OjglZ`^QW$0FMZmM(tGVzAmzJIkiQcUje75&TsHkwW7yB%!k)yEEfzmmetStc^2c=N5x#KpQgKt@eZDm4`> z#jS!3hDll>($+iaYY~~ZdS|3{aV@=3m_*Rm>}?kz?_$1Q%G1Tx!+#_S_hEuTzz%|u z89(VaI)(j2$G`ECjMBoXL=@?f{2^5u=6)Kb`!Q%YfJ5$1ly(6=gH|eda7*Sta)^HF z(Cj8E7+%1mA@hk-q6aV@G$&#c><1hG>;nt~4gwAVem&_!hDLnc%r8!Qswvn?^4-q( zZyoIvZR(w)tHs%383_+#1$lGelyd{B)>Dip_#I0uQ4R&T{Fp< zWeeGYqUd^?Gaov;LVQT2&UTAj@#(V-qJRtaOk)&bEl>G*l%AZoMn}l`mA!m|b|Z7x zQk@|=Q-D`n^d(20NUh9NUUt3;1DJL2oEcG`B4zQ-hr26G zV5GtZZsmNezCzHse90s#@7_ULDIq-D0#iIxrnrvalNVhS5eoKY&8?2j_Z-HI0q*EX zDmF=btjt)x9`6+S;#=`+qLD+wFm&QgU!=>$Q&wNF8Ph(hUa~scC#k5z`>peG#y(H1 zo^}7~Kdd&fTdf}-TmxTTdciia>Q;YtGqfGojp>{m{AZOL8)kDEnSFKppfO-R_s^Oy zn@1_}i9*Y48SHDBpGZSy-_+U;T4Q$lc3T-qbW-AzPV|T?#g8YxBh0rU-U}Y!vabm-4sEkiXi&P)FUaJ~iGhex#n6 z=n)?-zBqA36qruIq@BQrv)`bBrDv28icMullgEr#03RlxZ&(~s%iiV~npljvGnDHf z8lUR`T3x_xE@v@G)iN7H=@s=b+j++Rl~28vS}Q)Hewn(`RILv6(MOlDdo)4)Ru1ck zOZFEpz0t0uG;{9ra+8oBh4`DQK7mjep^r_LYo5;5LT>)3ya}bvfL(wCfCmBR=$hk^ z2^1#*PXe9-JVT%Z*{MwKl5uhT6#5F9@Qf<21HKD*8}Jh~IoVaDC0E zQ)hV+t)~GT(~ANh($ diff --git a/tmw_config.py b/tmw_config.py index 60ec739..6594241 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -48,10 +48,10 @@ ### pretokenize ### Perform some preliminary tokenization. -inpath = wdir + "2_test/*.txt" +inpath = wdir + "2_segs/*.txt" substitutionsFile = "./extras/fr_pretokenize_subs.csv" -outfolder = wdir + "3_test/" -tmw.pretokenize(inpath, substitutionsFile, outfolder) +outfolder = wdir + "3_segs/" +#tmw.pretokenize(inpath, substitutionsFile, outfolder) ### call_treetagger ### Perform lemmatization and POS tagging. @@ -205,7 +205,7 @@ dpi = 300 height = 0 # for lineplot; 0=automatic mode = "line" # area|line for areaplot or lineplot -topics = ["48","67","199"] # list of one or several topics +topics = ["56"] # list of one or several topics #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) From 9267827bacaa3939d472af807bb1f13979f81626 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 28 Aug 2015 10:59:27 +0200 Subject: [PATCH 17/56] topItems: save with fix number of digits --- __pycache__/tmw.cpython-34.pyc | Bin 27799 -> 27784 bytes tmw.py | 6 +++--- tmw_config.py | 11 ++++++----- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 526672e938f5e42d98e93819dc9fc0123f1d2f43..8167bdc2aec72c92aeb5b9094fd02337acb34840 100644 GIT binary patch delta 7642 zcma)B32a=)d497OxyzdrNgb59lub!1o~9Hjk}T?=D2Y0#i`G@CtEb(^)r!k~n|Vu# zq9|{iz(Jg(Et+ZK)JmNua9h_7+O*pQXzZc}5+F`cH$dVP@g#wRqN%GkXxh}hoD}Z= z{j=gOB`UB4eVR9KX8wQv`Tk?xKK+LH=o=#ZueEiNu~k2Q>^J``{L6sQDimBt*cbec z(4P!FI-tX6j~+R0OV5!yn8W`%;^k#J7th-%PuH8alX0e?UvIo!Pt&FsdJhI>bixzo9~?mxjoQdc5l5g)lW^0oj6{HGq`_I+)0~o)Zh`D$hw_ zZP|tHz_gomWDLVLEbMN*A%x_C;g3BP*xM%5om?gi#451L)47b z>MGak&)NAaa;5q~P5&-dpdO zPF|~nwcSIkKh)0J(~ho^PKF!x*Ik+PGUz+3BV%jvYGr;f8L#PvgtWT?!}9R zt;j|^HUXLeZ0==fZ2@crybthxKw%qj6`+GahwNO=$tL9QWDCf=v6 zO1RTGD9E<-Pun>ia=e6{b97+Jx#7tUb-QInKdV*k&0O~OgBSs{Hd`3XGwxDT;V-JOZAc`FBW+0`2G#TJ+X~F6u+fSYfNJFFGnaGHO*o#% z{!l?8mJ>+WnG88mH65p$fhnVy7NXFQqKAcrq(j}Tv#v5H?-|-1ME?fMY9oS>{iki> zSf!rVmJf@m;+K~X1$Wa((I2nt63-TW>$`m; zpC%2*iHcKcaz+Yn9P*xq9Wo`|DW8yP83`x3!`lp?1+SJ_j8Ru(4{t}xOaJ@3@4m|k zgj?zVq{mpJIj-Nu$ZABQIKpFfIuDAGz)|&TtRwggFLWo?8GMdgbsKvdb)7LxItzAD z{vX@8=3yPCS#i#C+E2|=Tu&#nF2x2mq*fo_xN5VGz0B4N?S7 zIkB*O6K> zGIrjH+t<`*H*FC=DSmg;sIRc#Xp;SqeN4UqRp40?m6|ca2({*X(GBm2m=3VW$Yff2 z`7@~dv}v1qCW4WsmW#C+=9I1QM_BMpzzcu^I~$!EvXJ(ZqiKJ}O-SZ4a)crDi)ek3 zK!;8_GEELtD@~yp$AMH-6=goKk!wpX3@!Jg{)u8GF3i}}2mT3*M%Wv9C-Nl>I0X1> zz+d8hV|k}WJhJ*?;#}N1E`DD8;?^Je4jV(34a+XfQH@OsOC8*>~XHygV>3RTy z=m^KAynUG~FIVQ}57g`1ySsD?>>4-0B3@!dC2NZ-cO*oCO9)8N=Gf?V5vHo$Ofc5| zDYX7)0wUkS7IhQvK3*Y2w^COr=2v!`sKp~a`9Baz?HI$LQmN9~{!vDJM(id!DkIyQ{5$a-F46DUi++DfmdL zJo}n6nNG<*ni6X;YM0p%#gerU*a~3z0<8!DT+t0Qdqz5(KiNZSOC3fEifh>>teB_U<{ANF8oAs!^-6wrrqWHw_-6D?joonS1Bsw;2ryMVukS^U^lH1Ld z$)pDDKR$Luu7y(NB*Ff-kSTNk6REnV4bw(Dp;uHz_b%T*kLEpdx1g_!Kv%mmol0lz zjJX#2FT1l;*cu%w`KTV>yIJ(Ao{4Xn%MmPAjh~jULwg?j&IqsVsuzdFBw`uNva-t<<$UlX&0oeZqv>rk0zaT5@ zkvDSPKq?NQyPrVUldny1HE`+*mG(M7g~>D$!Foy#R6*`TJqyt~lBDK@dpzTo3igsy z9Xjxc?@>{_b>K;{5-g!A$IaXMw41$i9x60L))!RA*h=wvwRLRckUu{Wp9}QovMDBW zsY23$99CWdr2?cYT@h#V>BJOW4uyJVZ2g6$JZ&}BW{;_sJJ8+)*oHO2lP-NUp~OR9 z7_K63OGg(t4AIrM@~5~3aRLk z3DwJoyLY}HV~zsG0qmkIhwaQdy$>&-n9;LR_L?ID;*=Uc^3@Y2cJo2gNK`&c9Wis@ z1-uXN8Jbin}s*lO#;mX)lvRwV1JWIYdQpjkdKPpdRqQ+|u8PYeuO z$tLx)V^Ptp8jrW0e%s2|SXb+<`6?(NvKCs&7OS1)v?rHYR~xK(KVG#bTdnq6)x1Wd zHBVZ#S@WR<#hM^rB!kweSo>VJ`qIHR^|9l9g`F_WFklzK_*{S-E;>_3`;Jb2?PVDy zgQgSPqsCJ{Pu+4a29E$ZukJ=`58w-UOAQM*I{qU^>2WTUAB$hU(KlDscXRvTrsS3?4WF5_OmwbW7WqPZ>FRl( z(|z+q@5*k`LVq6;we(Ff;;*M~i;bd_dgR$+!^yLvQ1-?_$mG;Iffl$V&k|4$IgWOe z%whcD1RhTUP6HMxV;Ls1s+?X~Usj(3NQGn4dRqPCseR%clF0?*oV1{2DpWq1ax#&9 z^y)!bgeFmfMZh&RHp{CJrA|CHAkG(`c+9DY`xRo?mb#>TQOW4INR?qT z@0TgU+-YThiNbvMad`CnsK^#Se!gB5a9N#eh$2DeskDwxx^jDToPuQCz2t|VP)>MDaq3v$eQsa6&7mwg&o2rs;*Ge7rt3|Pjn4e@TQ{Z z!A`*uc>%IP5P1MAr@sF+MO z^V0{YBXiJ>sm+WHml>;T@v8>i)R3ckvgTR^?#f@soJ)WueO^5m-!5eF&G>E6z+rm~ z`|@S3(?#GZyD!v?nOD@ycE|EGHFe~WeMQbWSBQ;wL$&%hyG@L!!O7tcu&ead)8xrY zm$xsUWnPcp{Y3O$L-SHmnXOsoaj6=LV#iEj4s}c?R48{p0JR;lFFTq0Nj;tz5p%^) zCcY;cD>TMpkE!dBX+;KK^!YuL{fc{zXCR&`ouA{|v>H1Z6!K7LB zTa!mi#_^TcphOd?__A3e1&I(n;W#kH=|R={v+YrG;_~z<%T$;DZG8a|LIwavs$RORrbJT|)Htox<%*rW@psYr} j5dg9P7r?P&@lnVR;RRKBmL!8TS6&do4N3+2E99u9z&PvX0rT+g9G1u?i)}RMUZzU4Fwb%*G4F zUeGNi5qPoD}>GvSj_^mGE+2wZXJbO*`jc!W2=5&*=u! z_Dm51vaP%`siGUOoOG&axxQ)Zq9fbYlWi-8A(k0(X3seLzycpyau=5LODkKoVM56L z)Fpe>zqD--M=Q&gmqa;4JymNZBHB0#zaV4;0Suz z3~&V2n8X~jDV?`ctYOeh#_+b0RZ!JKv}2Q)sO;&;2r;0Z?c82w`h>MqEC57v8|ZXJ zX{Rm6Ve;1rgMD0II+f3p$Ark7b{?vPG;V<6K$adB7LpG`0NOK#Ibrg&vlK6M=D?L+T=UI#)ikfBf{6c^!K`l!1Yebak-mXrg!|v;kocVvxVt` zeZDYa`smHWruX5oqhK*fykYrKOSQDik{12s`A;MkBY+&QP;#=fP+y( zBB<5EW@piE(#UbF6LuSCh(A^-$(TC=%F72+KO3zN-87AL9w%fzq+dyx>&#W#!T z$`9hNd-g3U1n5o9i6CcSDFoNeYi2iZs>7*4?qiY$#e(K5#NOo2m^bZA;5> z$>`8HKPlEC%SIV2W#!(=~l6`5O{nBN%OBA%|S9rBAZ?|+rXCIWiXOVpN61D*h| zLj4qhvsNN3e+OL`0UsrBqcbUo6s&h(=6wwB<^WFtK7&z_`cd_IpqJ{|;ePQ<@*+0cJf*Rc?JuT0bpGUMN^d9$WvJ?#BKTefCT|1i}MHQt9Q32Hf|JO zsQl$dC3?ZjIx^1E*S}i`Wp4_x8aa*Z4I%<1hy$k9Wn|jO7 zQj;tRiQJ&Bq#PX_IKHt|s6EVJR@$>gD`U2#&Sk6K#+)&miAA${_Gw*g)t))t)Ex@D zbW<)%&*oDlE15c{es}X$QK|gn<}pus!OmJ9;eBR@c`IZy1)cSOSV zVN17RGAEtVaTIx`ZtHrcz7*)t%&BSNSFzw10lx+)!`WapWFgl}j^=vvc3Lu*J1~L? z{Ww~`PGAO(Su$riW<=_$G6{p!RTVWpu#zLLc40`l5A_dI;SMavs^0&{ShRt?kxwE& zfp_-;6yUe;K3Y4e-X2+fF>y|B8y8=zJiqPxo)2n8))dPw%sOP1Lb+}@<)o?P%oVZ| z`{*tJf|v~)n@XuOTzT2DBwtW3@92x0ZP05{2a9Bt5%sLCtlXIvWkg3dnyby0@R?yM z(e(ta?XO_SpAZoF7LI7P^68Uxo0#p?RgL-e-8yRVNJqX%>zIvrowo?rW*qrRtp6zB z(*#s)`~}*74q!e~iF)tr;!zbE8M#rj9vXR&ZeLH0tS5YFq_53vph$1av;87fJVSg$?q}r4FbR+H}*s1(SR zh!kv+pp!ahP3E$4Elo+ZXt8T-VB-7@mV6n&`+ozizXpISZVSzxl@{kucFIV#L+`R0 z1HVIS8)kEcE)wLvNNcEapq+$QpQom?UNyAqsOMFqa&6Zxk%WEc+PMUY9hgaFEhmZ``D|NB-sC0N5~XZ+0sO+?&-jEP<`J~vE5z!GMIML+=Xh( z4cRi6%@tC4eFyZOu`igxstomfR9ALy@%+$Gt0umsv%8KU(6Wv%*QhnYsm;}{2fI26 zzrGX08DuR^{I+^=Pe(g@E1Y%FMnBp!;JI$7<$M2ji2aR&G+iJ#9M^&la=W}hfZG>U zyJgifHsHsSrrJ3cUsRc7$L{pJVN^aj)+Tgtqb``^&rOwKmJ<-oBis_k7LRVFhCPXm-4sD#{$f)-o54H;@q z*bn6GYQwc4%F1 znVjLm3E)4m zMtIVuFC~^T0$R$_Oa&O3LO4vRQ^wL%J$iVPXi|?Kezl%eOe5wNl`2c^z+A!(uAf}Q zk~FMF9Hh1b#1)5mQ@8nlK^C?`-RZDMCnW^++W4S|s-`0=#HboLvU`u$cuW|bnUK+0 z62`pHPhsk5FfxtwBt*$;ob?--h=H!Ag|2`x@6lb#KG8nVY^=z%7?~({wbCe$n5S9G zjA<{TBo+x9nKt!LM`G#aM(5?AQED{KMvSvf#(WUFIst*6=IWy;7-(hMwO~~-J7{Kz z7i%%*Lq?|4m=7#S6g4RFi;UkmmgrKCJ}_2(3!3I z0Q`zPiNHgvNH4$u2!IdZ2fVkJg|&T|p-gLJpj()~9G$ng*jFDMT)9oO(f?~iggzTK zc$?|-VN`Tck8J&;qbEeU=6e5*B{-ipL8>UAm4FgQ7;W}0CVvAS8v!kVo4l~*fSJ#%TN-2Ko7(N>WU zS#`0#ZkrEdSKeAXY8#$^4}e|ANd>p_S-NeNGV(Vt6ro>Nyt>wDZ(#cf`giBG)*)f@~<`1c^pXTIMUS%zR}X*HQFj~J>vmi~^2)=3Q6?^jdS_Q-Q+vXj1L zs28k(8)|W_JA0rOnE=o~`oROOx6xXM7V9PcUV6dM_3G;6;i_^xwRNf~%=OdHB>9WC zPhy<9YR>;Y^i}oyMf&QlR&9fj`xd_+O}DGhO@`Ft*?zIP@_cq)l$k1ljFrZ(9e2^J z>dz44loe_rpbyIp0X7rRPZXTIYH@%=fDSuQ1y_&wB&|8jF?lU!-x3Ve=W}D?ZuOJg zeDvlMtBYq>w_n8~e8M Date: Fri, 28 Aug 2015 11:07:12 +0200 Subject: [PATCH 18/56] make_wordle: reorganized with central function --- tmw.py | 95 +++++++++++++++++++++++++++------------------------------- 1 file changed, 44 insertions(+), 51 deletions(-) diff --git a/tmw.py b/tmw.py index 9dc1e89..3de72f1 100644 --- a/tmw.py +++ b/tmw.py @@ -384,10 +384,8 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in """Function to perform topic modeling with Mallet.""" print("\nLaunched call_mallet_modeling.") - ### Getting ready. import os import subprocess - if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -513,7 +511,8 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics): """Builds the mastermatrix uniting all information about texts and topic scores.""" - print("\nLaunched create_mastermatrix. (This could take a while.)") + print("\nLaunched create_mastermatrix.") + print("(Warning: This is very memory-intensive and may take a while.)") if not os.path.exists(outfolder): os.makedirs(outfolder) mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, @@ -593,73 +592,67 @@ def save_firstWords(topicWordFile, outfolder, filename): import matplotlib.pyplot as plt - ################################# # make_wordle_from_mallet # ################################# -def make_wordle_from_mallet(word_weights_file,topics,words,outfolder, +from wordcloud import WordCloud +import random + + +def read_mallet_output(word_weights_file): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + word_scores = pd.read_table(word_weights_file, header=None, sep="\t") + word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False]) + word_scores_grouped = word_scores.groupby(0) + #print(word_scores.head()) + return word_scores_grouped + +def get_wordlewords(words, word_weights_file, topic): + """Transform Mallet output for wordle generation.""" + topic_word_scores = read_mallet_output(word_weights_file).get_group(topic) + top_topic_word_scores = topic_word_scores.iloc[0:words] + topic_words = top_topic_word_scores.loc[:,1].tolist() + word_scores = top_topic_word_scores.loc[:,2].tolist() + wordlewords = "" + j = 0 + for word in topic_words: + word = word + score = word_scores[j] + j += 1 + wordlewords = wordlewords + ((word + " ") * score) + return wordlewords + +def get_color_scale(word, font_size, position, orientation, random_state=None): + """ Create color scheme for wordle.""" + #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. + return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background + +def make_wordle_from_mallet(word_weights_file, + topics,words,outfolder, font_path, dpi): """Generate wordles from Mallet output, using the wordcloud module.""" print("\nLaunched make_wordle_from_mallet.") - - from wordcloud import WordCloud - import random - - if not os.path.exists(outfolder): - os.makedirs(outfolder) - - def read_mallet_output(word_weights_file): - """Reads Mallet output (topics with words and word weights) into dataframe.""" - word_scores = pd.read_table(word_weights_file, header=None, sep="\t") - word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False]) - word_scores_grouped = word_scores.groupby(0) - #print(word_scores.head()) - return word_scores_grouped - - def get_wordlewords(words,topic): - """Transform Mallet output for wordle generation.""" - topic_word_scores = read_mallet_output(word_weights_file).get_group(topic) - top_topic_word_scores = topic_word_scores.iloc[0:words] - topic_words = top_topic_word_scores.loc[:,1].tolist() - word_scores = top_topic_word_scores.loc[:,2].tolist() - wordlewords = "" - j = 0 - for word in topic_words: - word = word - score = word_scores[j] - j += 1 - wordlewords = wordlewords + ((word + " ") * score) - return wordlewords - - def get_color_scale(word, font_size, position, orientation, random_state=None): - """ Create color scheme for wordle.""" - #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. - return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background - -# TODO: pack this into a proper separate function. - - ## Creates the wordle visualisation, using results from the above functions. for topic in range(0,topics): - ## Defines filename and title for the wordle image. - figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png" - figure_title = "topic "+ str(topic) ## Gets the text for one topic. - text = get_wordlewords(words,topic) - #print(text) - ## Generates, recolors and saves the wordcloud. - #original# wordcloud = WordCloud(background_color="white", margin=5).generate(text) - #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf" + text = get_wordlewords(words, word_weights_file, topic) wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) default_colors = wordcloud.to_array() + figure_title = "topic "+ str(topic) plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) plt.title(figure_title, fontsize=24) plt.axis("off") + + ## Saving the image file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png" plt.savefig(outfolder + figure_filename, dpi=dpi) plt.close() print("Done.") + def crop_images(inpath, outfolder, left, upper, right, lower): """ Function to crop wordle files.""" From 94b9bfdac9f106a2284aa9f59c3259124311689b Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 28 Aug 2015 12:05:33 +0200 Subject: [PATCH 19/56] Move extras (stoplists) into wdir folder --- __pycache__/tmw.cpython-34.pyc | Bin 27784 -> 27692 bytes tmw_config.py | 10 +++++----- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 8167bdc2aec72c92aeb5b9094fd02337acb34840..aa754bd23bfce6f7330967bc0ad99dd9b9cfb517 100644 GIT binary patch delta 3019 zcma)8YfN0n6+Y+g`@z#-z(80B3|?%5jf1fXU=r*)U~GedV6Ox8vU9<^UUt{B7h}jG z6yfyoD{4oI6e+6OU#+9KQhZyfRh8C1s;bJ5#!{s=S@n@9O&h6AtJbQbrnS;@W-)et zRO;QGv){})bKWy|zw$1={th<(HCz&0Zh3F&{%Zgqz$1@~@F0!c_ugop!3_Y>3aQ6n zsR?3DfP&@-sR+a(Alg9eg&P2|-5{bM+BJTh1;joO`{4#ctQo`s5FMJ;0-_VdL5=r- zI0T|g<5m!dK|G-`>Ffq^MB}|6HhMrD)e|ys3`DQS?I4~6(Wfz4>IX5P@qQ4;K@4h4 zR-Xd#w8kB<<##8<4uTj0aYC{L7dn4D2NFVlNyu9#f0@jtQW)-h-dXWd5)Lc%-FIyrKmzHT;b4zB1!c45?wt z$wP1j(fQ%Qdj?iGw+{c*h5ME4>2Hc+AuyehVknha7I6~R54W11#*p)yV^g?s$si>> z2csU_vCNR45{|PJKOq`!8bdU44iU;h9w}T%BIFGQ>n_Nzr zhjjOlSe|kp;9At=gHTmN+pn&|G_6iVeQO<)(oE+TGIA+$#!RIwJCa$pv&*)c>+AG- zbj<3dzIuO1j!=;D6LqWai^{#TT($Ifil?U9z3{MY4}jp={Bz=i-@EM$YBpX3COW0~%CF8Dt|~^~Pe=Mh`)5 zIh}62-heipaYLuypmcEG zT?tB_^~VM#{-9Uu6gZDHW7~$g2kLK&`Fd5b$_5yePf_DQgDfE zV$E|RleRj}Myq6)i*=I;7hK*`=JA?#*+>foS2krAyosgU<;+UKCoQc{m#pBi6E;=y zT~O~`tl8*hX`6DK}$0CeEY@(XUPk7o&q!3u4LmTHqTPPLQ zg1Ma9>Xuv~OzM`UHK&ag%C=?;iIkN#m#pnRQ|*IItt;GBkCQ@h2XUuSi7j;Dm17f@ z&}RphxGHH@2cz>$CYVe!*$VDk6DKgL{y5Q%F{fejDXgC0 zaDrq8DAmnQRc87E&ZwVHH{-nXm+3eMK6WwN z^e|=}t`7WR%*AlIODWv@7k8CwuO5xGE!Kr=Di+&~87Ch*gst;zF2hcQiV>O`&j^QA+mRO54Pm{kpvb+@1S1dMj8yp>CNC zm{f0>FXQi(Isa_S|I-rG4UI<|c)vc4e)%Nh2-z#vPoGPju5-jFg!Gk>8#U*xlXn>@5B*;M~*Hyzp+_OsXpX6+V`sPA! z4|Agx`lcy(%xpHbD*tB;x|xf0<{0RtbuLyHoB5&7Yix;|xrOxm@ek@ti+k_|=k>*} z;_L?Twy!r&@p`LdA9K9vc_uFqp$kg4`NJmA^?%q#oEO>28?{~Cwi4Byi5m4+HPf;<+ZC!J|CW>7X!!}OxPLOu5jlLRh}xPj&4e| p>}lq|gY+@NzVO~~MYuLx8Lkhvl=;dr>jLJ|n!uec$QQF`!a2Rnx2d^Sx& z&1jk}Z7x<#{ibQ#%{J|3m7=NL(WG^|SgYk~+C{Fq##*IKm7;Xrx{GQ`G*#N(_cKYz zv`IVoJiOok^FGgej(zK6eDf|gePNdShwK0J;#;=>K7~z}i{b&gvzOj$n#1*-AX?xX z0JGab>;e(d@v|UygJ{)p7(^S0b{%gA(E(zQj-Lb338G8KO(42K?A38Ihv#u< z10Z^IOa~nV@w|>(K=gt*MDaSz(y0IfAsvBPI<*hPVLho8L_dflIwl4KAdc#|9mFvZ z$8}6>20@(A@g5K-L7dVtF&hFgtm7^ar$LP9m{^_xF{@YyMK$L)RQ>TGBO3?#(52UD#j-7|sDE9~56hId65li6_0UP%iz z(Rrr~4f3dhjdMBeU0)hl{Ah?@QY;E1m7{ka0`QFInkyJU4Hz-PGS*<_RC0y)$U5W~YR4(D+^{2*5Hjh)) zxM7Wqqusr5rI+m+oa^S=AU_!0Y;zq|1^uJFduSRN&1QL^mb|a7rRA5d!Zh_xMZBvW zQE4q_6SiCmkLpCjw)%GOJ}gnc>#cYBvIl1te|`*Oi6{pNH5r5@89Q-rlZKrVZmxxtL4GrV0`edlFx{jHY zWaGI^!E+^-j9tvi230ZKhmC4vxCghX>%;B%k$P{q1;?Gc!~KZkDl~H3Gs;H3{ei{=tmiKaFjTfsY_$EIPKgV z`+&Kfo>*u*$-6T(*}yQ&aGK#1!w|y=!x;iLzqjZ+cWL8FF&a^|LX zFU)=0)UAd3Dyq)mTzn}zzhKGOq9t>6DmvMM=bZdx8at!BWSZd|0ae8W$9j1ON0SUM zFq~x=RpaMFs&lFfJJrI}ewwFOw^>{#@p{Cwgcpv?x#XcqBz9SHEmzB3cS>QHq1yp{39& zY!=`rGu?PieKfNjmz{?*UD#^z++~7DSus_erOsDpsq@tvj;iSSLHvRG{`nTPoev4k zukqqfn8O^ZA4Z#0?OX`oSDkY~%s40KQdqfp22IPT4_@jvU*csiGu#JtbgV%&%$1f2 zD`!RRiL{-&tvco#Q95Vl|A3wbh<{i2#o{As zXXCr-lg#oJ2D(AM%FzJBWmFl<^2jUH`0DM%*&ZH>Y$?9p+*T%&zAFD`0=n#ru5$vd zPZHQL`oAS`m5=30Z&nQnAAYWk3p+foQD>}FUF)lMj$X*%+}EkS@va?Ls@VKO< z&CLwM(|O6)xsQH20n Date: Fri, 28 Aug 2015 12:06:10 +0200 Subject: [PATCH 20/56] Delete extras in tmw folder (they are project specific) --- extras/fr_pretokenize_subs.csv | 157 ------- extras/fr_stopwords_errors.txt | 738 -------------------------------- extras/fr_stopwords_project.txt | 55 --- 3 files changed, 950 deletions(-) delete mode 100644 extras/fr_pretokenize_subs.csv delete mode 100644 extras/fr_stopwords_errors.txt delete mode 100644 extras/fr_stopwords_project.txt diff --git a/extras/fr_pretokenize_subs.csv b/extras/fr_pretokenize_subs.csv deleted file mode 100644 index 273c5a8..0000000 --- a/extras/fr_pretokenize_subs.csv +++ /dev/null @@ -1,157 +0,0 @@ -"string§To§Find","string§To§Replace" -’,' -J,"Je " -qu'elle,que elle -"’","'" -"J'","Je " -"j'","je " -"S'","Se " -"s'","se " -"C'","Ce " -"c'","ce " -"N'","Ne " -"n'","ne " -"D'","De " -"d'","de " -"L'","Le " -"l'","la " -"T'","tu " -"t'","tu " -"-le"," le" -"-moi"," moi" -"m'","me " -"M'","Me " -"-je"," je" -"-il"," il" -"-on"," on" -"-lui"," lui" -"-elle"," elle" -"-nous"," nous" -"-vous"," vous" -"-nous"," nous" -"-ce"," ce" -"-tu"," tu" -"-toi"," toi" -"jusqu'à'","jusque à" -"aujourd'hui","aujourdhui" -"-t","" -"-y"," y" -"-en"," en" -"-ci"," ci" -"-là"," là" -"Qu'","Que " -"qu'","que " -"-même"," même" -" Il "," il " -" Ils "," ils " -" Elles "," elles " -" Elle "," elle " -" Je "," je " -" Tu "," tu " -" Toi "," toi " -" Nous "," nous " -" Vous "," vous " -" Mais "," mais " -" Ne "," ne " -" Et "," et " -" Pourquoi "," pourquoi " -" Alors "," alors " -" Aussi "," aussi " -" Car "," car " -" Au "," au " -" Ses "," ses " -" Se "," se " -" Moi "," moi " -" Toute "," toute " -" Tout "," tout " -" Hier "," hier " -" Non "," non " -" Comme "," comme " -" Dans "," dans " -" Pour "," pour " -" Voilà "," voilà " -" Son "," son " -" Une "," une " -" Un "," un " -" Où "," où " -" De "," de " -" Qui "," qui " -" Depuis "," depuis " -" Ça "," ça " -" Sur "," sur " -" Ensuite "," ensuite " -" Puis "," puis " -" On "," on " -" Si "," si " -" Même "," même " -" Toutefois "," toutefois " -" Ainsi "," ainsi " -" Aucun "," aucun " -" Ce "," ce " -" Ces "," ces " -" Toutes "," toutes " -" En "," en " -" Après "," après " -" Quel "," quel " -" Quelle "," quelle " -" Quand "," quand " -" Celle "," celle " -" Puisque "," puisque " -" Tous "," tous " -" Dès "," dès " -" Cet "," cet " -" Lorsque "," lorsque " -" Lui "," lui " -" Sauf "," sauf " -" Moins "," moins " -" Encore "," encore " -" Cependant "," cependant " -" Comment "," comment " -" Assez "," assez " -" Ma "," ma " -" Quelques "," quelques " -" Leurs "," leurs " -" Ceux "," ceux " -" Par "," par " -" Devant "," devant " -" Bien "," bien " -" Personne "," personne " -" Près "," près " -" Avant "," avant " -" Rien "," rien " -" Partout "," partout " -" Pourtant "," pourtant " -" Déjà "," déjà " -" Enfin "," enfin " -" Maintenant "," maintenant " -" Quoi "," quoi " -" Eh "," eh " -" Ah "," ah " -" Oh "," oh " -" Jamais "," jamais " -" Mon "," mon " -" Cela "," cela " -" Du "," du " -" Oui "," oui " -" Ou "," ou " -" Sa "," sa " -" Celui "," celui " -" Cette "," cette " -" Des "," des " -" Naturellement "," naturellement " -" Sans "," sans " -" Vos "," vos " -" Votre "," votre " -" Notre "," notre " -" Peut-être "," peut-être " -" Mes "," mes " -" Celle "," celle " -" Tant "," tant " -" Demain "," demain " -" Qu "," que " -" qu "," que " -" quelqu "," quelque " -" jusqu "," jusque " -" Jusqu "," jusque " -" aujourd hui "," aujourd'hui " -" "," " diff --git a/extras/fr_stopwords_errors.txt b/extras/fr_stopwords_errors.txt deleted file mode 100644 index 331dfad..0000000 --- a/extras/fr_stopwords_errors.txt +++ /dev/null @@ -1,738 +0,0 @@ -a -à -abord -aboutissant -achille -adieu -afin -aglaé -aglante -ah -ahi -ai -aidant -aie -ai-je -ailler -ailleurs -ainsi -ais -aise -aise -al -alexandre -aller -alors -angélique -annibal -après -arlequin -arrivant -assez -assurément -as-tu -a-t-elle -a-t-il -a-t-on -attends -atys -au -aucun -aucune -aucuns -aujour -aujourd -aujourd'hui -auprès -aussi -aussitôt -autant -autre -autrement -autres -aux -avant -avec -avecque -avez-vous -avoir -avoir -baccarat -bailli -bajazet -barbier -bas -bazile -beaucoup -bégayait -bel -ben -bérénice -bian -biau -bien -bientôt -bizarre -blaise -bon -bonne -bous -bout -brousse -brute -c -ça -çà -cab -calo -canadien -capucin -car -cassandre -caton -ce -cé -ceci -cela -celle -celles -celui -cent -cent -cents -cents -cependant -certain -ces -ces -césar -cesse -cesse -cet -cette -ceux -chacun -chaque -chatouilleuse -che -chère -cheux -chez -chourineur -ci -cinq -cinquante -claudine -clémence -colette -colin -combien -comme -comment -contre -courant -crois-moi -croyez-moi -croyez-vous -cru -crus -crût -cynthia -d -da -d'abord -d'ailleurs -damis -dan -dans -daphné -davantage -de -dé -debout -début -dedans -dehors -déjà -demain -depuis -dernier -dernière -des -dès -descendant -désormais -dessus -deux -devant -di -dire -dis-je -dis-moi -dis-tu -dites-moi -dites-vous -dit-il -dix -dix-huit -dix-neuf -dix-sept -do -dois-je -dom -donc -dont -dorante -dos -douze -drès -droite -du -dur -écoutez-moi -effraya -effrayait -effrayant -effrayé -effrayée -effrayer -effrayés -effrayons -eh -élise -elle -elle-même -elles -elles-mêmes -embrun -émeri -en -encor -encore -enfin -ensemble -entendant -entier -entrait -entre -essai -essaya -essayai -essayais -essayait -essayant -essayé -essayer -essayez -est -est-ce -est-elle -est-il -es-tu -et -étai -état -été -êtes-vous -être -êtres -eu -eun -eune -eus -eûs -eut -eût -eux -eux-mêmes -fa -fade -faible -faire -faites-vous -falloir -fatmé -faut-il -fi -figaro -fil -fin -fis -fit -fois -folles -force -fort -fossinde -frontin -fur -fût -gauche -gerfaut -gille -gilles -glisse -goualeuse -grave -gris -guère -guise -ha -haut -hé -hélas -heureux -hi -hier -hippolyte -ho -holà -homme-là -hors -hui -huit -hylas -i -ici -ii -il -ils -in -indifférent -indispensable -insu -irai -isabelle -isolé -itou -j -jamais -jason -jaune -je -jé -jean -jeté -jj -joyeux -juan -jugé -jusq -jusqu -jusque -jusques -juste -justement -l -la -là -là-bas -là-dedans -là-dessus -laisse-moi -laisser -laissez-moi -large -le -lé -léandre -les -leur -leurs -levant -levé -levé -li -lire -lis -lisai -lisaient -lisais -lisait -lisant -lisette -lisez -lisons -loin -long -longtemps -lors -lorsq -lorsque -lucas -lucile -lui -lui-même -lut -ly -m -ma -mac -magnier -maintenant -mais -mal -malgré -manière -manqué -margot -marie -marié -marmouset -marton -mathurin -mauvais -me -mé -méchant -même -mêmes -mêmes -ménandre -mes -mettre -mi -mien -mienne -miens -miens -mieux -mille -mille -mine -mis -mise -moderne -moi -moi-même -moins -mon -monsieu -monsir -morgué -mot -moue -moujik -muet -muet -muette -n -ne -né -nérine -ni -no -noir -nommés -non -nos -notre -nôtre -nous -noute -nouveaux -nu -nue -nul -oh -on -ons -ont -onze -oronte -ose -ou -où -oublie -oui -ous -ouvrait -ouvrant -ouvrons -palmure -palsangué -paquier -par -parbleu -parce -pareil -parfait -pargué -parler -parmi -parole -parsonne -part -parton -partout -paru -pas -paya -paya -payai -payais -payait -payât -payé -payées -payer -payés -payez -payons -peignait -pendant -pensé -per -perdre -personne -personnes -peu -peut-être -peut-il -peut-on -pierre -pis -pleurer -plu -plupart -plus -plusieurs -plut -plutôt -point -pompée -porter -possible -pou -pour -pourquoi -pourtant -pourvu -poussa -pouvez-vous -pouvoir -premier -première -prendre -près -presque -priant -prie -pris -prise -promptement -puis -puis-je -puisq -puisque -pût -pyrrhus -qu -quaker -quakeresse -quand -quant -quarante -quatorze -quatre -quatre-vingt -quatre-vingt-dix -que -qué -quel -quelle -quelles -quelq -quelque -quelque -quelquefois -quelques -quelqu'un -quels -queu -queuque -qui -quinze -quinze -quoi -quoiq -quoique -raide -ram -ramené -ramener -ramener -reçois -refait -rendre -rendre -rénine -rentré -reste -rester -rian -riche -rien -rocambole -rosine -rouge -s -sa -sachem -sais-tu -sanche -sans -saurer -savez-vous -savoir -scipion -se -second -seconde -secouée -seize -selon -semble -sembler -sen -serra -ses -seul -seule -seulement -si -sien -sienne -signifiant -sis -sitot -six -soi-même -soixante -soixante-dix -sommes -son -songez -sont -sophie -sortant -sortir -soudain -sous -soutenant -souvent -ste -sti -suis -suis -suivre -sujet -sur -sûr -surtout -sylla -sylvanire -t -ta -taisez-vous -tandis -tant -tantôt -tartarin -tatigué -te -té -tel -téléga -telle -tellement -tels -tenez -tenir -tenons -tente -terrier -tes -thésée -tien -tienne -tiennent -tiens -tient -timar -tirant -tirer -tirinte -tirsis -toi -toi-même -tom -tombée -ton -tôt -toujours -tous -tout -toute -toutefois -toutes -travers -treize -tremblant -tremblante -tremble -trente -très -trois -trop -trouvé -trouver -tu -tullie -turc -un -une -unknown - -ur -ursuline -utile -v -vanda -vela -velà -venir -venu -vers -veux-tu -vi -viant -vingt -vint -vis -vit -vite -vivant -vla -vlà -voici -voilà -voir -vois-je -vois-je -vois-tu -voit -vont -vos -votre -vôtre -vouloir -vous -vous-même -voute -voyant -vraiment -vue -waterproof -y -zoé diff --git a/extras/fr_stopwords_project.txt b/extras/fr_stopwords_project.txt deleted file mode 100644 index 1a3f48d..0000000 --- a/extras/fr_stopwords_project.txt +++ /dev/null @@ -1,55 +0,0 @@ -air -ais -an -année -bras -brousse -chose -chott -côté -coup -doute -effet -état -été -façon -fait -femme -fois -fond -genre -gens -heure -homme -instant -jour -lieu -main -mal -mètre -milieu -moment -monde -nom -nu -nue -oeil -œil -parole -pas -peine -personne -personnes -petit -pied -place -sens -sorte -suite -temps -tête -tour -travers -un -vieux -voix From d2485660f3e176f8c8e7e803a61e64d7b7ea8e52 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 28 Aug 2015 12:17:21 +0200 Subject: [PATCH 21/56] config: Moving and adding comments above function calls. --- tmw_config.py | 45 +++++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/tmw_config.py b/tmw_config.py index 9c52055..2802c7c 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -22,7 +22,7 @@ #print(help(topmod)) ### Set the general working directory. -wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash. +wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740d/" # end with slash. ################################ ### PREPROCESSING TEXTS ### @@ -36,11 +36,14 @@ ### segmenter ### Split entire texts into smaller segments. +### target: The desired length of each text segment in words. +### sizetolerancefactor: 1=exact target; >1 = some tolerance, e.g. 1.1= +/-10%. +### preserveparagraphs: True|False, whether \n from input are kept in output. inpath = wdir + "1_txt/*.txt" outfolder = wdir + "2_segs/" target = 600 -sizetolerancefactor = 1.1 # 1 = exact target; >1 = with some tolerance (1.1 = +/- 10%). -preserveparagraphs = True # True|False +sizetolerancefactor = 1.1 +preserveparagraphs = True #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) ### segments_to_bins: inpath, outfile @@ -66,7 +69,7 @@ outfolder = wdir + "5_lemmata/" mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # in tmw folder -tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors) +#tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors) @@ -81,20 +84,25 @@ outfolder = wdir + "6_mallet/" outfile = outfolder + "corpus.mallet" stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder -#tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project) +tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project) ### call_mallet_model ### Performs the actual topic modeling. +### num_topics: Number of different topics the model should find. +### optimize_interval: interval between hypermarameter optimization. +### num_iterations: How many times the model is improved. +### num_top_words: Number of words to save and display for each topic. +### num_threads: Number of parallel processing threads to use. mallet_path = "/home/christof/Programs/Mallet/bin/mallet" inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" num_topics = "250" optimize_interval = "100" -num_iterations = "5000" +num_iterations = "1000" num_top_words = "200" doc_topics_max = num_topics num_threads = "4" -#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) +tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -110,23 +118,23 @@ metadatafile = wdir+"/metadata.csv" topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" number_of_topics = 250 -#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) +tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. +### targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" -# targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration targets = ["author-name", "author-gender", "title", "decade", "subgenre", "idno", "segmentID", "narration", "protagonist-policier"] -#tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) +tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) ### save_firstWords ### Saves the first words of each topic to a separate file. topicWordFile = wdir+"6_mallet/topics-with-words.csv" outfolder = wdir+"7_aggregates/" filename = "firstWords.csv" -#tmw.save_firstWords(topicWordFile, outfolder, filename) +tmw.save_firstWords(topicWordFile, outfolder, filename) @@ -156,27 +164,28 @@ ### plot_topTopics ### For each item from a category, creates a barchart of the top topics. +### targetCategories: one or several: "author-name", "author-gender", "decade", "subgenre", "title" +### numberOfTopics: Must be the actual number of topics modeled before. averageDatasets = wdir+"/7_aggregates/avg*.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 250 # must be actual number of topics modeled. targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] -# one or several: "author-name", "author-gender", "decade", "subgenre", "title" topTopicsShown = 30 +numberOfTopics = 250 fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 outfolder = wdir+"/8_visuals/topTopics/" #tmw.plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder) -### plot_topItems +### plot_topItems ### ### For each topic, creates a barchart with top items from a category. +### targetCategories: one or several from the following list: +### "author-name", "decade", "subgenre", "gender", "idno", "title", "segmentID" averageDatasets = wdir+"/7_aggregates/avg*.csv" outfolder = wdir+"/8_visuals/topItems/" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" numberOfTopics = 250 # must be actual number of topics modeled. -targetCategories = ["segmentID"] -#targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender"] -# choose one or several from: author-name, decade, subgenre, gender, idno, title, segmentID +targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender", "segmentID"] topItemsShown = 30 fontscale = 0.8 height = 0 # 0=automatic and flexible @@ -196,7 +205,7 @@ dpi = 300 #tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) -### plot_topicsOverTime +### plot_topicsOverTime ### ### Creates lineplots or areaplots for topic development over time. averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" From 6901aa0316e66a44415b2d63c269092b971efa87 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 28 Aug 2015 16:30:26 +0200 Subject: [PATCH 22/56] Deactivate all --- tmw.py | 3 +-- tmw_config.py | 12 ++++++------ 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tmw.py b/tmw.py index 3de72f1..e3253a8 100644 --- a/tmw.py +++ b/tmw.py @@ -728,8 +728,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item, plt.xlabel("Topics", fontsize=13) if height != 0: plt.ylim((0.000,height)) - plt.tight_layout() - +< ## Saving the plot to disk. outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): diff --git a/tmw_config.py b/tmw_config.py index 2802c7c..425b2d2 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -84,7 +84,7 @@ outfolder = wdir + "6_mallet/" outfile = outfolder + "corpus.mallet" stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder -tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project) +#tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project) ### call_mallet_model ### Performs the actual topic modeling. @@ -102,7 +102,7 @@ num_top_words = "200" doc_topics_max = num_topics num_threads = "4" -tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) +#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -127,14 +127,14 @@ outfolder = wdir+"7_aggregates/" targets = ["author-name", "author-gender", "title", "decade", "subgenre", "idno", "segmentID", "narration", "protagonist-policier"] -tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) +#tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) ### save_firstWords ### Saves the first words of each topic to a separate file. topicWordFile = wdir+"6_mallet/topics-with-words.csv" outfolder = wdir+"7_aggregates/" filename = "firstWords.csv" -tmw.save_firstWords(topicWordFile, outfolder, filename) +#tmw.save_firstWords(topicWordFile, outfolder, filename) @@ -216,7 +216,7 @@ height = 0 # for lineplot; 0=automatic mode = "line" # area|line for areaplot or lineplot topics = ["25","60"] # list of one or several topics -tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) @@ -228,7 +228,7 @@ ## To read a specific segment, better than looking in the folder. segmentID = "rf0166§0118" outfolder = wdir+"/9_sel-segs/" -tmw.show_segment(wdir,segmentID, outfolder) +#tmw.show_segment(wdir,segmentID, outfolder) ### 6b - create_topicscores_lineplot inpath = wdir + "7_aggregates/*-lp.csv" # narrow down as needed From 962448b3f0352ece730b385c03e0c08c41d282a3 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 28 Aug 2015 16:58:56 +0200 Subject: [PATCH 23/56] NEW FEATURE: Added topicClustering --- __pycache__/tmw.cpython-34.pyc | Bin 27692 -> 29733 bytes tmw.py | 65 ++++++++++++++++++++++++++++++++- tmw_config.py | 22 ++++++++++- 3 files changed, 84 insertions(+), 3 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index aa754bd23bfce6f7330967bc0ad99dd9b9cfb517..94aed7693b6192515159a5b14338f75dd9772c3e 100644 GIT binary patch delta 2500 zcmZWreQZ-z6hG&+>(;Jaw_(>IAO+;3D9zLu=NAq)fdSiq7RLB+w!N?2>$-N__a<)X z(ik*oOhmax11A2Ze-MpIYD|nKe#O5`h<_L(Ph(6Z@e}{?BT8cQoZGRYzQ_A+ z-1|EpcdvYoFMWq=eh!8L-^;J}U3>%J8@TH6klaU~{mZ#EFB+-BopA6j5c@#vherSo z-VI_D!~rdD1~CTWpqB3eaR|g=E%$){5D3LBaBvHVBOvbAgVmZtKGLn)Q-W))yw%To;ITk*I*Qv&>O@qPaAe(JAF$~j?^v;ZblqY-8=hzLkJ*; zReI-rD=r#z=uWow2Cxf5WEUok;0-8~wGVf$$LJrd64VYlu^WR%2m>fnH5@7?yb40b_4gQANL=?!?l6YF2j3( zG&LdiSd6yNy=S$0b8IEf);=2>$CcWFUR!(7YilpPbqA?K19+--`p~n8=W0Di{z81G zmKcB4z^_i_ma9)PE$aGQM0I7HcFo?~ywwPs4y!Yn&3L|cA=75yx3%v?6!F(nx%X%b z@8|mPo0^fo@IUw3vtDRnjmr~#_@}zlx;aX0>&HvjLSH|9_9UnW)~^~qj>fzJ6-ojR z2p9pz8IPJ) ze!{W0T1C?h@oM&NQ>Mz(R@M#he0^~@^eBQK`m241m)b~LUvs#h$!i4QwKlVAjJ z8E(XGLq1OYXRq+(rzszOny6q!kc|6P<&K`}+=@-H%#2kO9NnBWkItB->};R3<; z%yhi19&-+|?BQHL%hFE_KQO$^@Ct)Yi%Trs$M8DCIT#^)jl{TLK1SwlD}~g`+NttB z+jQHDmhFt>8VM)gB~{le(oS0L;04}cIH3(~PRSv)YE$I)4t=2U`bgT5)QFZs3vfzI*4{k0PeDYP7@R; zV1yI~$|5f;6-|wTBO|Rs>;ow;c5e<2*eB}Un=_&wLat8T?E?@cmr)U@Le!zg&LO9t zt@H)!7Xf)}V?)Z4s0e{*BMoZ$G#u8x=!Qwg!^2#&)27Uoc3v*diGoeppvxC2f0A+UT}E5b(5AM6Zv1iORAKRRweiU0rr delta 705 zcmZXRUr3Wt7{=e{+njE0b8c;GSWTH|BhV*){x9RNFw!^N4l$)lHisj^Ktxg}5CVcyVpUcOr z98R~{H(gctemu-y@(o;0H^VFD(|6I6Rmf2#V^X3`dRo=3(q8XU)v%I`lAMyQgo#WH zQE?}8708H&tVhEzhjOEs6nZY8S^q{jKhxaUtIi!%vY(HqJGnX*#x-FbpF-GVnHUpj zVoad9J<0PEF|c?&aSwPd&P}cZFU8rZWep#>YkCK7$g6bcZ+Xm6FH~f|!VK}7d>`J4 z`U`FiH4(TN1is4yuaM+#)BUK5y_Y`x2RFH<)0WBZVn2RyvAB=-%&rDnt@71vl0oLb t)0ZSEdW2ni)ss@t5ha_C$uI`$^+nZK;u}T>%KXR}Mu2}CJ1y-_?GK9zrIP>v diff --git a/tmw.py b/tmw.py index e3253a8..a764673 100644 --- a/tmw.py +++ b/tmw.py @@ -728,7 +728,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item, plt.xlabel("Topics", fontsize=13) if height != 0: plt.ylim((0.000,height)) -< + ## Saving the plot to disk. outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): @@ -1039,6 +1039,69 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, +########################### +## topic_clustering ### +########################### + +import scipy.cluster as sc + +def get_topWordScores(wordWeightsFile, WordsPerTopic): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + print("- getting topWordScores...") + wordScores = pd.read_table(wordWeightsFile, header=None, sep="\t") + wordScores = wordScores.sort(columns=[0,2], axis=0, ascending=[True, False]) + topWordScores = wordScores.groupby(0).head(WordsPerTopic) + #print(topWordScores) + return topWordScores + +def build_scoreMatrix(topWordScores, topicsToUse): + """Transform Mallet output for wordle generation.""" + print("- building frequency table...") + topWordScores = topWordScores.groupby(0) + listOfWordScores = [] + for topic,data in topWordScores: + if topic in list(range(0,topicsToUse)): + words = data.loc[:,1].tolist() + scores = data.loc[:,2].tolist() + wordScores = dict(zip(words, scores)) + wordScores = pd.Series(wordScores, name=topic) + listOfWordScores.append(wordScores) + scoreMatrix = pd.concat(listOfWordScores, axis=1) + scoreMatrix = scoreMatrix.fillna(10) + #print(scoreMatrix.head) + scoreMatrix = scoreMatrix.T + return scoreMatrix + +def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): + print("- performing clustering...") + distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric) + #print(distanceMatrix) + sc.hierarchy.dendrogram(distanceMatrix) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 2) + plt.tight_layout() + #plt.show() + + ## Saving the image file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "clustering_"+method+"-"+metric+"-"+str(wordsPerTopic)+"words"+".svg" + plt.savefig(outfolder + figure_filename, dpi=300) + plt.close() + + +def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, + method, metric, topicsToUse): + """Display dendrogram of topic similarity using clustering.""" + print("Launched topicClustering.") + ## Gets the necessary data: the word scores for each topic + topWordScores = get_topWordScores(wordWeightsFile, wordsPerTopic) + ## Turn the data into a dataframe for further processing + scoreMatrix = build_scoreMatrix(topWordScores, topicsToUse) + ## Do clustering on the dataframe + perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) + print("Done.") + + ################################################################## ### OTHER / OBSOLETE ### diff --git a/tmw_config.py b/tmw_config.py index 425b2d2..f406646 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -22,7 +22,7 @@ #print(help(topmod)) ### Set the general working directory. -wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740d/" # end with slash. +wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash. ################################ ### PREPROCESSING TEXTS ### @@ -118,7 +118,7 @@ metadatafile = wdir+"/metadata.csv" topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" number_of_topics = 250 -tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) +#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. @@ -218,6 +218,24 @@ topics = ["25","60"] # list of one or several topics #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +### topic_clustering ### +### This function will create a dendrogram grouping topics based on their word weight similarity. +### wordsPerTopic: Number of top words for each topic to take into account for similarity measure. +### method: The clustering method used to build the dendrogram. +### Options: ward|single|complete|average|weighted|centroid|median +### See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html +### metric: The distance measure used to build the distance matrix. +### Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. +### See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html +wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt" +outfolder = wdir + "8_visuals/clustering/" +topicsToUse = 250 # should be identical to all topics modeled. +wordsPerTopic = 10 +method="complete" +metric="cosine" +tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, method, metric, topicsToUse) + + ################################ From 9ce79ae1167875d3e27dca450f64b839b6490203 Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 30 Aug 2015 11:03:59 +0200 Subject: [PATCH 24/56] More comments --- __pycache__/tmw.cpython-34.pyc | Bin 29733 -> 30060 bytes tmw.py | 21 +++++++++++++-------- tmw_config.py | 11 ++++++----- 3 files changed, 19 insertions(+), 13 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 94aed7693b6192515159a5b14338f75dd9772c3e..98f88eaf98a98ecdc9a250f1165a94e428d2809d 100644 GIT binary patch delta 947 zcmYjPOK1~O6uo!y?c~=qsgtB12b5}%_(4R`3Q8kt3nHkr3Ps|)F-}Zp(l=8qVM382 z{=ixWL=b6(x^p81x9(i)HY=AZ(xq$f4bnKnJM-SX=bm@o-1q(m{{DsPPBNwbdHwxH z;nPR;akEgW=y#T^rm*a~?i!9X^@W-zt>x_x%_1siF!($%vf?9%5cmqLqSA?g7luXz zL>NQ~RsqpNXeiK#f*&CUAm?K}R-E!6Vo(%u5Cb4o5D6CPQD~j)g;EqE6{@0qx);jq zh7urF27w%CA`2RM(8LgEUK0EeZ>EVH4XW!`kQ|VSG&C~Mpwck&+#TRZ_TLh1@|EW! zX=0dUS%|*>$?lu_)1(3+Rmsqs!W539 v+zbvZe$5)3?VSnvoN{YBE^p!9SFHDxVU2-Z$xViL3|qlNcO#NdmXgX}gQ(Hf delta 579 zcmXYs&uSA<6voe)%p~c}Yu4N?i{!dP2`y6Cc2-KgL~T-1f7%^gFA>BO9wv{cWMJ_y&UKq|PwEd*+_+yWc&(f8qNdsQ$P`^^;B? ze*bu+hT-|?*~VJG6Y5UXZag8s*Y~5=vqrMI7W&5@!aCwtbJVGsZ|n8v2!REG4XFbI z1WX(VvLMXC06^PN*AD0`=p5)gORSNED`4>$1PX!zq_&vjA;^yV{}&_nk3ovyX$4+t zovld`(u~MGT>x)RyWnk~C6>gPX1P%WU1AB2vdW_@4e+2@Hkt115?uoTT13VTK~=zE za8hW6{W$T7=U$G$qp z-F)iJ&&UDIJiXwB-Uu3Orah_l1+N4z sOJ{T)lq=HV@%{J_UNYi;DcBWACLIXg2@VBU3@+5nX)<)G?v!i&1DpkV_y7O^ diff --git a/tmw.py b/tmw.py index a764673..7748f66 100644 --- a/tmw.py +++ b/tmw.py @@ -1056,7 +1056,7 @@ def get_topWordScores(wordWeightsFile, WordsPerTopic): def build_scoreMatrix(topWordScores, topicsToUse): """Transform Mallet output for wordle generation.""" - print("- building frequency table...") + print("- building score matrix...") topWordScores = topWordScores.groupby(0) listOfWordScores = [] for topic,data in topWordScores: @@ -1076,29 +1076,34 @@ def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): print("- performing clustering...") distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric) #print(distanceMatrix) + plt.figure(figsize=(25,10)) sc.hierarchy.dendrogram(distanceMatrix) - plt.setp(plt.xticks()[1], rotation=90, fontsize = 2) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 6) + plt.title("Topic-Clustering Dendrogramm", fontsize=20) + plt.ylabel("Distanz", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance measure - "+str(wordsPerTopic)+" words", fontsize=16) plt.tight_layout() - #plt.show() ## Saving the image file. if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = "clustering_"+method+"-"+metric+"-"+str(wordsPerTopic)+"words"+".svg" - plt.savefig(outfolder + figure_filename, dpi=300) + figure_filename = "clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png" + plt.savefig(outfolder + figure_filename, dpi=600) plt.close() def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, - method, metric, topicsToUse): + methods, metrics, topicsToUse): """Display dendrogram of topic similarity using clustering.""" - print("Launched topicClustering.") + print("\nLaunched topicClustering.") ## Gets the necessary data: the word scores for each topic topWordScores = get_topWordScores(wordWeightsFile, wordsPerTopic) ## Turn the data into a dataframe for further processing scoreMatrix = build_scoreMatrix(topWordScores, topicsToUse) ## Do clustering on the dataframe - perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) + for method in methods: + for metric in metrics: + perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) print("Done.") diff --git a/tmw_config.py b/tmw_config.py index f406646..26847d8 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -218,7 +218,7 @@ topics = ["25","60"] # list of one or several topics #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) -### topic_clustering ### +### topicClustering ### ### This function will create a dendrogram grouping topics based on their word weight similarity. ### wordsPerTopic: Number of top words for each topic to take into account for similarity measure. ### method: The clustering method used to build the dendrogram. @@ -227,13 +227,14 @@ ### metric: The distance measure used to build the distance matrix. ### Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. ### See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html +### Interesting combination: *weighted+cosine wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt" outfolder = wdir + "8_visuals/clustering/" topicsToUse = 250 # should be identical to all topics modeled. -wordsPerTopic = 10 -method="complete" -metric="cosine" -tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, method, metric, topicsToUse) +wordsPerTopic = 50 +methods=["weighted"] # list +metrics=["cosine"] # list +tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse) From c6f2e21b9243ac2583204d5c73a128c380b5412d Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 30 Aug 2015 14:56:08 +0200 Subject: [PATCH 25/56] Added topic-based clustering / dendrogram for items --- __pycache__/tmw.cpython-34.pyc | Bin 30060 -> 32130 bytes tmw.py | 80 ++++++++++++++++++++++++++++++--- tmw_config.py | 65 ++++++++++++++++++++------- 3 files changed, 124 insertions(+), 21 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 98f88eaf98a98ecdc9a250f1165a94e428d2809d..50b060cd8883e588396d79447445916300105ef3 100644 GIT binary patch delta 2072 zcmZWqU2IfE6h3oz+5c^~+ikboQoP7dSd*8w3n08ZprbhV;<~6CZsdnwYo`hNw@*=nEPXV&ZpZ`vZD6^WB+q&&)Y< z&UeoK_NzE|S8V=05)GZd`_s|yzb5*b?s+_z@4>LQdTwsMZ9KD!iciq--6SVSo}^cZ zj_)NoN%EAQ?<1KbIi=@gB#ES;QkITqN#;pPEq{uI<7sPY$df!RQcI(=&l=*vrN#0# zV>n{Ug6-rgdCSb0b)WfCRY|+v<5X*fe8&BzdMA~vS!%kf%9C0)R+_gROBE_JkGkL1 zwume4-?e|DR|HWyq|W1|-gON%-g?@L8+7rtJRzeSzo>|c4MKeYl{~)b#}5o*jA7V+ z0vzrCq9C#G3ps*{f*M3vXD7a+rk5I6dui53YLFa*iXL7{gQSNfCct$heb~_QG}lpM z9TmOA6MXQ~tU>B!Ere2lWRMm~Ku3mXcV&xK3TvgSTFLi+CDI@KBqO|OMraM39VOX? z!&YqK?1$l*LFYT3#dtDpP^#&phEMyXC*yO2bWC<5H9hXe;%rMHHl(gHTx7V!aDrp{ z1J51-wEcEfIf%ackwrHc?f~{5Ry*)Y2UUbcafZtb998uNgFZ;_`iqyJ0n&bTgpVHu zw4+&_VEc%r4mei19b2|ErV5im#E7790^bZ2>}U$Obze=I|c891~7zu zoW~USfTYSWXLm&fizt&W$zr5fw1nP@{C8{B7-St0IMBm6Ps%tY;a^hrQqt*Sf`Y$E zndls{wxOF13ub&DYX~Ps#U3igslhIiozRlr<)x@Vv7cJnndGvC0le8F-uGBabtGVR zQVvqG8PfMm@Ij3bWJ4s^ueuG}@B)2fJUx84KA16=s~r?G^Q*N!GBTpxWTp(f#rQV! z#aGHrStVNqShCICQythbi)nrM0El6(^$9a;RivuUs9d?c%Txv48^|s<9rxAhrq=yE z$ZvLdSWje3J9ma1DU`uFa~iZ8smGXAnKjoY+hS5NL)4dmx_6|8;^>2s!ixqN-rp&R z3+&`O3>O&&7%nk<%rsNmd3qV(j@I@a2&-$n=Bnu2T3`A@UVg;D*Hx1lp=Ox7 zDGCJophzRk1;%v2AqFi5NX+e_V{@YzJ#>sMLJG=C4`}19k0AoJQtqrNS2Jx^r#orP zJU*4PEot)QO}kJolyc}NXU%#0KMbptW%t%xVy~x@J?;uR4CzRxVXbCB z>8w1*-``_+pFt-{XYZQ3O%0E9-5>eaIr0gFj{8v5Ko8P?PC;~|lu*~*in`VRDeG|W zA)N-#d~MYWNtdCjYG|;FaZP;^E{G L;t}DI;$Z>+mC#I* diff --git a/tmw.py b/tmw.py index 7748f66..c04190d 100644 --- a/tmw.py +++ b/tmw.py @@ -1039,8 +1039,9 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, + ########################### -## topic_clustering ### +## topicClustering ### ########################### import scipy.cluster as sc @@ -1072,7 +1073,7 @@ def build_scoreMatrix(topWordScores, topicsToUse): scoreMatrix = scoreMatrix.T return scoreMatrix -def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): +def perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): print("- performing clustering...") distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric) #print(distanceMatrix) @@ -1081,13 +1082,13 @@ def perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): plt.setp(plt.xticks()[1], rotation=90, fontsize = 6) plt.title("Topic-Clustering Dendrogramm", fontsize=20) plt.ylabel("Distanz", fontsize=16) - plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance measure - "+str(wordsPerTopic)+" words", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(wordsPerTopic)+" words", fontsize=16) plt.tight_layout() ## Saving the image file. if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = "clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png" + figure_filename = "topic-clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png" plt.savefig(outfolder + figure_filename, dpi=600) plt.close() @@ -1103,7 +1104,76 @@ def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, ## Do clustering on the dataframe for method in methods: for metric in metrics: - perform_clustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) + perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) + print("Done.") + + + +########################### +## itemClustering ### +########################### + +import scipy.cluster as sc + +def build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + print("- getting topWordScores...") + for averageFile in glob.glob(averageDatasets): + if targetCategory in averageFile: + itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",") + itemScores = itemScores.T + if sortingCriterium == "std": + itemScores["sorting"] = itemScores.std(axis=1) + elif sortingCriterium == "mean": + itemScores["sorting"] = itemScores.mean(axis=1) + itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) + itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] + itemScoreMatrix = itemScoreMatrix.T + #print(itemScoreMatrix) + return itemScoreMatrix + +def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, + topicsPerItem, sortingCriterium, figsize, outfolder): + print("- performing clustering...") + + ## Perform the actual clustering + itemDistanceMatrix = sc.hierarchy.linkage(itemScoreMatrix, method=method, metric=metric) + + ## Plot the distance matrix as a dendrogram + plt.figure(figsize=figsize) # TODO: this could be a a parameter. + itemLabels = itemScoreMatrix.index.values + sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="right") + + ## Format items labels to x-axis tick labels + plt.setp(plt.xticks()[1], rotation=90, fontsize = 12) + plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20) + plt.ylabel("Distance", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16) + plt.tight_layout() + + ## Save the image file. + print("- saving image file.") + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".png" + plt.savefig(outfolder + figure_filename, dpi=600) + plt.close() + +def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, + targetCategories, methods, metrics, sortingCriterium): + """Display dendrogram of topic-based item similarity using clustering.""" + print("\nLaunched itemClustering.") + for targetCategory in targetCategories: + ## Load topic scores per itema and turn into score matrix + itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium) + ## Do clustering on the dataframe + for method in methods: + for metric in metrics: + perform_itemClustering(itemScoreMatrix, targetCategory, + method, metric, topicsPerItem, + sortingCriterium, figsize, outfolder) print("Done.") diff --git a/tmw_config.py b/tmw_config.py index 26847d8..f44e880 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -15,8 +15,9 @@ # 1. Preprocessing Texts # 2. Topic Modeling # 3. Posprocessing Data -# 4. Visualization -# 5. Other / Obsolete +# 4. Basic Visualizations +# 5. Advanced Visualizations +# 6. Other / Obsolete import tmw #print(help(topmod)) @@ -139,7 +140,7 @@ ################################ -### VISUALIZATION ### +### BASIC VISUALIZATION ### ################################ ### make_wordle_from_mallet @@ -192,7 +193,13 @@ dpi = 300 #tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numberOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) -### plot_distinctiveness_heatmap + + +################################ +### ADVANCED VISUALIZATION ### +################################ + +### plot_distinctiveness_heatmap ### ### For each category, make a heatmap of most distinctive topics. averageDatasets = wdir+"/7_aggregates/avg*.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" @@ -219,23 +226,49 @@ #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) ### topicClustering ### -### This function will create a dendrogram grouping topics based on their word weight similarity. -### wordsPerTopic: Number of top words for each topic to take into account for similarity measure. -### method: The clustering method used to build the dendrogram. -### Options: ward|single|complete|average|weighted|centroid|median -### See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html -### metric: The distance measure used to build the distance matrix. -### Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. -### See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html -### Interesting combination: *weighted+cosine +# This function will create a dendrogram grouping topics based on their word weight similarity. +# Parameters +# wordsPerTopic: Number of top words for each topic to take into account for similarity measure. +# method: The clustering method used to build the dendrogram. +# Options: ward|single|complete|average|weighted|centroid|median +# See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html +# metric: The distance measure used to build the distance matrix. +# Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. +# See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html +# Interesting combination: *weighted+cosine wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt" outfolder = wdir + "8_visuals/clustering/" -topicsToUse = 250 # should be identical to all topics modeled. +topicsToUse = 250 # = all topics modeled wordsPerTopic = 50 methods=["weighted"] # list metrics=["cosine"] # list -tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse) - +#tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse) + +### itemClustering ### +# This function creates a dendrogram of items in a category (authors, titles). +# The clustering is based on the topic scores of the items. +# Input: the average topic score file for the category of interest. +# Parameters +# figsize: The size of the resulting figure in inches, width x height. +# sortingCriterium: Topics to be used are sorted by this criterium (descending) +# topicsPerItem: Number of top topics to be used as the basis for clustering. +# targetCategories: Things like author, title, year, depending on available data. +# method: The clustering method used to build the dendrogram. +# Options: ward|single|complete|average|weighted|centroid|median +# See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html +# metric: The distance measure used to build the distance matrix. +# Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. +# See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html +# Interesting combination: *weighted+cosine +averageDatasets = wdir+"/7_aggregates/avg*.csv" +figsize = (10,20) # width,height +outfolder = wdir + "8_visuals/clustering/" +topicsPerItem = 250 +sortingCriterium = "std" # std|mean +targetCategories = ["author-name", "title", "decade"] # list +methods=["weighted"] # list +metrics=["cosine"] # list +tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) From 4ef7019dc66006d469d2781052c9f563686fc2c3 Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 30 Aug 2015 15:07:00 +0200 Subject: [PATCH 26/56] Added some minor TODOs --- tmw.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index c04190d..e811975 100644 --- a/tmw.py +++ b/tmw.py @@ -1044,6 +1044,8 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, ## topicClustering ### ########################### +# TOOD: Add figsize and orientation parameters. + import scipy.cluster as sc def get_topWordScores(wordWeightsFile, WordsPerTopic): @@ -1110,9 +1112,11 @@ def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, ########################### -## itemClustering ### +## itemClustering ### ########################### +# TOOD: Add orientation to parameters. + import scipy.cluster as sc def build_itemScoreMatrix(averageDatasets, targetCategory, From ac05b364b23f66934d618b91b55ee73a8ac910b0 Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 30 Aug 2015 15:07:50 +0200 Subject: [PATCH 27/56] Added another TODO --- tmw.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tmw.py b/tmw.py index e811975..70661dc 100644 --- a/tmw.py +++ b/tmw.py @@ -1045,6 +1045,7 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, ########################### # TOOD: Add figsize and orientation parameters. +# TODO: Add "firstwords" as leaf labels instead of topic numbers. import scipy.cluster as sc From 8b0ea137897e024c6805e99b7b2d6b59200918a4 Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 30 Aug 2015 20:19:28 +0200 Subject: [PATCH 28/56] First attempt at PCA, not ready yet --- __pycache__/tmw.cpython-34.pyc | Bin 32130 -> 32973 bytes tmw.py | 64 +++++++++++++++++++++++++++++++-- tmw_config.py | 20 ++++++++--- 3 files changed, 78 insertions(+), 6 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 50b060cd8883e588396d79447445916300105ef3..fc565f93c63db31e0caa65d056bfe6f6a2d929f0 100644 GIT binary patch delta 1004 zcmYjPT}Tvh5TDugK3VfTO-+l|q~HP%Ng;?HDnBw4KRQA%LXO=%J-ypI|NFm`Yeno5 zB6^7SJy0;}r4No23VQCLpyzul-+B;wj-Z+C(DwMx&HQHmvopWnKKPAaHV(?9fG4Y&rAYs;AZ zoO|AO$NHwPxWE412hXT;fj2yyr8J9CKLr5wSoG8b8b+eGQ=XJSY>9kfdnG`lHQPb=> zqmVBaY|EC8K!QXcnZ!s`XdJv@^P2FQn#syqM2fWSWC|iL`Un)SI6c6ogKI*Lk(SaD ziHO+YbiX&g+R+h{X)$lfp={o=oovB&YHHC)i#>1pOv`#x(GuKmE-S4(P4=;Y_>ECV zP@)*YAU5$&Ifvpsbv%?KOa**=IcCT?l4JEUxJ=xDF9tH#B!;Vit3suLH6VqUHwI&j zx+F*dDna6`+50}$vOYXgvJFY2>aw8M2V~lGLMZIzSt(V_8oFZ?gyraI+tiD>g7gU{ zpp=uza`Ska`Yc$cUSBR*4h#|1$%%yekKPx|5-*kriz)VRvFY4myUpeq8=dwO{p{^{ zTd9`Qp?bT|%oWQ7Y7`sQ2x`hH6uVx&{H^UL=lqWs;}}EnfvzXUs;T=fUZ@>+mU32F z*h$m!)pxSItgo{_jo9Uv?Xlr|;yg>`9k|Evk@v?fc6B!?T11O$ty&v-aji>hkF{!z Inyx8-0c(5d6951J delta 225 zcmX@x$kg8_!$k;u(yh?F$f9+YZl`KG; z*+9-=6k-%$EZPXvtLe76sIG>Qy=WbfQ?zlibb}gW(`2WH0#P}BCLT^6As!JP5g<#5 KN18{5hY0`#3Ohmo diff --git a/tmw.py b/tmw.py index 70661dc..8502a5a 100644 --- a/tmw.py +++ b/tmw.py @@ -1151,7 +1151,7 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="right") ## Format items labels to x-axis tick labels - plt.setp(plt.xticks()[1], rotation=90, fontsize = 12) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 10) plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20) plt.ylabel("Distance", fontsize=16) plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16) @@ -1161,7 +1161,7 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, print("- saving image file.") if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".png" + figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".svg" plt.savefig(outfolder + figure_filename, dpi=600) plt.close() @@ -1183,6 +1183,66 @@ def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, + +########################### +## itemPCA ### +########################### + +from sklearn.decomposition import PCA + +#def build_itemScoreMatrix(averageDatasets, targetCategory, +# topicsPerItem, sortingCriterium): +# """Reads Mallet output (topics with words and word weights) into dataframe.""" +# print("- building item score matrix...") +# for averageFile in glob.glob(averageDatasets): +# if targetCategory in averageFile: +# itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",") +# itemScores = itemScores.T +# if sortingCriterium == "std": +# itemScores["sorting"] = itemScores.std(axis=1) +# elif sortingCriterium == "mean": +# itemScores["sorting"] = itemScores.mean(axis=1) +# itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) +# itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] +# itemScoreMatrix = itemScoreMatrix.T +# #print(itemScoreMatrix) +# return itemScoreMatrix + +def perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, + sortingCriterium, figsize, outfolder): + print("- doing the PCA...") + itemScoreMatrix = itemScoreMatrix.T + targetDimensions = 2 + pca = PCA(n_components=targetDimensions) + pca = pca.fit(itemScoreMatrix) + pca = pca.transform(itemScoreMatrix) +# plt.scatter(pca[0,0:20], pca[1,0:20]) + for i in list(range(0,len(pca)-1)): + plt.scatter(pca[i,:], pca[i+1,:]) + + +def itemPCA(averageDatasets, targetCategories, + topicsPerItem, sortingCriterium, figsize, outfolder): + """Function to perform PCA on per-item topic scores and plot the result.""" + print("Launched itemPCA.") + for targetCategory in targetCategories: + ## Load topic scores per item and turn into score matrix + ## (Using the function from itemClustering above!) + itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium) + ## Do clustering on the dataframe + perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, sortingCriterium, figsize, outfolder) + print("Done.") + + + + + + + + + + ################################################################## ### OTHER / OBSOLETE ### ################################################################## diff --git a/tmw_config.py b/tmw_config.py index f44e880..b73a14b 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -260,15 +260,27 @@ # Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. # See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html # Interesting combination: *weighted+cosine +averageDatasets = wdir+"/7_aggregates/avg*title.csv" +figsize = (10,80) # width,height +outfolder = wdir + "8_visuals/clustering/" +topicsPerItem = 250 +sortingCriterium = "std" # std|mean +targetCategories = ["title"] # list +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) + + +### itemPCA ### averageDatasets = wdir+"/7_aggregates/avg*.csv" -figsize = (10,20) # width,height +figsize = (10,10) # width,height outfolder = wdir + "8_visuals/clustering/" -topicsPerItem = 250 +topicsPerItem = 250 sortingCriterium = "std" # std|mean -targetCategories = ["author-name", "title", "decade"] # list +targetCategories = ["subgenre"] # list methods=["weighted"] # list metrics=["cosine"] # list -tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) +tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder) From 5af5e9096b98dcc2b8d7af51398f7955db437bb3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Schl=C3=B6r?= Date: Mon, 31 Aug 2015 16:05:35 +0200 Subject: [PATCH 29/56] generalized segments to bins with variable number of bins --- tmw.py | 53 ++++++++++++++++++++++++++++------------------------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/tmw.py b/tmw.py index dabf8f4..c12940b 100644 --- a/tmw.py +++ b/tmw.py @@ -242,10 +242,11 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor = 1, preserveparagr print("Done.") -def segments_to_bins(inpath, outfile): +def segments_to_bins(inpath, outfile, binsnb = 5): """Script for sorting text segments into bins.""" print("\nLaunched segments_to_bins.") + import math, sys import os import glob from collections import Counter @@ -254,10 +255,11 @@ def segments_to_bins(inpath, outfile): ### Define various objects for later use. txtids = [] segids = [] - #binsnb = 5 + filenames = [] binids = [] + offset = sys.maxsize # used to track wrong segmenting (i.e. with segment numbering not starting with 0) ### Get filenames, text identifiers, segment identifiers. for file in glob.glob(inpath): @@ -267,9 +269,13 @@ def segments_to_bins(inpath, outfile): segid = filename[-4:] #print(filename, txtid, segid) segids.append(segid) + offset = min(offset, int(segid)) #txtids_sr = pd.Series(txtids) #segids_sr = pd.Series(segids) + if offset > 0: + print("Warning! Segment numbering should start at 0. Using offset: " + str(offset)) + ### For each text identifier, get number of segments. txtids_ct = Counter(txtids) sum_segnbs = 0 @@ -280,14 +286,16 @@ def segments_to_bins(inpath, outfile): #print(txtid, segnb) print("Total number of segments: ", sum_segnbs) + for txtid in txtids_ct: + countsegs = txtids_ct[txtid] + if binsnb > int(countsegs): + print("Warning! You are expecting more bins than segments available! Bins will not be filled continuously!") ### Match each filename to the number of segments of the text. - bcount0 = 0 - bcount1 = 0 - bcount2 = 0 - bcount3 = 0 - bcount4 = 0 + bcount = dict() + for i in range(0, binsnb): + bcount[i] = 0 for file in glob.glob(inpath): filename = os.path.basename(file)[:-4] @@ -303,32 +311,27 @@ def segments_to_bins(inpath, outfile): #print(txtid,segid,segnb) binid = "" - segprop = int(segid) / int(segnb) + segprop = (int(segid) - offset) / int(segnb) #print(txtid, segid, segnb, segprop) - if segprop > 0 and segprop <= 0.21: - binid = 1 - bcount0 += 1 - if segprop > 0.21 and segprop <= 0.41: - binid = 2 - bcount1 += 1 - if segprop > 0.41 and segprop <= 0.61: - binid = 3 - bcount2 += 1 - if segprop > 0.61 and segprop <= 0.81: - binid = 4 - bcount3 += 1 - if segprop > 0.81 and segprop <= 1: - binid = 5 - bcount4 += 1 + + + binid = math.floor(segprop * binsnb) + + if binid == binsnb: # avoid 1.0 beeing in seperate bin (should never happen due to offset!) + print("Error: Segment numbering is wrong! Continuing anyway...") + binid -= 1 + + bcount[binid] += 1 + #print(segprop, binid) - filenames.append(filename[:10]) + filenames.append(filename[:11]) binids.append(binid) filenames_sr = pd.Series(filenames, name="filenames") binids_sr = pd.Series(binids, name="binids") files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1) - print("chunks per bin: ", bcount0,bcount1,bcount2,bcount3,bcount4) + print("chunks per bin: ", bcount) with open(outfile, "w") as outfile: files_and_bins.to_csv(outfile, index=False) From b62adc94d88173560a2b2dab1c4e1baf225ab7f8 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 31 Aug 2015 17:06:51 +0200 Subject: [PATCH 30/56] renamed my_tmw.py to tmw_config.py --- my_tmw.py => tmw_config.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename my_tmw.py => tmw_config.py (100%) diff --git a/my_tmw.py b/tmw_config.py similarity index 100% rename from my_tmw.py rename to tmw_config.py From 3607c7db2693790d92ffe67d7fee04ef5c9d08ca Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 31 Aug 2015 17:19:46 +0200 Subject: [PATCH 31/56] Remove traces of merge conflict --- __pycache__/tmw.cpython-34.pyc | Bin 32973 -> 35635 bytes tmw.py | 4 ---- tmw_config.py | 8 ++++---- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index fc565f93c63db31e0caa65d056bfe6f6a2d929f0..ccb18ae9e6026ee38f88806d334f8bbd82232363 100644 GIT binary patch delta 14900 zcmb7r32+?OdFFdP2QY&HFt`B{AP2lhBq35H#X}TDksu{r5Gg_g*&;O>OgDf5=fdj- z2|_@|w&Hk8<*Hcf*ohN6vE{_MT-jA2m5Lo#tgOrHwY|31PVAN2$vW3M#r4LrQoEZi zc^&Wf|8Foe8bWrKB!0Yp$N&EKzyI#$SKn={e8|}RgIH_yRO#=Z+VpK9{!Y~W37~x( zKTh_=%}g0Btu)2iR&23ER5mqm^>83%7xCtFX3ld!4Yh3u_0rfl!k@ zC}xL*xq%;(BDYcG6T(agSs5M{Ch(T;6V^^%w@FyLgteR7*hC6zL|FGTC$9TL`IZYPCxL|6}T8+#oU*2CQ1DXe3{dZf$` z*!Z}x9_99KVVw}xNp6FJQDL3p_Wi5p+9S-p!a6Oi$G9KVJT9y=+}YsTTcq>GCzRiabZ2h?Zd*lBCH8+gM7`jt-Rm}i8Q;`XEWEPqy5X&%BkZ%hg+!w)BgWeLmXHV%y~ zr$p|Q$e$MGX6f2NR^YoXoG>3gnEK*`(}D z@s_Q|7uS0o;q}t=qTz1<{#E$s{buCqh4UBF%CVInD$#E<>bNRfO1bH2$86JWJ7?$Z zg6o*x<4xE4Lr^iZWucUsELwM5d%^XC1WoVLO@ngS`&LuAyctteGeCwW(;s?>;^!oO zoE!j&m?6;0OR*dhaw#O1q*yY96%ezASPF>erRcMQqVIV_EQf^~5KCdP6cl*;#UZiW zB$k@QY*1JsF^hGArsJQS08h%7uF_ygJTF+_FDJueN@CNbR9*NrPt^ST-+u1Mp8eMH zVsOM=a8(lxhyA9aGg3;sGd#}^PZd=@?W#BiRUd#K%H)cUoox1-9DACWq6TPeJ&iSG zit_~*oID%M7cE<(pm~LK-c|{kH(UjqXhrO{%DQ$1(`EEH#3Lc&G9f$Vpj11(r<-q- znYLh!3dXFqVP-O@w&RytM<7WcLE!D%Fy$%qJ7E9;%)OHPGW?7#+Ld_1w{tPra68_y z$UfQXsYp_`c^`>fG7cHu8<9;gwRk(C`Eo?eM#XFkT`nf71BQ#9Smja99Wdq&89!xL7m~9o3VpEc z6}qb34HWRc;`sn53JIFDkrxBSEbKgsVE@5%i1h4oL5PL_?Y$B0Il3qCQz%#!-0I%w zsN2i~`>YbccU(PPwYzpk%}Hn>X5Pa|ee*TG((21{L}bt1pWu%g!OqDxwS@>EYvBiSc0p~W#~lFP*JC|Bq%f+N09ggq zLLlb785`q5&ev{&0;w!VBcq1?qTI27|y%62a2xEHfh%dxdh#3(|{SHH;$I& zGA3}xQVSXW<(P0`3=ogOJirh1%H`Ho%xJ_2GpI#`Mpk7R@tGmg8n!bcNF*LXM_jmV zRRcwsF@DcTS@bQ(SMhg}9cx!A`N7pJ|?w4ZD%&btX^J0&~gLQ}ks_}=7LQuMOX(j1Qxy74y0UD+PkU{U+_y{|sx8&}{=BE=kp4+?Y@G|S{VraBjut!t`(~`Z4 zb%Zse8y)H(fvQrOt%aO>bEJy;5RJAIh!JQd&_;k=ZwIwV`P3c)Wmd+$)O>(I4}c%m zs;@|8l0B-Q23QaG(`Xl&2)CHCRl1O|r_vd>sMG z{^6=R$Bl5HRfdfoiNArE+}0|$OZ5nr-=+8}L-#^|hVVnjVK+V(@LoCI<6Z1qU#{qI zkm9I{4u?cOT-9OdC=|Atf7nRWw)*a9$C?AwsOC^mWdkN@4Xqfi>9a_e*ajS5l*ZfRM+A8MWCnnH~Q;?tA zC`rN*dVq*lT0~xmG!hO58WJm^&m{ zVPJ2dbB2%$2la~k>sLVW3?{0F&=V1}Oa$*H=Vs~OgbOW&P3RR1!@`Z1iCnA#&e0oj zv5cG!&%I))PaK{5BRR3KS(2fsakW#f&DHg*TwOSzDcjAI;W+K$W>~0zK@26|41VYn z5>Lu7ZgT+x|0ZZ0M!R+!cxLOw_L5B;rNEU7fw^fD%l)-G0F(RaLb~bf>z4<}@XTTr z8iwd5cK3@V{KM@aM){0Ex6{snfJ8S;(IHHr>%rmsMQ8v$2Lnf8+esuq3Irv_X9Ow< zb57m;w3nRoq)bwyNVWrzsxgjD+p(Nu4*SAawUGML&y2C(^0_ z)WeC(I?YHF=JS)bqTQUC;(QMH5Lvj1w42y7l6cCYf#TGZW4nlh){S$(qn9L#<*DlC z7|c{rUn|Zh(#lTQ3l&e0$QSWQ#3kIBbfLN`k-nAA=F*cndpL1|MsH_xxkRDpCMN9! z3}Oz)%M=UX#e8wz$=w-#OK!Q_M28`3bhk_<-4}}oPbpPYN7gt}7ARGbj(bw0q2r_r zcW$TejEsz^U9>((@sb}Q`%5jy4};LG<@mw$Le^P~XJ+OLH=RTYnHC0)Bsl-{LkO$N z_QGj)N;Ob%Pks>D-Hab_?l{^HkO%TxGDRE-zOh(v)T7iBE~N`r+VPw6=>>8m+DWlj zA}j6(^4WqPz{PV+=m)KA#`QyRHPf~qnkwgtMdgPt-Ui4rzad z^aDjHx`4u@x_}2YMu1%uJ4dF*C|PD0IV%othT&82Tb-eKX94`M76Y9UQeJA7Lp<3G z@mw>am!>iKbUvY-90GyveKaoH$^5D2^m2sn0 z4g_O}dSh^mtukt~8LjAx!dZ3%Bgkpn44T^t92ry$F4$%C$1thYXbbisY$hMN z9?LqVy0P-Zj@|MgNf+xPlbJMpV`PsXRK?p)g9CpndXz3UQ_Po+JuSWQu21xEqRj_u zKAKDCC$03cXQdbG-ZxBfiyy$<`e9|KQ4LVt)Pov9zC`gvm-lRU-{Taypz6@!%uhOQ z)}3dWJ5A#hN0`mjPmg|(T8ff*$dx@^$rKQ&O`%I|@;=u+&^L~p$st`6!A{u@M)UnVr zFt0INGdVlWff6aIO3=U%bE?L!mXbm^rsK*<>HVZPA)S>(pC#ok@4fwpAK(}=QWrzA zWq4)2Hky^|F1L8;Qy(_nZ_ZRUvtiU`dn9l<#|nmkJVq;z~Qc}Q); zmHX|PbS{^2m2GRJx4oHxq+DEiY2bn2RhAsG*Kf(lT?)eP8iXq_rN&emEi*9o#D&Gk zQ5dE|@pj>u8N|pHGjQe9lV+Hjk3MP!9vwYP0Nu7TW*TEBRg*Wk;rQVy!;*%2hR_Bn zs6VTLb(xkIjtTBw#op-)@FzY{6MMb_#u`av!!6)UbRJ9(ui-+i+AByuVU3~%cpPJ zDUB4K`a@nck(KXTaT71dGNJ)7z8L1jC7Nu~tYUGgDMF1)w+MH{Z^=U_Y&XS)!rSQ6 zi3ST&eD&}t2p_8G+Zi^8(04ZJNc1pp)~IjtJHvZck7dztEHPzdEC)}7G%vuy+Ne|q29wpqUD{bbv7 z5l#_JkmtQ;`<~r(Gl}9D>L#f?^v;4T3_`Oi<;$qC!B8Jw`R?|uM)@U79D_#O%!1o6 z*b>jmBk&xdz@br8!I^R#L)>MHM^TnKMPM1g3>V>+;knQAf{J@OGwQ=qlyd2eJ zqpmYb?3?0K`vEl7cJIp0-K}O6VNqp~>3v}5;BW)4+lkkDE5^Cm@V>h9n0#d=wri(6 ztxga{zf0g{0v{#t>j2})sOA7Owx%)cI^ILGm;=Ng^+9iWcjDx4Q*#{vf-Za758zac z*pEP#sMMZqtbzHhL|b+AoZU_FiYVty|njKxmE~i(9c(~T3rKx(Va<`Y}g+rae(^&lz=9dT1@Nb zsMT-obTiS+$55iVv=5uuR^eylA_Q&Nfcz?2gtNhB!-|RnSl&e-i zv5{Z{z(-}Z=hCv~#9>giXe4&LC6nrLB0oBnRgODBg;`U#bvdU#gsPOcYOMGTtW;kj z@G%1CCjcqDXGeXR9s*|}%7)RU^buh!JevA~_ZNq^o-@1H?>KC5 zQWdFaB&d@YR~({tG5?u_o9;pw>*am2-B7Quj2?-|@~$c^k5P*f&}2xxj$dYkYzS|m zuje(I@HLvyq8arq>L6~J9oo;;(Ps9btK$CZyW!5sBT|>6{)>17i(BX+XX~y{5zaD! ze@mc2eA(pIO@G~MK00iCTX;KMrnEvJ+ebvGV*8Y+XTMBlc#NzJ)wnWGm=@@kn|m$1qVtw7ZuYy>J_*3 zOK4CILV+=aTnB*&8J;zzoa(!H!wOlP%1-MSoiX(-aTC*PvdvoXcZA|D0T%TQv@k+k z#e3R$%WShy>xE6xtZ*oAot{^Aiud>1D^Jw`vhIq!3&#iL|L|sy?=;>NE5CgFxR$TQ z9u5@~XY%Q3+eu`Q48xWk;mDxk4E?~xGw0Pm5luC>$cBQ=Klw&KNDHA2IAOBA7C)ng zY)_$bFb|}7A+NI2Gp<4fL~hNx$z7bRsDq!22roFT%TT^lQ7>$gp!lvdMm^wj)JO>O%lja7Pk! z-rIU=dx*9%y-TNt_t)JaP12DKzl1yE+IeS3byuSeMSy;z(@?>cPoCZ1{LZPH0xgoe2jnq57jr(qH@2hLopmeQumLAH}u%Rb)wa8LJBcTNe=De zM=7KjFJ87Xi6|AOS*;+Q z#D70nBGSIkO`NObhf#mZ+;r4G3hy`043t|4JIi;?SN(wcwgKS$5jwv?pa}qFJhCuk z#QgA#&0@|<%sTx;TE`4vz+4!pL^+dbm6<`|lgzQEf0P*F9XvZM2Uo70y-$v?ys+j4 zX}>6eL4ahNkB|eVYT72o;}j0v$Sdczmcuk>9f9=(YNYVBG1&ncAlmg&hO0E{OCt8v zXu3u;bz+JL;>9eNq^KU#BX>%b0FI17WB5b4NQ>~EI{!PDv<9-wVFm@XYf0P77-B(# zcl|N7+3c;YWMdZw`|zf=QWJubp}fWgGd1kJ`@&!92#K{!v4FGGBL-b--%qUW>xW=m zZG~wSk;ZLOlETV$8K%BaM^f`U@v9F)-9!B1WgJjz>0j{P9P5)?Rw7UQifn12sM@I2PJr_|H0aHh{-e5Hy0pGbSGrmW;qa@QW|KTaf&u#I zv7bN>fnEXw0DjCR=}hI)cMyHNqq9;qS&wJ|;frL+p&Qz|rgSw5y7#$DLkMF3>e6<( zZ6);NRjC~=%koc%N_M@&MB5eu+W?@TL)2!?VTss4j~fYWCeR?(HHl_V%HqScvW5=R zE(}<#+gn%Vn(yhO-uEvbklVfC@h|JlTieOC9ESS8!1Pl+g%;m6B^OKmUNU<#$ZYcI zQe8m$MUrP|cq`ezRj;vQ<@QrHVkRP<Uf{L4T1R@Pfg8aKG9i#TB7K*a>c;su_w63ttEw)Q`aEU*>oh zg?rbsZ#jO{Dduh6$FATW^WwcSu^tilS0^^hk(Ds`ebL*6Z^@BB~ zrX*W&&EAKYjr2MG&fv5Yg z*3A0|ZGC`P-lBDY#h(>{#h>|JGwr-pP5c|>{iW-}_+G}_e0`gI5E4G%Wv+keUnk{} zIvJ{^fh*uT7nRg~L?;Q0+DXI)eum_eZA|e8n_2JoZ?yJpgOku@{Z)xa)S?cqymn)_ zPUdwQug&4xg*@3>M_)C1(s9dc??9wFNHkxiQI_dQO{V=ARm<=ui;Vrwf)2=8jd#(S z-2~P&DBiE8293Qz?~hZ1^6<(xQ@VTP=Cz^zKV+1-_7)DpbNDmQ>aPHpMKT6o2?S zv2+fcPfyx8)uy$SxgPYCy}zYqyj#3i>>Xjve($^Xj*Y};^)Lba2z7#5Y*mg?i}||kXYf@$f~i^J z=;IY`8Z~6e4jo&D@a0bKj(SH1lOL{g%WH+24GgZ>nzu1{uoBlDCv1&8_43n0#?_$r z{^>39v6WY*mt>j6GHlrydU4F)Xq~D}Ribzq@y@2DcD$IECDSW^lr?l{iDG+$6LLqBB-pW(D#bvSICj{qt zC0myZYZjh6Cg-y`3;q>Vc0J=)z5klu8orE$73CT#EX#FI;JMBQaU9BObi(_?;-+@8 zWoFYTh)Ne0)jsdP70)IQ69Gg@ofpntHP}W?5IVw%H)~PvYH6_ieN4o+=v=x$s)z!c zDo(3(zD_Crkr4LLN_Nuhd)IKy+8os^QImR-){GN)3P6AH=UjxWQF6#prTK6h#SZk{ zT!l}K+C${^At0i!* z0;I3_#=>@IiukUB8VZuHF47p?a|5wjDu?OhESl{NTfa(zNzQ;u^zBA_1j$dU+#pqY zT>+7P7+>YsIY<39FsuJVpymr}a!A6oI7A_Q!uwpYw+q=m3uUtw%8U)EoWGqZ zvs98{f>b)UbOAt1Wu@GG6^PU|0ww{HOZ7ChC@$8ekL&bU3lIN)X_#`Dl3Ay*eVpL|m z4=(kUv9K~(xq0ai#l$@e|bS&b}H-o5f`5sIiz`YhV*{S*TtG9 z6j@woA}nQ%p=+@*4%>JbwtbX*j6)wxk zZX=t1^#(nEgTP-Bc%8tv3H&XAHwpZR09PS{I7qGN#pNb7OnqAjP{}}T2cU1c(vwA1 zQ2VJ1`KwUXgi1X~j|VxYeS3tX&d}OcCzCDR8OdlXk8-naD7Fu8H}EP=sH)nfvb5fN zquiG!8CE1CszhLp0Np~}wGGp$SdW^I;=feQ=j>y2+s^v|=$$M6#$t!sW9?ylEf$E4 X#~x^(j2W@ESWj$Y?2MOr*X;iTjZPfR delta 12378 zcmb7K3vgUlc|PavYNg$kw37UgAF{o&vE#KZ*-re39ovcHNBoFoJ9hk#?QB+iujG~Y z<#VrWS+-{<<2WQ?S_o$df$(U;tAr4sI|KrOv<=e&B``yQ(o2Ezn&33hl9EC_ou>cy zpOtpKR)8wm-{+p^fBy6RkMrMKe$;yDK5N0Rn_D73-SM-#@B4-jKNB^70V>}{Kez9Z z1$#?Eq#A`yv`N@hXcG4e(H|Bz{T~r_RNOB`KNWOT0 z683B^PZM^Ru;*}@=9w$(d0g%kc3jxqT&98Zg}s2wGll&YVK3w|O=7k%TMT!J{yF@( zNMz@V+&s}gPsmDhT=dgir5l93n8$SsJ0a{PT&9Vou$KyZ8B1k>uzQ5v%T2V-a$&FF z@~d^sf;17GZDYdg5T4u(xyhMq%F}>>XStZgvWL7nfHF zd$+LfD{{~^-E$lZPu{1L-g+u_DNx9xSlpH#eu|d`H?`@lD_`mhDD8j zz;iNVaa|0}(K|w)qT1hwHZ1qUNA}#gFQr^Z`3*(<>qlHqWr|5JHRSrOrtD;$oRjxl zU#A<7u5Kii@}r~0kMH@)s5<4u3$SVfo@AC$WoLr^a%aj5D_6e zrnD7W%mx$RYsQme79|Suex-tC56^=+`3qt>>lQ7MkQo_$8EQ#VpuqS z?hQa6fNP11O~ND8k{4{iCbmQ58rm5D7s)4DB1ImMXbaCoP^L=K zg;wD;qkAVg(N0WF<0omk*gQUeOk9X!fC@r0hTAHSV0=i3i!EaGvLL_2k)xH#;QIp7H-iRD0|eJrOx+_7b9Je$c8UhKGX+@h2A z;)G)TddJGS72QZkzXgCxdWBO?9uOw$Bl`5-DZOm{uF9UVrepCF=T|OUcaarotYK>* z@ARrD^-D}s&Ge7D354<7lFxdldM75k9SuewPy%=qP^3CQ*se}RX${C)kkueJgOt{R z&Iais;)hbjqLa57lZkdU4-IjUThKZM`sWq0j!Naz&R{C-6_nbDX7HuJ2J)qxdA)1=~bc6%)O}u@%n+Zd&UnbCq>&~VZwmrijV>y zSRj`w`XR?nr;3grJms8oS=xz^-|mhKxL(E^@iK+HyB#A~9!;pnqaQ>`?ZGk)${DH{ zD?-*m>}Mr2HMQs{Sk7c_B&^JEFmumZ9~ zv%%K{T1Y6g6Ll|jseu(ksERE*kIFtf~ zl-h~eOaqw{$8X>UKTs^Vegi|wkLI1TFhW)MK`x|0=jfm5iZ!S;mpbJn&7>5^&ez_O zjJ#a_(UON{iJ~2Ga4f_YIg)5JqGHG}yLD0;#N1&&nj>5|UXlYG@@N&^ATSBPZG7`d zLhmp+3f$njPw;D`(1*-nMdqe3#CwJl_Nh=37Nu|7zumo|=xR zZl*XdnIp}!2?`&RWbZ;$Eo8)+YpD;Ecip(ql8@@*s;L{^P1P!kF|$>DRzW?l zusnh&!N{mQ`h*H9LqVtCzhx7|tY|K@-IT zglqCGMnA}54v{U-WD99O2=C$t#xh0qUW^J4szNTwm1JeIBe}fm%Y%L>UC5_Xp87EQ z=sl~KwfQY9UpGUc@hI+K53Qa(&6gQJOe-asS3f{S)vLd_x@VUkA-7QRpnglG;HC={ zr2uAhQs&`0lLo2Qv_B*DjcYc^uar-(St?6cAY%0ekY_+X0`gHJ2NMw^3U(w69hNOS zj`QdRQ78z??3)}WR5H=zM~JE7hzngIf9DLj>RI%H=KBG9Cet_S@tXdUFae4*UuCX~ z+FQW)MC<6$AI;pRK@G&-!n;LQwVf9s1 zy$|HCi1=-}luO5IR%bOSqwW15-vD_Ly_#!#)yeh-Kf-e-0oa!TShy?^h)}H3EMpA> z+z>(cJLUHEeKN$LeM~#IY%4(vYMgP^(R~L?hP(o!uE9M?5eAtkW;~8D-v*i3f?|gM z5p{K{G$l>g^_*la^KVQrs26Q$lix3|-Js?CM=_dOgfje5sm~9lMl-G-Jm;j8TA}}K z<6QY&9oe)t@uqyUGnv4GdRz~0nqT@J%|$2z&wiu1#%*3%e3;g&mF=>Qt0q;b85gm4cya4hX+S{r-HOnJpx(;A5H+9A#Z!s>E zW%`G=uamFo&Mn{CZG>ADHX9a(7>gMK_robSO*fNFe(2zKN*4h-euV9olfaK(=r&vcB!Gq;Im7S?RJM8+(2wfw}^uq<&P2VQ)0 z=!ph(6;EN99G>4C*sD~jULlf*8Y%q_6%du*&QdhKvtop&QB}od*6>@g&D_&fzsJ}O zQM6|rP*;5kbB==iHOP_)!r?&-EY);R>yz8NEh+Wewm&RK_4GSd)q7DML5`=ONMb2M z0i=LHRRmN-me1bt5AvwrHjql6!cOz#;Lw+@Y$wUd`Jo)8TQd184b*>(ph|8f4ug*A z21oK~6{dq!IcoKr1`CwQp)2H-8F&m71_yop_|C<~x_zALltkzM*&nmFv7#IZ~(?A&Ki!d+@sZv^U#P(Zly8V)wF`^)SqxnXZljQaNSmkNl2-O~u z{^YJMc~*aY*HWunmVdYFHUqA)X&he0cji(nbgrqa z$*Khg3ZrxhGJMr;2z{fdPiC0+M=TxMGhZ&#_ucdLyisgoS(rm%J>p=2gNKS~VOWz& zO=buGp2j8uerwfmu%Tk}1nX4Gica-Snf`I0H$p8U&_P*%nUsoC%cXv}@0&eUe?^y8 z_Dt0k5s9hCi6AXOSEmo_%X@EVz%su6%--%*lWCdpP&+**=awf{wOT1BF~jPx)S7Z= z-&C2du4-HbbIlg+w>0~9@^_PPp*{tEmgBIWLTM#R{{?O+C*qk{k2Y~5s(YorxPRtx z=CzS-9b0iP!YY0QVaCD2zHFg#7vx>AnP}x;X`=Z{;2)8ipd9548K!t7PI;CYvJF;n zL;1D+7vyT39>ur?3fw7MaiyOgRXaF%l1d{;IUOt%d+0}zWqk0(=hgbSdP0BbK;kBP zR%Rh_#Bq3HkfMB3PS2b0F^XC`mHvWzwhUHYoKQ2WC>@k7ln^;x4Sny4$2~ zLHQ<-b(kZ}1_n0F51r)uiP7OWrir6l-7(IrL_Gs3l|GqL9QGRizXgHlcOUGQd&|!q zTq1iJFRXZLFwq8(y+jhN2tvqG4LJ}z?ZU;0v>#enx*2`;fb@Z|Ibl}#*qH6rXaMhK zD|PI(LOk#ghNqu{C$J!blyRnv4uy4B1Rcw+LmV^iK}|i;0;YqozK^ z0B6)wOgh=RjB>2&H;`?mLZtl~uG3*RhsyVDmvO*1PXnE&JgyN`J&-s2*DT54sA&?4;18t@zVn z_fFDPhf=aN(db>2orBkCsdXF2%KwyfWEIdgK8zKHSP>PB&qDEAAYibMr1xfUrwgrT zt52gV?74~dO`K1+%X8$Qe@N9xJ$P$Ysbf+cqNF`G9jU5@V#~q@f{jvaoXs9E7yA&+ z?wp}3hg--wrp~Bv{W<9Nf!Xr7?ipAh2g>^grpRf2h>~V_=@x+*HL;+t&!um%+NFLr z-EE@a38lCK{U2+y$y@e}3v}RIPpy@2r_E9_BMO&A*$k7ptMsi=64npal&X>(p)J>` z#WwR+gVn667mY?QSR$+!49u#ny-pb(w-?BB`USgN7Ro=dTc!Mn{tsuyk*a+kz|sYX zJZf}+En=K@Yv9jv=k=;fRpEb~obMg%w%!@gmj)NfB0+qnesSe*h<5NBR+c80g$k=ptr*U%&i!V+MxRiDC5^9(l{m8D= zNj2u2#Ovp(7oZk#1<^>=0c_DHvYVCxjg^ee^L`ls0* z_8NWTnZrNPW7H{w(FHz`sxDLlqB!wn>@(Y9)&)c$<5|DeccuoM?4$wFj7mUcV3MPh zI3?9#tiKu1FXrAN&zFCh`>r(a!+56%Ce__UUZyu6g;i0$d$HU$>h|K|kdeB+wYYd5 z#H-GNK$_|TN^DI=QDSi}`Xl00=cwy%A;=vSAzIbR=zS&7>^IOm%!Zn;B$z4VUYFwYy=(_^ z&VL8=LT{P8r0?>6lQhov59VRm-%$ppl0HQmM!gQg&g=(N@`GgIZTfO2hI`#%^O^vc zHeWwEvSiJ*`r)_3C{e_AnED!b)kV>(xxmqf5=g_4QF^FsoypduVbsf55?H%JPMczI zBSI{%su2wvp;u7%36Q5jnBiZc#Mx-JM4!PULQzWjK1k9V*OP;VyXh;aHrxmI4iba= zD;%<-QyL%ltX&xUJjh#lBpa1Iljf(}2Szg4N>nhE_ z3?Y-7psOKX^B0CxDp#WmvoUBF2>WDqwMXzcnUYlPPNVvBbb1HKBSg&S4(>j3HMk6N z5b1AhMW}!e2P)#JR~p)k-%0TgzsV*@WYSx*BM3++=eJhBThUcC)%INmyDYcDq#8LL}*H}Hu4BTXN(Vqv=M zw8*(qeXzXj;(5ug+!&$1pgz^4xH^utfKfhG$~#ellD49LgsNvi_-Uxx$d8u4 zTzW>{HgQtwoy4Rc%EBSIstFt~2AK?p0MpnUMC@@rZ*2M$y4^D_r7$A zQ3-oQsC3Vm3%@~SCExuntnetvyFq|U^%w}kV3X>24<2`b>;(D$;L=f5U`EwPFr4GU z>YkrK-IE~Ly}?UIhW0hS5d%;^f;#q<^cDg=Yh~JXX|^2EH(yG~r^^GEC`Zgj{G;gh zG?8)BJ+Xq~RqsOk`$0Yc!b~uC%x9Iwnw3Z?Urvh{w}>fxR}YQS{9n`3N_iDW2<18yio#Wm8aW9j-O+>VueSLbuc>F<$R~xbp-M ztbh>pEXe0Ta0WGp#(+SWO5bkL*TW-OXA^MXK1c*_UFpB(=8dsX$O;9T?})8xwwha; Pr!~)O-l4zt@bLcu>>>>>> dhd2016 """Function to extract lemmas from TreeTagger output.""" print("\nLaunched make_lemmatext.") diff --git a/tmw_config.py b/tmw_config.py index 872758c..92fa45e 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -23,7 +23,7 @@ #print(help(topmod)) ### Set the general working directory. -wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash. +wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash. ################################ ### PREPROCESSING TEXTS ### @@ -45,12 +45,12 @@ target = 600 sizetolerancefactor = 1.1 preserveparagraphs = True -#tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) +tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) ### segments_to_bins: inpath, outfile inpath = wdir + "2_segs/*.txt" outfile = wdir + "segs-and-bins.csv" -#tmw.segments_to_bins(inpath,outfile) +tmw.segments_to_bins(inpath,outfile) ### pretokenize ### Perform some preliminary tokenization. @@ -282,7 +282,7 @@ targetCategories = ["subgenre"] # list methods=["weighted"] # list metrics=["cosine"] # list -tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder) +#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder) From 47914de4ac05f1223efc743dd4160c523e797535 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 31 Aug 2015 17:25:42 +0200 Subject: [PATCH 32/56] Added binsnb as parameter --- __pycache__/tmw.cpython-34.pyc | Bin 35635 -> 35627 bytes tmw.py | 7 ++++--- tmw_config.py | 7 ++++--- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index ccb18ae9e6026ee38f88806d334f8bbd82232363..ad3e06396f8ae569c71c5f7cd5a65a0f546b29c1 100644 GIT binary patch delta 11707 zcmb7K3v^sZdA_r&-PNw%l5P1R*((d#UfXgkM}Ee!omh78OMb~O+0J^k_exrOUzxkI zB_-)LBs}7tK%AjTAgAR3<<&MxVM|JRls16^=GBl;$h4FgXp2LDCZ`<0<&gg0KP&Be zqZI0M^!vXvb7%hf=lfr?xA3s%FCO-E{;akx^yD2sI_*FOVxAhCExU#J2Fsph*|V9a0dp+7mH8&i zo@?3j9ugZy=3DjxZkuh{ZI<26JPllE*^8KOwd}BEcQ8*cEw=1V=I2@Vn=HGFd75O2 zWiMrZ!IX?`*4cJzkmezgm#O!Ox%r4?FXu5d$qLI}$$Td^WOrNkD(gOB4bq%9SoUh} z>ay$}%U;7g&3~h1_cFiKve#PnI_8&I_Ik_Sz&y>r(Xux&zudBKvh2;w(^@xM_Le+9 zthDT{mc5O6TJ;vozLoh^mc8Aw`S%Rb8drm2N*vd(U{ z21)8;mVKPNwpjKF%N}H&HgM9iPcgsEvQJy~?W)P!nuuEV8E(AQvSXG##60aPZrL{T zeLTNoC3je?Y&;PHKX55%PFNbSqwe;smjGLenuVk54uH@uT{ zQcgPOY84NhSQ{X)>zeUwbSPs_nEbM5=Ybbw(hY45M$ciUnI1@;m5A$5M`rpkw(oS)m>Zc8Ol~so?1&yjWFf zT3$`wKw~3Say4G*09gZ4IStw%N|gRwKM_kbi#b9rESwFES2aQgiTzy_;DYMadbe#o zv5|V&jTRQtn{?806&`!kJq>eb*Me;a!PK%Iq(ME=(B8_@gz>zXi1z1Va>U7L^@E14 z8+ol8(HxLY5}Vfb7c$azlZm*KNZZbM`Tj9SZ(&p8|9HfFwXWs1JhX64bWc{F44=NU zXXVDbS?)<+UncFWk~P#DnIl;;mS+JbxD}+dkGga+cWzZKbIwU8CLOsQz0DvEAdMhR zAbfzc!1aJ^1j+OMH-Wtgq>YI7$Ff-`ZOdMIh|HFY&=Cf?nTT$2X_Z_i>Bv|*?hMD` zxr~%s(On@SYi3#+Ru6#5$Ey8_G#=KlMkVwow}C*_x+n`1mlkK>zUzU*U1@MVD$NTtoHFx9W-y| zqa;Yk9ZN-Fc|$IDr-yVfo*5(G;gCD=4>7MrLiHi@mTN!Vg3|+Oul4 zu9BGxmD61dZH7MMnN)VmDfPMbKbyslV+B^K-kgl3hV0mum>OHyYqaepRcpU=VzwhC z3_}O-E+7Wz(P721+V8mWSk}>1=NvkFz?&RglRGx#<`TIvKKh+_gTYpfc0B4TaB>pU z_@pyZG~$Sm_fV7UVVP!S9SPqUO^qdUiEPq2Oz#|_ow~4BtEyQvh&s{gUFDGv6izJ4 z2?lD>v?H~~su>3io+|7_K7jcdA#A-2y^;yDNx?fsstv^M$M7&n2MBf{@s6&GC!H9Z zSji8!l{sRJsmV$*WaBpCYP6$6JXpBcVGA;=pD*5+XXg_vb3SY|1hOFlP26$aN(aLW zSH)w=WTaMClYwWFIGQllBA+=jLJuYIKSa&E2^}!Lz%_np7uv~mR+#fl5%?-`*MK#m zIq5iN)^YW%&WL!l@SDyxRmT}`aQ@ds@GA1ds~#fSJ1qO#GXU8>*pmo>6VHQ7yall3HeR+V8|lK9}at$qmFyc1FHr z<^pMyF#%xn!JF6x?-ZF;=DF8khK4B_^HF*)BVD$dvLj*Db8whOlMeE-CbghUg#2hs z?#67U1D;y@xI=rh8FD;~C|#3wE=+lqQurnV`Y^paN3sFcr()-vsF{>9nMYJ^c|v@m z@Ndg6iu@%SHZbYuFc^vijH(ziCF3aBg`&}spsq;~E}UGHlb3f>n=uXc!;Mp$j}m&% zlGm5hv*{X^Icg035xmzqeb7eKW=33b`a~+5k+~ODsC$8Us<5Q{KRr8V6qhCCRVogZ z#ch-YT1f{JY0A=|Dj8OXS04&ZzopfatLNIfm4KCqM}f2G1t&2wnscKVuA5AP7d8Fx zZ_`b7CT^Nt-F)3BHy6t}lDAm4L$#u3p7@2@(bFlOR*9ZF{hV-&sF!+f>@FJyace2@ ziKfWzYyv_%SynOq3V9ZKTBGJ#C;xcW@*ygYKX#4E48_8LcW<`Tc4$4H|A!U$fehQma>C0|3E;)nvnkTLXD zC6bxAJOkD{naIk21Q#5ZnN*acHpPnMSSsymF(9duvE*?K@RMefX?1w*l15#}ighX4 z6F4dTzJG1o9LclLQc+Gfw3k@9T7711_bwfxcu)+8x^5~m#xv4!0Wdm4mmgl|%%IV- z{x_+fbz8+tg*(=*6#0Hg_9DnXgM1U@pNI@ZLPitKXepTl$BU1mjWq#1nVnOdL?ED~ zJw#Gu$6VM5g*nO$`D5)s~N{Q0!72;<=+h>S-Vq{=)KxJhh@$H24Uye!}dp3r(B)g`d&-MdjSI zBVWq4E6n!SSl7Y2K6jLo3UW&Odvpve8lz3V?=e;gcFl;t@_E$3&k}R+Oj_KhGMti{b?k~FR8Oz7U!`I z!Vsj^0aMSo##?d_V97b%4~)pnSk|$1L+t!WIZkywA)7KE9F8e_O3&mLnE;K%t z#NK1)Rn68$@pjd=waZg23fs5#dGfqN_zQ_$_p__uy|RPg+SI6`kGdy4;bqrEL=Hgy z&~QS!xnn50io7WrLKtW&si`8Wq>E>%Q+i0Z@;PwF^O%Vec#h$yE43F6gl!4o)ELwm zaL2>&OaAW5k3J_Bo`GES~ts5yFS97<2V~^2nNz?397+-M023f(F z8>i|okscY?NtHNTX^35IE;hueF$F*QTh-IowO6-5(^2EtqD7=sBEFelF2+t1jh+SI zt7%Egvp_5t4IVjP6h6>bE%Kbbu`q`rQIJT)N4>QFAY2ZwVUDbh8f7h!NR3fdD;l6L z-NdRiG1CaAXH!!#RF=tH^vpccm2(BvvAPERNe9%GKZFz?0{IV+3QRXZq_W{JsL`Dr zp0H4VwDY3)keYK_Z)N=CHz@REBxx;{bbbs?&>uM_84-mGw|!Nd(2YZ}_&My9>MkBj zW$-A)IPFi-%|jx6)kote39J-U!f@!AZg?ymm&>4n_foH}9?nqVM>Xu#xp?$a9jVo0 zyOtUE7Na&9#^4qrx_ZQ+f}2cJqmU!V5}Pm;t~Z;^QLUbGM>7{t!ShliD81vO0*aCr zc%!laRLaR#Vc7uCIGh+U_d2z*Rh*;YSMUy_I}C0)lknZ|5gZ__OU zIU1u17uD*>m~^7NzHTT!O}U}+BT|p=ZWBLMmv^u5Y!iiF?%rnbH94Ep%J8mKY{YTH zaVmky!EG|PEhesN?}1%=n@6k*wUuze%U>MFcQA(VC)X(;8|bbH|p=d^E(CZ&5l ziHuEt4Ev<%_LtAsgT@X_RZZCsoog&7A{iaqL{ z+b^e$ZIi^tj*b81T!C|kqI+RflSR$s2w$dmBVJ8K|CIA!XEp7j$|X^;{+2n5yaZl& zSQk6qfX+@-B@=HW;(L)u}arM^yOMO^ItB>#R=$*;P zOo-Z*b5d?(dR0$7UHi@TG<)Qcg8#rQ5ihN3LIoVF$(&{px03?i3{uGNK%RaKeGc3K zaC3-Ii9_bii5_j@Ae#3Hb@#z}r&!hi6&rmhhp>tcADJxhydz9H!B7D~44Z;?bcl4}ZwHo(Rg4$5J6xy8lOSm#k$Pky_Ah)a9lkMvn!e1vLWDJc%ARO|e;Ld;m z;JgoH9{f+*>deUnd7Npacxl2J*)WC!&;(&336QfOBOs$7=ZNUq9DHyz8JnQ&;|*h# zEP*{}5W?Q$5T?Nqb8M$8KFy)>N)Uxqg3tCMQm9{`egt*WLb6PVPmrJov0&9fHrvrL!*?W8jR0} z!2b;b0sBdNGnpCSPW5cgHq|}+xk>krqr*?tDC>a4wXzp zvS;A~VXq$~uYcH7^?sV&IZri*n@KsQ%~*fsIjE+g1>#$(d#F>~Q#d%(B4*Ps4H?w8 zRIgB)W>P_|CgL}FRtxo1yu;+dC8Ib*LQgi@6fL`_6go7~UGC+ZXtQ*iQG<{br#bP{ zV4yg*%t+Wi*iuSHasj%z-u>?|KN+!^mFyy5?1DAIcEP}eT^j7Z&KMrEJH^k{=j{&h zK;epAFT^+1e>roHm;C!Zc=rLQyvEo7!#_;BHSlM>^LnMFO7MT3p3e+-h-&qg;icl8 zgz$Olnc=Veekc!?Vdz<`&XwWI;D8r-7vuz3i=NQdUq#8)5n$fz}r4%Km zeK;pdl(xus6>b>qD8sxd?TtUY9^{d_GWn*{CzH6;L8=*ql*#LU@W=+K6N^|Wd*QU zymdD&?|MgfEw~v)%y?FA8BkZVZR!WfZVnm!QnIByr=Nv)VGEC0gU&FL{h$m z$Z7R#W^D~uUF;oCsiy2Qzfrl`nq9UCx|M$nLVr{Yf@4SWFgVs_*YD7!e1wLcB`xkM zYS2@fE7D}rNgw@4oScwvs8mGWvOs*jdzn$GMeh=Z)NJinEUT@k)kiVg>)gxN?`Rh% z)KBi{67MTCk&WgVh<@9N<2S)Sp?8YcKm8QON>1kzCVLOa`-splaU6Y1o`;=I$=~2a z&1VQ#AR4=$2hmpPek`iAb<0$^rNn$cdw`UaQE$q1i}$NTxmThl=yIXLmtCdkHhl#n z5$EXgrFd-$Ly)tvAP2GE(xa{-&u&MmbOY+SW9C-@6l+?2U~Kui>n%e!!5dMgc8tcF zF!d1pQHUn8JRyM_81hCRDR|B&%P(LCur7dh6(i}>3DzK%m!(XGy-zKKJA;nrL0I58 zxSb&Eg1!q5F`2G;()m;^jnl#orFeqcv^#Ww{%zyu%SH`$%+ zoAFw@X=p5wEM7>N5s#@Y<6YwMf<6AQnEz1-T9I$E>GZ2vlZzYRvvi=3WQnCG<^5{M zZv<_W|G#AV6DeNK23y@q7Q=q6i65o1?YF6(F|R> zf{}!G`7!k2cq@dhtOXk?mi`Ce5&OsWe1oSUNJUPaSSaM@3M=ovQ*aD7F6e7CrgRxD z@5EZu&XBH~=x&m>A_dynv^Zz%m+R?0f?ml9l&lJ8g zV(Dq@>{!0Nrp(S%K0jZ)px&GB6kjO(UH%ENZTh6-Uy(4{pF}Kh zDI5UzHz1`f@c)BLr>t12kmfKFI@D9=Ay)}|@drXPdT_&(3z6NgHc!Z>4KeT`(G zj}H<#`c#L$m~vrxW&U#mjW?Gv0lE*SPxvIEw4~6P)YwQaaRa&Zz65=HjmM@_oCHaO z06-Nt19F1e@SRp{C?nJIB1TMtOxI2Fop^i~m#&w^w{ zFp~Yl^$uQs4)0G}m;5T;R|hY)Mu1@nkjUpjz72w7C_6Vk!4>M##TDM3T3dboV(xzd DBgQtX delta 11437 zcmbVS3vgUlc|K=XyVB}yTQA#^ELoPVwJpal`4PWu*^X>kc5KCu8{3=J-YaS2edXMh zA9BnRk|B`JFcS`>kOu@x9}p(MER+JBHZ8-@KuHP|NN$-SJO;+lB&0*9orIA7-+$J- z>y1)6wa35zd+vGs=Re>7I(MIX#Fu}>*Y`?&L+$>|FYZ|Sq7Xk9WqewahZsyyBy+qhO%+sJ=VJ~IAL)d-7UdB94&@b%e%r6r5 z3SkfE`{W`S){o1M+)81u;#W(By;|6V%+pvY>@~u^g?F<|*lUHoj$hGK>xI36`Q^gi zDC{BTY1&P~-pu@fu(t?%EAwGt-zw~F%+p%ig}sCMRl?pW>|Hs2poMn}dk^!2!rm+F zeaw>t!@}Or{4Fi{0gndBR2f3Z3xlPz3%x@6(Az>e8o+KL;_7UcXggqwgqs)_( z$Ao=5^P5GNeO%Zl_<=+o7xo>@Zx!}QVV}~^``QOi3;PT=lDw9%?__?Lu+Iwn9P=dl zd0|I%qrW|Vm$0MUNPCzNc8vLb!nTF&Fi-m-WK4?0evv#NtOG*YQ^KC+cL#+X7xvxE z)1IgwA!tNcBf`EQ>;$*d{-ih&PUb)0Pxz#z-v}(Pw)|NqKHH;{!7jZ&_&I9*&*1i3 ztkCHrqsOAkb(9y#;D4*u&8m1Nl8sKeR-@sagp+hqS=Z8cRiEhy5Rk39%bCbT+P;!? zE@!NS*SE_TtKI)=sv|IBbxj?_#*p^e()DjMQ)oFK4CYqgA3+UMkPNr2d znpMr{-j0?)ERlAdaE({(I#XCvEyTOUcvl@u&!n;>XjL+8J7xxSOGT58>Or^OB3X(7 z^_NsU>tG`NM5tY^(q9UlJIR7nm6qgr)N<+{u2L&7!6J|_NDs)(ld_l*YIlP~gq$zi zB(>?!UG2!fyEanQjEyP0yDq_1)ArrYHN=J+)iSj3&ix4|rTXzWpx@Qh(OL_(4FqaX zbs+Wnsiv-W7OA9!U^c3z98&gAn))~JS}UoqSFM}`^k6k!N-Nt<#A8l8WjmKErC*ib z-28@*4rRlFgE^?;bvZa&b7#-FtLs*8y~c9S28PoqXN?Nc+i;s=$ylBi2(=lcxDTr# zmB?OLlTBZ6Qt??wZNb|nka~~?kVX(b;d$T&K-Plfc>n9bt_Nu);sv9bjFYm}DtZXF zst&y1-EYL(1um_UO(z@`O~staXe^snY6yKQBxKDLrLmx%$v6$e*?Pfv3J>cURtox4 zn?RsyuPPBwIjS3MfOT1}Lj9SR*+5p>dryIUND=E#N3PFE~B?4)Mf=I;wd*Zp^ngl8UtZf@geb=zKm@JvzN1R z+qHsZEcCw`LBB&pykL5A(si=NC}4(Gfcshj?rMe1$Xq0rCC@RFjKCl#Tz;LJutIFJ z)JAhpFz?B@6(E`7wguzOsC34v$8i9@Zp2kqh~F~Xh!<{zloG)<%8vrVHPZy_m?AZ_ zbM*MFeqpg&?Q0;rROj-?I|t>a)z}N$VlQ!FpXXb%)~izKOO?~TUb`LojHQ#Aop
    9=8yh|s$HyUmGN!4CZIZ<*(3dR8smCOKU#>SJ;j2Cp=STy5! zRTmsOdqAAL;5>I`!p+9BGko+1Fam(HYS4~HuL_)kpYa09nJO4^SgJ{CQgW&8U9zx! zoOb2H&_vaI*(4ieyMK*O-IKq4Nmeo_3#J^dH&)DN&OoVx)aoA0#b{t_Wtf#rmQ4uW zC|qOUltOp5MfQ-8SYeFS5KA~wHmQ;;?quDS6j+y0WVpsnOz69Md!?IyPp>WIYW=IF zTXXDeLS@c|4TJzSK>&$4u3PC)u&`GZizX7`das5II-S5VKs+*o=+qQFlz=`1S>a2p zYGeDZ@k86uPG+*ooL!QDR*A7DtP#m7$1$^B(%p+X z5%K(!YD`6mv;1R6kIaU)&!jG-(w9;@ttxtR((<3&f7}X!-LuE??-@P>Ld}jlX8FeU zQP6nT!0xR@4TgQ{Jf^LJKMG5)#?7WPi8#gPqJ9axXkhGqQ~%e%u&mMB!<|O2^Nn8F zf0uN^XI?sP_RtE;1YDXtt8O7)vDXPC(w9h`3$P3ox9lx|DP2hWk)G z)Ss0?mhx`m+l=)#CZiXeh?$f!l}B`9Wn4a(|KZAqWG+kH#%F^Z^J>G@MpcZLl4X>% zbdh?8Ltb5yK;dK~oXT9HHe(lXxa5=yrxPLQ-c9aZ6=$>RSmuba=qnhvt9Z&r)cFjz z!u0WECatnRk!0rcsk89a%6j@cUpD)KzU96JQazOK9{iQ>z?_?4`FZV%qvlOCtM@_L zP&}2H$wGl@p+3C!cwcdp)eg8tXfyGc8!>)7f{vDcW^IRK%}dW@X{hNCfqN@9+38qe zoHhTZUT!u@2^?>{V&5(0`$m38=RNg1y<=UUd{{@0TxoTT3@$MeKfy!eZMdFLE7sW8P0_`9SV7VQmIfHbVY7ynOs3^(6tU}a%w z4SX-cv{ggdDVYa}12a>1;B8erk&dZxu>M&R_QS+kp-GiaMmX40h)K*OQ?4h+)oF|x z)q954Hd_tmoG3LQfK#G=-%w}USsqIZL^wC`{KTpiIzKenVAWLGoQ%439%fNCh`+$QkAVCQ$mh|kzPwjO ztZ49RdF}{+cviotDm{~NY^#aZFNdMkNLZXnu){L-PU)GQahV#hN+-FM z*i3@FP9%C!KeeM-zE`*G>i2zH(SEja$L@&n+cd&X>Zf)ikheCiuIJls!L6-S3#>&i}6r-$2H{sjPswuNu8Tq&D; z>IIR1bht+5;KI#P2f?At4F~8gJSm(9gUg)2^XTv#I@Fn+{}2t>zBP|E6TzuQT=s!h zD>W73U3oJ_b&8SP(^WqbyJ&D7-X|PTSA7<9;;7ZRv+RLX5_ps=5xs0PV)W5L8FEMl-h6Xw#JdIS0gXN;)ll_0W=m3T?M@DrnRl;jn6Q zt$)}5cxaXHHIaYy&~6kZYeyIF3(@?Ai01i^jf|@Qz|y7I#&*r7SExp)mZi##HWF?s zq{R9IqaCO2$FPdA18C#GS3`NlR6G?;m|HIYMEbH-9iyqp|5ak84(*Y6GCJkBLH*`v zpYIK!XU{*o&zPLVq^xH)fsfJSIEQG@m^s6dOkpmEl-Fy<+IAK_L*>eiFr1Pv=%!-F z1we#G~U`$o`e&a;_LPLpKtGywfSG)ey?1FbE6C~O?fii3SH*_`b%n4`}Mm| zERv7w$4-QEbd@_Ep)j0DO|dl)+Ms8zDudO2!z{^9mk)HS6T9H5m(diwl88D0fyquuwW(4mw4w+GR(2xKt`8#~X=YzUqgV>Fx9JHJoV#pv@$%6Jk@i`!x(*%=J5jL=^RDrU;WEW8W>8;yXz4jJ+-M zymrsz*Z)@a(8BT9tv!m-`8YMJW^|qh!rrG5ToVY_P{fEStcE{o0XhA^iM=#NJhw|)~?st5-I`R5}}nPV;sOl)GbU!e^87l8d52qYOJHO*z1Fa#sAxnS(xD4kEaRz${=l!&qIg!nVQbKmMK{jud zkg2JrPhWJ@c)T&`lzIE* zdfs%a+?-!A-CG8EQ%@Urcr(Vs4P}BAvm%p^JpT^Tsm&1fG~P0#LnTOi^|+a$>I`Av zwv@@6*<4p+%pk~|u&&q??WvTvtkX?*cgd~!!Mh)mO&kt(pt~<|N!_b|bYU>au7AC* zO?0+zpcTB@L(yq9>rCS#tpUH@k!Y7&^}8}VR{@`ei)<67da)5Ea9^*7wxhJToKZVZ z=$~dfwU-!VV;KuK8>+noL=eCU^VN!4N{EsKQ zdw%6vEtD{cL)uu9S<{=}>0ihQ>B$29`i$Y6l4Y6KXFw4w*Z_ zvCjJch|bh5>KZ3)jTAKHE6o+2XL7;-eNRnXQSYcc#LcoWS_fI`K^j0BK`NA2@GLhe zueXq??Z#}C`l30!4*hbbL!Q<@zo%aw%!kyyGRI&F+D;7L&-T;6!mY^|MWT{h*@_wW zfE*-3U$QuWmV67l7313=D<_R%aD1ffV76fN+Z+rPyWWBu5tDAXkFr(M*F@c!9h4*b zQ1-RGCT?m>hihU}Pta^-7a1JE=nA87;m8U5zoG|+r*G(MGo4h6KRUDWru&QMXWCjH zru$_RGwE6%Hy9m0HI{#UCQ*KYaT}%rimpQ?^b-Qs%s0?lN;24CJcqW@O%V%%>mqZ7 z#9rf5ctjMX%g-aUjT<;0Pv1em`)Ec*c69O-xpv!xL$*`;$c&x!fin)tfH&%M<#gsk_d#)5FU&ioKp2)%R9YbfyF;uJL zm>VBf)g45<1}>S8k;6y2%ZVFZGq9vFr?CAZ-m44lOkcR#DL<>)g|FQ}LYCNs`O>DpcO3ZBC#q27(P6IL?QLZfwLT$*M&gM=y6ycicvaGJbo-+ARXdH}z}1 z_WrP(&Rh3~WP(rE!8+4KZn)m!!it6A+%AGncY&~MoNu!DCCPiCASMjiMhIA3cq*Wu zf1pdo^~(?ZB6p2?di5zvjOpuV3O_qDSjAZuZsSazbT=(XpW5SDFO;R<1G(5xnOD38 zE8>LcZZQ_kl4%fPOG=$fmCfYnj#GU+9H%(o;%7OQKNE|pvzV%42TkQFJhpGo+5j0H z$`*X34ap$=AcOw;mP?h^pM3AX86UvfOq1Mnv%=QE>YGCpbLGO)R;*hM&Cz;0uwFgZ zqwhNn{t(?T(6^lo%^jm+jQ`+&as^@^{nNjH&~C0HraZdq%=@}Iw>K4O#Y*+K%f;_` z@DrN|mJ8B&{N$}!sw_5fxbnOHF)YBopby+CkaiHh?pzCQ7sxq~X^?wHKwXeokOx5? z0(qE-7f8({GgnN8{!z5@cf7BFdj{m|Am0X|Zx1Km-<`vOt5-q32l8W(7eHPF`5DM- zAa8*1&0iJmq;L@ht_P$SWElv`cC~^CeytRpNUN0Eh$bp6g(}uOY7-tQKhII>2DX^7 zs1bU!8r)bsb7f7;xM;fGHH>0^a&ur%QH(E|or+_8t9yZdVxv-PC|!bCxN%$$Dp-I5 iocP?;JyNs_KAAGG_F|(;_4^-M Date: Fri, 4 Sep 2015 14:12:16 +0200 Subject: [PATCH 33/56] mastermatrix: add bins --- __pycache__/tmw.cpython-34.pyc | Bin 35627 -> 35963 bytes tmw.py | 30 ++++++++++++++++------- tmw_config.py | 42 ++++++++++----------------------- 3 files changed, 34 insertions(+), 38 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index ad3e06396f8ae569c71c5f7cd5a65a0f546b29c1..5abb7d2c3808db7a0455f18bed67d12402f8ebc4 100644 GIT binary patch delta 5593 zcma)A3v^Z0ncn|Na+8}k~`lwTSoZ4DfTdPwPiu5s4^t7Fkshw%Ns;e{FhYvdQ|NA7|K%KRk ztn;nC|GoFW|M%YeKJ|I+!w0qMcQUim-@WU%+dr~}_?4LTmqhLL^cnJ0^&s@t3f(IX z3(;35bRl#Qp>i=&A^Iv+caaErM7T=yRSB3Lo-g|53w^QBm#AgcLa!5gy=pJu_4-nw zH;BW4{{fSE8-?Dgh8kwLFGHyX3U3j5kD{-dp|2JrYw&H&ThcG|+tg6=>><&&7MCW? z>kJ5eP>m925_d?1T19x1=-Wi{4GTS}#y%nR5uxu%?V_(;=pmtp75yfbwaXi^ zjwVeO<{E*aa3Er~wb&^G!AMZI%H}0Op6wnCh5`{^IAGW=MK{n}+MObIkp1v*={Xp; zwv~ND%b%dWogxE6fqiljwY?w@Vp(M)9JM+t9{@OtzxC9^5iG6>!hP1ks!ITfpR6|E zA?v;Bj%1j`4YhYc9M9Ii0QchE-ZStxb}c#xmi4nmM*xnYZ%Mv}0;7A2ZvGteciS0Z zUnmqX`(r+HSl&-G>0ziPuyYMF8Vdyt8kNzJfZvqJW7#~GVMg}^BK?l658|ojJbY)# zQh3g)sQWL?*~!HdCE|_Ed1-35T=^`nY8WD!A8m+%vuNS674Q=JmUSf6d4}-o%kG7< zR`a!)n(B5o7M8wDOgii|V^5HT7&9a=(|=|-jt3j73r|whc12~7>@ItlzMTMHY}^f} z(X)IKOzV;5uY=+@e#2tej*s0?llufuY-Jc?IBnTCbVDJD5HQVPWXPlYOkaI{eSA)x zCyXsC7GyC~uS;g~?6Ww!q6%Kc2Ud9C41ROP_G*pD{KrX=O5X|{6>sgNV!&uNIR7~IOp2DRT;ER|Mbu(X=9|* zA_3`I0euyk{D^w~G^Wv?n?4tPO3R0_dgYd`G_jw=PZE1HAw$A+iTy-jZNy}nh>${Q zVkA}U2Ufr63VlHXHWE#zDQUa{bT?_Aj*qYW$Lj8cvUXY^;*YX1;whu%U|myuX-hN` zs7G_v5^&=eS5>ZODP@4bPBDgkv4Gbt-)54B;m-`OGQ7b+^-S2Qqeh@P7%}7rO!^Cl z@v+tUo+-9KK9NLHI;6u~$c0SxKNreD3a}bhAA?+u%vs_11N_OFrEm$gwSBJBJoE|< ztUX1+p1-aJGOd;CHfp6#-149bYq^}8w=kT<<2Uw#3*WoZ15aQIEn9XSgL7&i&}Q({gS0uzS2rz5#(sH!g{DB8=6k?{4NIe zpf^QU&|=$7G2cHVqoXnTQy!Z`M43zA&5#_zvK0S)<9aAVPivv;%S`xBvQs{8ZOwsS z<87@~umBIXeg?jb*_*nu-s8>bBh$$`E3zq7b1h+pCvc*z-L-(5MR>ODI;g|{ZCf#~ zlnED^soKiJt$L<4;kNcDypP{+UkokQJM9sGjks%bo9juQvJs!&ycP2B(&lQ|f>k%4 zftT_3H?Jo9c{@C0xvJfHxT_~3z&Be2~Dxgi+Q1N%sz?M&`Ov$K|zdAPG4*5MC3%c0-;UFSalG~z^8CA@|wx|Uzl zL^HiPN&`DN7K)m36?01_ZZX@n&kXwa7;-KCsjGg)I%c|*_NNVYb9)EFPM+=_j7Chu z@1vUF8V&@9hE2(grH7^6pv)QTo(G-yneG~p)~W7B$mrz>{R|TSu)T-|({@}_e=8GW z40{<8T$D;`#~vQwZRKvoIl3bU261XfDcp`Hc0Awgq=qslP2#HiafVcuW*d5wWhXQH zHOtz@;P}6xH@j*uC=K(LsML)(JCdunphzf&Of8ENTaMuyxBMb=mM4qx#h${M^!;T| zk!#u$AM}*Fb|pk^=q)a}lR5GcM;x_caBC%8C^Q+je7z6CrrUV7k70nIBc9wHjd@11 zXu4mL}ei>KC2tZcYW zpNuEh$09?@!rr+~2$wUP5njhZAlEX486pgO7(xtD{K~D1mnY)E?ZksJxpI-R<;Qu^ ziG?qO)3AEy9I9>0cP@ne)~=lcFwn)EeJsHkx8e*`@HsjSZhLr0d5X?HAx9a;8LrwV z5hxdVg%U+c#?ev*PvVB%%ju>wlk^q!aw~D6YLiA_5I!M!m49*e26FBAVB*WCD z&Rx!Ssf6R(itX|l=6Eb1lZkAn1S974jq(795+VDDLRG|xKwhMB%fui0YF(ENl80FX zMeuJ}Gf)k8T5SVRhbA06%8sb5+N~xk-fzYDu5Dkk?O>S z+E>Mea#bRyooIMKuY?crIlYSRq;KfiKv%5a1m>^1lSRIdb;xi6Ksmv~CQ!atQJ`2S zxF&q>l!l#y<&cl@!G(0Ath%`bzdiV>s`M4jj?>+<41T54ovYv{Sq8SOyn_Y#+dydO zO~Fe;d2k)h87=^{77kZ|%W*w+1WTq=Xl2dXKdU25Ti-MQ$<=+59fX?Juh)%dlTGg1!sTXiERAV(FN zb`B*iG)Cv_g*|mHRa8EU>7lZsdx(vl-tHTg@xXAz3``dnToc+wFNG7K`EVOvjFq7s zs$0gM?eOM0#pLg*Bayx3O^JA!K39~Fl~XUWaw=Z-vKaJeprU7N_6-C=)5ppe!X>a3 zpAXl-!`7SOw;<&ZEeYbQ(RC!3J61cR$_=sFQdUJi#6X{rk8n$c`EkI`P+LWusj6P^ z<4L`s|KTV05V9k>r`%l$H*BwJg@;Q`QHgdjU>jXl!1A#w!x%z#Q)8;)$_(s@iA9no^%2 zf}LTYZ)zS^@0&}n)9d%uz$tt)Tw;yxn}mX=dBv>LkGqYS8ef?6SzemjLXS~jBrfm4 zHSxZ=?95;fT{gX%{3o9JG(HiphE;ejUR;?dZC4$Gs>Z2uS2Wv+Utn z7M&=DZ{gh&HSisLX5!sVPw|9P4BsK3>ijsjUtsW%Y3#I&dJ_#M@`%2R2Mr~CnO)ww&2o`kCko$B%&FZd?Iw;0sn^gOpz zG41Bow;BEpH_pn%40)Otsk17f!b?nhnc-Rd`9bfrYl{yR!Jo0=kQZLF`VM77%PTzN zHG-M^W7U;grQ$Hk-S0Adk3lh1JE=XVLx&yv-NQLB7tbFqhx7RM;kPI`esX8+$d71| zoirFUZMP|X5o0ha!?Tj5i5<*lAZPl0CKYOWy(YAor2}v7VXx6+BG0A;Fmu2W`olB0`a34fx4q96+dPx z$D*hCx?Y(YGy~zS8*Xf8=FW{e9kP<#bf>1r@$2~3T~E>t;NH7y3PU`De<%>@2`zuY zJ8i}@cQ^Hn6Hy*!ILdGz!$S-f!*dKT5!fk_(Qs_sIl|8}^>+;a!|*Jx8+Xcf*rMRwkuq=HrDU G-Twz)XGf_3 delta 5461 zcma)A3v`sl6`s2xn`HAOA#Xx9gpe#GAz&geX%Lb?c!U5+ffWp!{C_q}_Qm`gNJ4l- zC=>*RjOBS+5A}ekMa93O^>Nw?s6{DGM*`=vRz05N*Pni**0L#F-)1j zlnNXYAHv2ob9x1HPGm|gQ|hRnTht7*e>z)p4|C37$}*-br=gn`NA{4R7N)G03(XP} zqH=0+RAt^;=Bs1QI<&8yDQ-DDhbbLQ@yPaExev6vo;mB8vW6+0vVR`gy42y-4@Ygx z7~byjs(zPG)eLK^$L~=zy*MWdY`hr{hkqME%*btuF{0(5*q?g=x^-pbY0HRS95bT1 z>Q=k?IBYwB_lxrUO4y^f?O6lVpM+q11-Pn}mzVQ1 zsHVY))jB;v9$2q&pcT#%91@RI7LGiEO~V@C9@QV>%jqFbQ{6s%s;?0jD({EaMQPPW z;QEtQS3$1&oyp_jqBu3VDCGpHsUvU`ysm#dxd}4DrYNqE3vK8fZnrNKjZ?=YUPLj6 z)jYQIqJL_^=r?I1-UDW&_*_~@M zC>_~lLHkm0ghkoF2C0w&3HX->{9jD(nK1w<*%5K;J2f;a?6 zG|$1wMx0v>wYYhpE66X?SR!`v1O!JspHCf`qGZm@tSpj^UmxI#S0&W=M>Kd$Y?)I6 z?}{hp6o6g4HRnM%CYH@zn)nGBfnPj6?$BSD8)LDSkmAGQ?7VqaJ2lh9i1~McL(H8& zH7Aogu8~5yq@7yhsIO8SnIC|gB7VVmn5j=*;0L%@9A8*(eS$>p6@OpY2r;6rp%CVa z8}4lJUdt&sBYxR2N-SM8YP5w^e2S8eC?l!e!?n;dGQEztSx*XVV)3dpaePs7dVUld zY{Nbll?v&Q4S5#cAkHtkXq5+kPCVL}@wH!HZd{-47?igpAv>}xsTSU>&sjVITFnWl zsloJ)jjWK)8iB1khmEJv+UAX;)*A%x5HygebVDPxG7#uChgv*-MeTOP^2OA>P`tXN z1PSu-lJfk!$c_;>mqn#ovLMcq0Cr0XWFcSC^Y`g@E!_(R6*Oxx%5)7!XbJ?&@QX3V z-hSp4KF!WAi0e&NV=Gb0ku2}FF)Zi}g!p7ye;BSGG_2hrPg|$Pr-|z3@~P8F;RMu$ zCON6SgkU*|$F&FiA+62jRSj#0>T!33I4S0Kl;+j)ynoW11GVBxa}oIUFPbZ%u$ctz zBk&_Ql8j{6I+eTJY7NF8TwcR1FJ3mbd>M&&348=$z4BP34NWvahH$4`>&0csc}_Z` zR9v48vq1(V;yq4AOo#EnxwyXUrCDcitbt2M;Z%xncXE;RA$i9!j4z}meuwQyB2{-r zi%rhBf-u$mv@|6dhP29p1WO_Y6RwHTE8g#j;K>NoRwvnlk?BrLOLoxGMl>>5+~m`2O)kX%~(Zn zzgW03Z?a_01(fD4>TD&DQLBYos|ko#(sUxU{BT7aU71@&?s24=xgrk-csq#`r}#7i z55XD&H$ewMr}%8;_^L3y;!Ju;&q)8uBfk>`h+3r>Za1we##-X|s#a)SL>gVRvPP|S z1lmT zCMFsqLK#t7NZ|@SQlv2rlNE#&GkH%?FUIIDju2wjTlINCH!rL(60})wE%etNTf64{avi zB!lFEoz6n?Kr%vV4l51wz;ygEcOKRkXWYeDY2I*uC=*Zqz0&D%5r=-e(`AXqvDRVo zbooZIA?kmKP~Dj%7I|!$#ls>3h|EivYxu)@&{GgWhlU6~-0puzVuU5PyWdRp>+!3UBx^)epa>4>j=*(@egNmM5~*veZM0!yn2W0}QSQsPntK&n)|TY% z#1Z~I0{mh{L@jw6_JcU=tCt(iiavndYjC6WHRdKQVQI&3Ge<|ef(>xjN2QZS8N{#(eq)_GeSq$b45AhrP#ui7=5c-Y`n6CY6*ZA^()I@uawM4?`igqP zX(N%eCW!0Z<@gqv)-wf*+XX$7;E34VQv@gVV?FyJ{RGJjNr@GraYJ!3QQL^A@n|8J zzZbc@eS=StnD6m7gr6NqWTpcaC4V0A~Oz2nKBg0 zsF)e)*B_D2rwN`BeSIZhhQJs5vf!k+)K>&AiO>7~n)x^hoFsS&0mJ!Y)P7OGrrh)= zsCk?qQrE;yh1ox+BQ;A0nh@ zT7c7gE(hul*ToHdG7dZ9Oixg`W|qj(weJinlCz|zFOztTNxvl^+*>q!5qWP*A)M9! zv*l@+X&wVD@+^X3_0l!gcro?NAsFg4>LYjd=210e7ppg9q+xX|{TOTdu}wusW^C1# zhaH(=8m!ao9}_%JKpy3%31on6q}B@r=7przH+nSUrpEI#Br3C8ct1a-zMm01E5>Yd z-14$(TPEBPJ@|ZHe{5S4)SM-W^9WyGQaT4SHA)R@>G5TPR|up+xsqJ-7K`WwDRz4@ zSVh+MJa|joz5O~~fZQD=^>5=OBdXmKGU7tq<=5H++!t|qrjW}dv19OcAcVgRpl`#w ziQL2P9d_X2m^C@$^DhRfo{jt?rN`Im7Ke94gYnzT1!9vN+?RZOF0y`Qu*HuGE2 zgE`X%iBWM_cH?{d6;ZbHNzDFNcNS%=CJFi(g2^{~ew9|4FVc2Rsp=+$TM70M>?Jrz z@EpNug4Ym?Xn&V4*kk7K^J2@ctj)1xPbz{j9Df6$+~D=JmIr&NWmwJsCNJ(G$uxpO zf*b-nfrDTs!F+-Sf@XsI2>b+`;P!f!_h6r~IYv`mtpV=myGW#$V6e*J57DE{>-(rR z7{B>Zd^8fZHczmpyv^H1zu*`#w;EiM`u1qSU#0nj)Z%Z^{B*6;TcWzSzg$t<0={5C l^MpJBKPQTCav{G$@N0sr;>?3%VYm3>gVXSbjpE%+{{=}&A~FB~ diff --git a/tmw.py b/tmw.py index f89d46b..bba2d0b 100644 --- a/tmw.py +++ b/tmw.py @@ -321,8 +321,8 @@ def segments_to_bins(inpath, outfile, binsnb): filenames.append(filename[:11]) binids.append(binid) - filenames_sr = pd.Series(filenames, name="filenames") - binids_sr = pd.Series(binids, name="binids") + filenames_sr = pd.Series(filenames, name="segmentID") + binids_sr = pd.Series(binids, name="binid") files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1) print("chunks per bin: ", bcount) @@ -539,7 +539,7 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in import glob def get_metadata(metadatafile): - print(" Getting metadata...") + print("- getting metadata...") """Read metadata file and create DataFrame.""" metadata = pd.DataFrame.from_csv(metadatafile, header=0, sep=",") #print("metadata\n", metadata) @@ -547,7 +547,7 @@ def get_metadata(metadatafile): def get_topicscores(topics_in_texts, number_of_topics): """Create a matrix of segments x topics, with topic score values, from Mallet output.""" - print(" Getting topicscores...") + print("- getting topicscores...") ## Load Mallet output (strange format) topicsintexts = pd.read_csv(topics_in_texts, header=None, skiprows=[0], sep="\t", index_col=0) #topicsintexts = topicsintexts.iloc[0:100,] ### For testing only!! @@ -587,7 +587,7 @@ def get_topicscores(topics_in_texts, number_of_topics): def get_docmatrix(corpuspath): """Create a matrix containing segments with their idnos.""" - print(" Getting docmatrix...") + print("- getting docmatrix...") ## Create dataframe with filenames of segments and corresponding idnos. segs = [] idnos = [] @@ -605,7 +605,7 @@ def get_docmatrix(corpuspath): def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, number_of_topics): """Merges the three dataframes into one mastermatrix.""" - print(" Getting data...") + print("- getting data...") ## Get all necessary data. metadata = get_metadata(metadatafile) docmatrix = get_docmatrix(corpuspath) @@ -614,7 +614,7 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, #print("Metadata\n", metadata.head()) #print("Docmatrix\n", docmatrix.head()) #print("topicscores\n", topicscores.head()) - print(" Merging data...") + print("- merging data...") ## Merge metadata and docmatrix, matching each segment to its metadata. mastermatrix = pd.merge(docmatrix, metadata, how="inner", on="idno") #print("mastermatrix: metadata and docmatrix\n", mastermatrix) @@ -626,8 +626,18 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, #print("mastermatrix: all three\n", mastermatrix.head()) return mastermatrix +def add_binData(mastermatrix, binDataFile): + print("- adding bin data...") + ## Read the information about bins + binData = pd.read_csv(binDataFile, sep=",") + print(binData) + ## Merge existing mastermatrix and binData. + mastermatrix = pd.merge(mastermatrix, binData, how="inner", on="segmentID") + #print(mastermatrix) + return mastermatrix + def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, - topics_in_texts, number_of_topics): + topics_in_texts, number_of_topics, useBins, binDataFile): """Builds the mastermatrix uniting all information about texts and topic scores.""" print("\nLaunched create_mastermatrix.") print("(Warning: This is very memory-intensive and may take a while.)") @@ -635,8 +645,10 @@ def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, os.makedirs(outfolder) mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, number_of_topics) + if useBins == True: + mastermatrix = add_binData(mastermatrix, binDataFile) mastermatrix.to_csv(outfolder+mastermatrixfile, sep=",", encoding="utf-8") - print(" Saved mastermatrix. Segments and columns:", mastermatrix.shape) + print("Done. Saved mastermatrix. Segments and columns:", mastermatrix.shape) diff --git a/tmw_config.py b/tmw_config.py index c0deaa9..f245b10 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -45,13 +45,13 @@ target = 2000 sizetolerancefactor = 1.1 preserveparagraphs = True -tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) +#tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) ### segments_to_bins inpath = wdir + "2_segs/*.txt" -outfile = wdir + "segs-and-bins.csv" +outfile = wdir + "7_aggregates/segs-and-bins.csv" binsnb = 5 # number of bins -tmw.segments_to_bins(inpath,outfile, binsnb) +#tmw.segments_to_bins(inpath,outfile, binsnb) ### pretokenize ### Perform some preliminary tokenization. @@ -100,12 +100,12 @@ mallet_path = "/home/christof/Programs/Mallet/bin/mallet" inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" -num_topics = "250" -optimize_interval = "100" -num_iterations = "1000" -num_top_words = "200" +num_topics = "250" # string +optimize_interval = "100" # string +num_iterations = "1000" # string +num_top_words = "200" # string doc_topics_max = num_topics -num_threads = "4" +num_threads = "4" # string #tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -115,18 +115,20 @@ ################################ ### create_mastermatrix -### Creates the mastermatrix with all information in one place. +### Creates a matrix with all information (metadata and topic scores for +### each segment) in one place. corpuspath = wdir+"/2_segs/*.txt" outfolder = wdir+"7_aggregates/" mastermatrixfile = "mastermatrix.csv" metadatafile = wdir+"/metadata.csv" topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" number_of_topics = 250 -#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) +useBins = True # True|False +binDataFile = wdir+"7_aggregates/segs-and-bins.csv" +tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. -### targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" targets = ["author-name", "author-gender", "title", "decade", "subgenre", @@ -168,8 +170,6 @@ ### plot_topTopics ### For each item from a category, creates a barchart of the top topics. -### targetCategories: one or several: "author-name", "author-gender", "decade", "subgenre", "title" -### numberOfTopics: Must be the actual number of topics modeled before. averageDatasets = wdir+"/7_aggregates/avg*.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] @@ -183,8 +183,6 @@ ### plot_topItems ### ### For each topic, creates a barchart with top items from a category. -### targetCategories: one or several from the following list: -### "author-name", "decade", "subgenre", "gender", "idno", "title", "segmentID" averageDatasets = wdir+"/7_aggregates/avg*.csv" outfolder = wdir+"/8_visuals/topItems/" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" @@ -249,20 +247,6 @@ ### itemClustering ### # This function creates a dendrogram of items in a category (authors, titles). -# The clustering is based on the topic scores of the items. -# Input: the average topic score file for the category of interest. -# Parameters -# figsize: The size of the resulting figure in inches, width x height. -# sortingCriterium: Topics to be used are sorted by this criterium (descending) -# topicsPerItem: Number of top topics to be used as the basis for clustering. -# targetCategories: Things like author, title, year, depending on available data. -# method: The clustering method used to build the dendrogram. -# Options: ward|single|complete|average|weighted|centroid|median -# See http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.linkage.html -# metric: The distance measure used to build the distance matrix. -# Options: euclidean|minkowski|cityblock|seuclidean|sqeuclidean|cosine|correlation|hamming|jaccard etc. -# See: http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html -# Interesting combination: *weighted+cosine averageDatasets = wdir+"/7_aggregates/avg*title.csv" figsize = (10,80) # width,height outfolder = wdir + "8_visuals/clustering/" From efd5fef23acc77babfb502a3cca8996601a2ba66 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 4 Sep 2015 15:56:41 +0200 Subject: [PATCH 34/56] Bugfixes in heatmap, avgtopicscores, related to numerical targets and std --- __pycache__/tmw.cpython-34.pyc | Bin 35963 -> 36091 bytes tmw.py | 26 ++++++++------- tmw_config.py | 58 +++++++++++++++++---------------- 3 files changed, 45 insertions(+), 39 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 5abb7d2c3808db7a0455f18bed67d12402f8ebc4..4e549fd7bbf2da47311a88ea43b474dd225528bf 100644 GIT binary patch delta 6372 zcma)A4Rn;%nf~rKnVHOFGLs*YOcFvSh6n>m1Q+DTYWN931VSLeAZr>XGhc>D=8yY* z0RjU-QCNSr^l*_@EbZ2{v`XzccErV2j{UJMSS!-AJ?f|K`orS7sON0m+EtG0zR&$a zW@PD}PR{q_e)s3S@4fH)KKH%frEjX29#Y*Od5Ya{Jo5hfk6wS$U0b8BXwOco8DdV; zJ`hXBBI0oE@7RuPWb)JMq>Q2{rk!h1Z9AVqW?#zAlagUtm8F+Kju{Q`vu?aUZJ^ zB(i6Rqdk?>THQe>_s}~(q=?Q;6?;&8x4E1h7AKkm{zItu5|~9sYFJD5TN*tiK5SmX zPG)Ddyrs!cHY<4>&94B~1I~%3 z=LJe0N3jDC1)LTa<^?ML1;qve)23@t)23}V_}ij(ekc1lJ2ci~;u?P)ZNCNl5%3@4r3D@AGf}bd zu6c4zzd`MP0saK|1R!TGux6L4{~on}CJ5U2r{blBE7<2*|BVUeutbRazcR4wGxYf! zpb?mMeRC*1pEA zJU3b;9=mBx;m?%}Q`-BUIe2sPZFVK%Qig2`Kc!@x%1|+0qfChuDiOD`m$6+=C8;PG z>Q*GXoxj|z=qlYJ9`vk=6pK|i*U#`O5g&n%`nW`4E3i|J1wRmh^=2oj&DdWZYTZd38p+|1rW8&zd z8g^gyd$2aSjs|qepI}>q~}KI1WuZO|1&B# z5SR`l6_4o#@5hTRo(gm6^mXVUJ%1BQe-M0WSA&m+nrT9)5f zSBC*b+erPm2Gsd^i2FX^J#o+SM)sa~Y|D-DEh^um4Cxp#Vsor)XUl4fk3%60hF#orv-ou;vZHHu}h+8)l9Z5 zJ8xBzu?}%;^$N#v^y?6RTHVDeL`P>M>k=2E)#CO3CnsCxq|u7>Ma!_~=(UHxBD&U0 zo|d!hhg2K1ncl$|*9~c<4v>g+M4wpOS1yjN@l~`?aOwz5g-lezs#%@NyTuD@&R*t& z?{rmOcI}^bZL63!(O>nFIsIyx$~R`0ubspOECL}(omSkyNi4HgQ>i29=H=8lPM|2A z0lW(6Mpx;I>rtvDFkL#?GM0>J+k=JtHdL<@r`AnpK5=1P>+;3WV-k&3Y-OsSxl|7e zs3pu#ZbnYBiMofKXr&+A$3c&b9oPu*&J=?w8CCp#W0T#mDMNNK)?M941qY}_wjrLX zh<}Jihkh0fQieR>wTwfC z>j#QKHQw=825ut-%9W}-4JIj7M0*h71KUL^BzKK#R3f^1^oF_Ep0%{bky3)5RJw(p zSXd7X{_1>^F4Fd!Rvyh)>(kCHVe}6l)=A3)n)6Ll1}In(5fA zVYub`0qV`$r4v50z9!fWV@{zu#WqQ;WR0wjc%q#6!owOYPcq0T%evasGN>@MX(wjWICZlk zh}1BPiMLl6T0-yMoZ6Pm)9S0aeXvw3ul_qIgtUtkl3`X2Yp6VveSG6LSy2ch1^{8P z`i{DJ(htv4Z$5}hI=K~#qA1=dnE=2`rRvB=r0JZ?q;?AUhVJpevIahR9t|)CJEiE_yYR&*RY^Ka~ci%L&D|@VOfDQCOYzP8J zQOW>z5d?jF1myslq)E5pbsJzO;0pOUKa+=mR6%NFIoqy&FMC2f&_9v4i#{r?d> zW?f6u&}o-5`jv!l>6PbYA8e{x^a{*)1#;Yk=t%7 z%R>hlV|%Cr@4;&XfP;(6yvj#2lhweviO`e0M>>M!=x3qfp#lgV_x@TT9IGeTYL;FGON|6;~ zGQ0w$ycJ$@YoyGaxLkNy@$|iD9rKG{MyD}B@$^pUuZksCo^ih%OEN9ycS123klzLE zZ2u>|+J?PiP0U?2k78+pyJaRt*%3dKO~)Eo&iAZ4ghh&9_OoDdo+ekgSsCVsOI@K3 zKZFM9deWm@Igc_uT9R&#nq>LONh?E_v_K)E@OuDxuG2Q`XG_F};W~CWyLI?s=9AQZ zD@|#t@okjg1`;ielHUKGM((JT`9(mNE*h8~8qibm5hKXICen!t77_aree|7R&+;m9 zDN#i4Gx%iDm~5F1H$Xt)jQgtyp18eke$KhZGmHR+OGl8zDxC{wJU%EO?^%V!MJ5=v zWhu%HdSVH!KOReJNO*dLE!D)noeieWGyB~nDIv6F?9uEx{a(f%5pNj&6^GH|+ki(2f}SfQOa_hb@R@3mFj?14x zaP;#UNR)(W6TjZNx#AJTk36I3Zj8$wMb8hjOSf%d3*}``vMhwXk4{0aC9r%lNmodCC+7< z*mL5~nUAY}hz>snJV!u>%nwjLCD`uTiXWkP9FQ+-_U^{&bJ%?=(Q~Ul`YWcCcM9Xb z$R>90WMiOG%bj0BDR-3afiP)eY?JBKQ{0Fpquhad=?O4>M<<_0&9whA@R3|w_dw|c zxsz0ml0Jdtm9kcuZuK$|-J`Sj#p`<-*_rGYdw#?gS=)d~P7s(5c}3BAGip`=uCyBJ zV9p(N$bAE1)6U9rx*1737Fu=;Yh=jEy_>^u`yBV5?{A*S9P95;CA?kf6fFiP)dv;TOv zm$jckkFx}qBO6;{9kx<}%uj!Yx}O82Ksl3~bKIfuL7;G-k5vl)zB=}@xMAOU`aD{@ zzp+!A=NyeRZG$nxbQwI9)CW^Mkr#LKVNvPmPP*WZj z;^O|dW=ZW|CGm89l;)P|=8b^P0m!#>miIL8y@ges_K5@6&5CUCx73d<5T_ zP3L%lOAr4ti|}7y)P$e$modIVACA+P&}3^w3#St4lpaHJ%Hb&-_T;YtOu(Cf8RGQ) wn`Tb&O!YZS$X1?0kI!fK)Oh?JH~p1)Dm{}tMINuG$}`PlqnatA`+@2I3uWfc`Tzg` delta 6282 zcma)A4Rlo1wZ8i%Gs#RQlRqYt9|DsGgb9#>4HEG|C<(})fsjBz#W2a-B$G^LhI1!` zgg`_<(JC%E>O;l8=cBDlw65B*rCL|53tRsll2={T-X$%xWwqKr|6Z#Peedn>+%ywe zy4tYj+M9S~{FErvEF^VfksGhHjH{DfXvv ziKM){%B1^J*)y6lX^(81eG3iCU(J4o?v%%ZFVn-ayZRAJ>=o?#5~^E2lpcjT*`JL@3!K?vW?w21+Cbc^RynWW3LawcF+$m4BJL zoKB`LpZ7cciqBPyIYQ0OLn|Lp08ne?izns!`Q2>)$@vL7DrYZPN2lb#f^+mrYV+kY z_44O=Xp@y~_C*pRK4=QU)Sm$Em-p6I(J}dStw}GWY8I{{dM7n_#RAGdhwh&PHUY-u z4;BUTA3<{^pd0Xl{QaVU?<|^?3|59|bXyt5uB138E9#o){ZzEhLG9--09t}h`)3O8Qv( z7vHof7!r5Bbt?eh;lcj}ov=N8q4|_EK zo@{MI*G96r<X<>&Jc4}ds>*XmrB{eOahdKLojdnz|HVE>$OQUd>))Fj` zrm)v?O6_D^uJ_O_o!dM+bN}gL*Mt7$!E}KVXnl^&%YLqO{ zdJDDQBAg39n+aLB{H1a`Y_W`rYOxSa*eZ_6sST|;%CSyfO1vg_H^P5C>I}Sd$))WB*bgzD`26>X9(ul_LW0$L58oss$?oA;U*OkBQh4mGA?>taMJnxII z*XQ}>aE_VCFD;{xd{jymy4Wab!>>Za{l8;Cf7Qb;V;zvojPmEN71q z>F!C|F&^O2qCo}R^vz|nnh`HkS*B%p^XP%fl+#YRZp+il?L<-m z$%AmsUK-piwPO`g7|aySP^MT;F}9sS&-OXw_N_(!VQdNnt#KBWkVnsD&J%<3wykgV zOv(Zk_9_>+{`tHGRqSIy%fZBC2p(n!IxlYJGV!iX8vSNVPkb;o$(C*DecA@O98|ww13`0aj>pg}OgDy)+eMDy&359%Fz$Kok&XuslhG zOM5gl6dy>+(6+LL={R%Qai(0QT&sNWdEU2U?9{eWF6+*2n?)lj#|<6S(Sl_{uzwJ( zVF2eGQo%$!fG*|TBpwF%ur@1tLzAStKCZ{avjW6ea%0hYO#Tr z_#%#G{aj=znb$OGtY!>+8(qnfOB=Jm0`S)4@H=d`x#teq7bu z`K~PQog+_&JfpnhUS=Q%2&S8f&}Km{oLJ(}dYMz0^}JRMlP(Dn?7@fW(+{S<>#(l1x{sKxyETY}^D^k)1Yy%QvQ6F2_)*u?nxv zyPEyj(#GXl^A?7dpjS3;{OagLj@fmM3eR+1VW%CJN#pe`j=qWA|3w+dRe3Tj!WfT> zN2rT)3qM*;uB?X=hR9^G<%qElc=9o5)t7NJ>Q%TxhxY)uyF0E)3Ih zvUX#qX}QgKzhTS7L?Bhq{3VZ40TKrZF-4^b&`MDk;w)4yFpjoy6?E)0pKP z&%@(JCDqBF7?smO>vqPeWK(IE&bZx@;wAG{ex|c5%g!=ow8}CS{77g!KkVGv-Zh;r z${V{X`Tg%dx)xCVK@7MHThFnRNyTh{ZMXAC?LSS|N?$sU?8KSbQ$bT?L(eQa!iiHV z5A?hg`lph|HOYbL8upJ^9L^Fq!Z>(J?1P1uCRCApw$CfiMBM(baN10;x2l1-j1e;R zZlsdZp=H5~e zZo-@cfQfj)E=p*re6z2D?n(W%?*YnF#J+|nS$cGk>(YPkpI23jNw+f$7cMP z#lY-PhZ&C!B!l7(`EI|DI;AtV%{hz}FUUx&g0{;OVyZkIo3{YYYYOJu^#0EGKTUicEtMhnlPrs^o@Vb2UDXpX2dxKqPCB~dRjrSz^&m}>ljz2r{;iN-L`l41zlMmT;41Zn5hDvhZL=y!PfAKgW zX!VjCe%;=g{T+<>~*1X3`U>nZx&z?=fhaRI=c)R*vnSm4}S5 zvKk|1G87x;6mX7QTL|w*w(uKXP#53B(8r`0sj361TF$OW_kb`gci4!9MZ8;t`X|*g zOEEfK)-6yvOnPH$5{tN-&tHAJD};`85WqD;DN?gkOrVtI8dlX=n@Rre zMw2ec>-JUAi>c9lkJB>y7_i8b43<;9ZkS>YBx?bedJQtTbGQPPa6q2hU0lR9s`6vD z?Z-}o9eMBmp3d{U_;6;8xwBibjP286+xC-C^aH?C0614X4Nz3wiq;PS>1*=~ojWPZ zbHodnq%P3()=ok8Jm5+BjRQgD@G0`w2a4%$@?(B}HC1)cON}pL#H$Q{&ukotefug6 z19*HE@MC~ds8&*Ijt3N8>kr+WM}_j}%@uS;e*fm*^0&$dZmDWg?iu5mRz_DOX}OXj z6f?WxqJL7})xm`-(6jhOF3BGe+1KeUP2Py-P$Mmo&4)_q4cT|-+$THGK`2!Ee&jL&X%mB;-Gy>KGHUhQ++5rNv6Y$xW5V414oNq>`BQ9d% zFh=YJOjK6l2p(0na1UA!suVkp<~JFvT(dKh7^>-v4&bA&l{MZFD#Aa@BH}sBnus%T z2J?MpUzERC#%jVwXS_cVHzUbNJSGq+0{#?l0)7E_3s5ar-M)QJxo1XRRz7($JlUSS iJcp;uQ{r*+ze11CGsTnR@p}B8S={UPRLIwEpZ)*!*1?Ma diff --git a/tmw.py b/tmw.py index bba2d0b..a9dc2db 100644 --- a/tmw.py +++ b/tmw.py @@ -322,7 +322,7 @@ def segments_to_bins(inpath, outfile, binsnb): filenames.append(filename[:11]) binids.append(binid) filenames_sr = pd.Series(filenames, name="segmentID") - binids_sr = pd.Series(binids, name="binid") + binids_sr = pd.Series(binids, name="binID") files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1) print("chunks per bin: ", bcount) @@ -667,7 +667,10 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder): for target in targets: grouped = mastermatrix.groupby(target, axis=0) avg_topicscores = grouped.agg(np.mean) - avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if target != "year": + avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if target != "binID": + avg_topicscores = avg_topicscores.drop(["binID"], axis=1) #avg_topicscores = avg_topicscores.drop(["tei"], axis=1) ## Save grouped averages to CSV file for visualization. resultfilename = "avgtopicscores_by-"+target+".csv" @@ -704,7 +707,7 @@ def save_firstWords(topicWordFile, outfolder, filename): #firstWordsSeries.index.name = "topic" #firstWordsSeries = firstWordsSeries.rename(columns = {'two':'new_name'}) firstWordsSeries.reindex_axis(["firstwords"]) - print(firstWordsSeries) + #print(firstWordsSeries) ## Saving the file. if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -729,7 +732,6 @@ def save_firstWords(topicWordFile, outfolder, filename): from wordcloud import WordCloud import random - def read_mallet_output(word_weights_file): """Reads Mallet output (topics with words and word weights) into dataframe.""" word_scores = pd.read_table(word_weights_file, header=None, sep="\t") @@ -849,13 +851,14 @@ def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): def create_barchart_topTopics(dataToPlot, targetCategory, item, fontscale, height, dpi, outfolder): """Function to make a topTopics barchart.""" - print(" Creating plot for: "+item) + print(" Creating plot for: "+str(item)) ## Doing the plotting. dataToPlot.plot(kind="bar", legend=None) plt.setp(plt.xticks()[1], rotation=90, fontsize = 11) - plt.title("Top-Topics für: "+item, fontsize=15) + plt.title("Top-Topics für: "+str(item), fontsize=15) plt.ylabel("Scores", fontsize=13) plt.xlabel("Topics", fontsize=13) + plt.tight_layout() if height != 0: plt.ylim((0.000,height)) @@ -863,7 +866,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item, outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"tT_"+item+".png" + figure_filename = outfolder+"tT_"+str(item)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() @@ -996,11 +999,12 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, allScores = pd.DataFrame.from_csv(infile, sep=",") allScores = allScores.T ## Create subset of data based on target. - stdevs = allScores.std(axis=1) - allScores = pd.concat([allScores, stdevs], axis=1) - allScores = allScores.sort(columns=0, axis=0, ascending=False) + standardDeviations = allScores.std(axis=1) + standardDeviations.name = "std" + allScores = pd.concat([allScores, standardDeviations], axis=1) + allScores = allScores.sort(columns="std", axis=0, ascending=False) + allScores = allScores.drop("std", axis=1) someScores = allScores[0:topTopicsShown] - someScores = someScores.drop(0, axis=1) ## Necessary step to align dtypes of indexes for concat. someScores.index = someScores.index.astype(np.int64) #print("dtype firstWords: ", type(firstWords.index)) diff --git a/tmw_config.py b/tmw_config.py index f245b10..9ef6935 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -100,10 +100,10 @@ mallet_path = "/home/christof/Programs/Mallet/bin/mallet" inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" -num_topics = "250" # string +num_topics = "50" # string optimize_interval = "100" # string num_iterations = "1000" # string -num_top_words = "200" # string +num_top_words = "100" # string doc_topics_max = num_topics num_threads = "4" # string #tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -122,17 +122,18 @@ mastermatrixfile = "mastermatrix.csv" metadatafile = wdir+"/metadata.csv" topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" -number_of_topics = 250 +number_of_topics = 50 useBins = True # True|False binDataFile = wdir+"7_aggregates/segs-and-bins.csv" -tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile) +#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" -targets = ["author-name", "author-gender", "title", "decade", "subgenre", - "idno", "segmentID", "narration", "protagonist-policier"] +targets = ["author", "subgenre", "binID"] +#targets = ["author", "author-gender", "title", "decade", "subgenre", +# "idno", "segmentID", "narration", "protagonist-policier", "binID"] #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) ### save_firstWords @@ -151,7 +152,7 @@ ### make_wordle_from_mallet ### Creates a wordle for each topic. word_weights_file = wdir + "6_mallet/" + "word-weights.txt" -topics = 250 +topics = 50 words = 40 outfolder = wdir + "8_visuals/wordles/" font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" @@ -172,9 +173,9 @@ ### For each item from a category, creates a barchart of the top topics. averageDatasets = wdir+"/7_aggregates/avg*.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] +targetCategories = ["author", "subgenre", "binID"] topTopicsShown = 30 -numberOfTopics = 250 +numberOfTopics = 50 fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 @@ -186,8 +187,8 @@ averageDatasets = wdir+"/7_aggregates/avg*.csv" outfolder = wdir+"/8_visuals/topItems/" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 250 # must be actual number of topics modeled. -targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender", "segmentID"] +numberOfTopics = 50 # must be actual number of topics modeled. +targetCategories = ["author", "subgenre", "binID"] topItemsShown = 30 fontscale = 0.8 height = 0 # 0=automatic and flexible @@ -205,20 +206,20 @@ averageDatasets = wdir+"/7_aggregates/avg*.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" outfolder = wdir+"/8_visuals/distinctiveness/" -targetCategories = ["author-name", "decade", "subgenre", "gender"] +targetCategories = ["author", "subgenre", "binID"] # one or several: "author-name", "decade", "subgenre", "gender", "idno", "title" -numberOfTopics = 250 # must be actual number of topics modeled. +numberOfTopics = 50 # must be actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 -#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) +tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) ### plot_topicsOverTime ### ### Creates lineplots or areaplots for topic development over time. averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" firstWordsFile = wdir+"/7_aggregates/firstWords.csv" outfolder = wdir+"/8_visuals/overTime/" -numberOfTopics = 250 # must be actual number of topics modeled. +numberOfTopics = 50 # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic @@ -239,7 +240,7 @@ # Interesting combination: *weighted+cosine wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt" outfolder = wdir + "8_visuals/clustering/" -topicsToUse = 250 # = all topics modeled +topicsToUse = 50 # = all topics modeled wordsPerTopic = 50 methods=["weighted"] # list metrics=["cosine"] # list @@ -250,7 +251,7 @@ averageDatasets = wdir+"/7_aggregates/avg*title.csv" figsize = (10,80) # width,height outfolder = wdir + "8_visuals/clustering/" -topicsPerItem = 250 +topicsPerItem = 50 sortingCriterium = "std" # std|mean targetCategories = ["title"] # list methods=["weighted"] # list @@ -258,11 +259,21 @@ #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) -### itemPCA ### +################################ +### OTHER/OBSOLETE ### +################################ + +### 5c show segment +## To read a specific segment, better than looking in the folder. +segmentID = "rf0166§0118" +outfolder = wdir+"/9_sel-segs/" +#tmw.show_segment(wdir,segmentID, outfolder) + +### itemPCA ### CURRENTLY NOT WORKING averageDatasets = wdir+"/7_aggregates/avg*.csv" figsize = (10,10) # width,height outfolder = wdir + "8_visuals/clustering/" -topicsPerItem = 250 +topicsPerItem = 50 sortingCriterium = "std" # std|mean targetCategories = ["subgenre"] # list methods=["weighted"] # list @@ -271,15 +282,6 @@ -################################ -### OTHER/OBSOLETE ### -################################ - -### 5c show segment -## To read a specific segment, better than looking in the folder. -segmentID = "rf0166§0118" -outfolder = wdir+"/9_sel-segs/" -#tmw.show_segment(wdir,segmentID, outfolder) ### 6b - create_topicscores_lineplot inpath = wdir + "7_aggregates/*-lp.csv" # narrow down as needed From 4ce326c01d51cec7d5901c71bf2649d0cf3cb527 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 4 Sep 2015 17:34:14 +0200 Subject: [PATCH 35/56] Added simple textual progression plotting based on bins --- __pycache__/tmw.cpython-34.pyc | Bin 36091 -> 42916 bytes tmw.py | 266 ++++++++++++++++++++++++++++++--- tmw_config.py | 39 +++-- 3 files changed, 271 insertions(+), 34 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 4e549fd7bbf2da47311a88ea43b474dd225528bf..f0a6ae42c3e7d25b7dfbee2347a056200d6b8841 100644 GIT binary patch delta 3072 zcmcImYiwLc6+UzCKD_qz`<0FD_O`AXHceI`qAiWm21;pqlXpJ-*1w6a(x%4-|1mWG+K{ z1v*c3@({@^$(-RoK{8LWVE9jxERrl4{(h2Wk`==rCV7gaGW-K1eUj6Lznf%0^0?tY zN^+6ptl{q=d79*B48M;gkrY%oa)3@AA$f-6StIzA5qyCPQ&c=kCy$blRgzycvT2ev zl1qj^#;a!%kE}j#-@L3>riNC}FWf8kZqInZTt$^L%JcnfximMQReo@ytfap$j3Ybj z2fI%UintgP+bp%F|CWu5SM}3N{dH;4vPECHL`xwGEXvuG3sEgZ(k2rUT5fWCHNM-$!GmlmOb4X0E%TBp#R?CzMPFy`MdrT2+;bp{ccD933eIHH`NAR&ms<2Q-kHlQ3x&W_ z?rgb|P5bWUR~|-3dL?((I~(9Uh1BA_x|26~3Zy!OdHVOo_$^&5Pd)6X7fRUx8{Emt z03%Xpq+SrZx0E}oUPLd?r?M~_q3YenaBpiz6tX4HxO#t~oK6+|1IhgvujHMrr~{1< zTsNnGS$ZU@UScf=^{eUrJ-7WY+nr?nqtH*beyTh&@>{G=hq#ZY4Xp!h0?j|S8M^8J z4|D-kfEHNY482Ho;?QC#9U#hFy_Vaz`@f?wsW|7T*O>eQWD2-#Lh`0t6K(2scFqR! z)L(eo{Y(%nq#o6s3)|`jimAc}Lb1Y5>%l^@H$hQEEcF(^kO+`}RT-m{f#7 zABI8^AYkoQoZ`qiivpYKkV1$IVWJ#yNSqv+nSnEaiC_fqGT9NEytjVS7IE#)mlaYFg41nA)&~dq4&24&Am1{c-TBL}DEEe_&ZhMri$nLR zHyJ0_qxl7NGuYHTjSx^XR}CdArHr}cIj|=K#$FLf4oD@ z@qR8#*#0Io4JvuP@|kplG`h_p4{el~O<`VN5A!WCyPh@9F=AN}*73ajWVTpa-@>K<$j2l^i2)qS(%A=j<-vz0 zXsLt2Q>!!PO$SA8Ln8*88j(#lG{P8^#-`FJiBpf$i4u$g_F)AFB)F*I5MMKiSattK z-f7jj7TBbB+AEYRMXwaN^)d>98zA8?rV8$+FM?!s^hgSMYr&K2;!TAu*=R%3_R@hT z8#cIxjr%houI`vFc&L$XQ|_kR)2TvMGF|ix;&&9w(%X@&h7XoY-k#(XGMmBK8;loD z;A)mh3iqYPm~d7y7n^X@>-w?9T~V&QK%f4y8k%bK`Zt!{ml!U4u#`gWsI(EhHr^QC zi0A}lBKWXr{$C#-u-b)sTVI@t;}dDW*sh*|lArTVH+~GQzL))~2(Op_-BI;5%=}GQ z|1LkW`mg*&F|bvQ);}mt4y)(c@aqwX-g-TantJ;8u>P(Z(Jz-)+*XkAf}K3gkUNi5b&6$$k%3?zj5cJuGz7EAEmO7DlYK6PJMU4k6eBy>o9$7+i};pc7j zve1pvsNUs!QMG~z{|xKz`{VkDPyVWRFajT-A)z7Agu1f&7~oXO4-kHWmQ*s!ZjxE>WK@ti%U9p;fb-^9LE`OhMYcU U#QBiZf&W2g)Ulk-o&l%xKSqS#Jpcdz delta 280 zcmZ2-p6T~YCgpcLyj*t{y$pSKn}Ojm0}@~cvK@f9xM-uY5wj};LrOYBjvGU+J42K^ z17k`CLrNw?3j;%x2SZ90LrON7?a7b=RL97W Date: Fri, 4 Sep 2015 18:00:01 +0200 Subject: [PATCH 36/56] Added complexAverageTopicScores --- __pycache__/tmw.cpython-34.pyc | Bin 42916 -> 43742 bytes tmw.py | 28 ++++++++++++++++++++++++++++ tmw_config.py | 33 ++++++++++++++++++++++++++------- 3 files changed, 54 insertions(+), 7 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index f0a6ae42c3e7d25b7dfbee2347a056200d6b8841..2d0224f702140357fbc0b5e5ed375c9d3a83d8d3 100644 GIT binary patch delta 5804 zcmbVQ33OD|8NUC^WRgs_$ueY;OvpwcLrP4d)F4YhBP0+)FpEhnfyulHnJ~P+6}&}65JeV`r!9i7QkS~a<HpuCWQG_OP0ssr z-(CKD|Lwc~%aK==_uo|V&&S1^mOk@gli`5aoNGPQ_pWvn%e zsf?+>JQG=0Hfzn1uRmg*T(&xowdOGx9?oa2`AnV0)MB}85>u~WYKd$Y(0a9$sby>{ zi1RRzvVf^oatJ$h>1ASr;+oLH)K=L)YovcR>zX4jDE4#*Q`_axb>Ty-wL;8{a193J zPz_VpGW7=8E@$dGrZ&j-45s!lwMn+GVQMc^n`L|Eh^WP^bum-dGqq3l*UJ8znP&-G zUB_DMP|1F#4#=^kOx?iLLD{~66sxyy{7_3VTs=F|gyxGEgPsXu;%1Y`pHn;7Z(v<9 ztRJx!qImHH}^TxDcl#sZ8+ zaS5onIDrKcS-?!vw{Q>UwuH>;a7ANMdNLif!^`ozIamPwmgRk4| ztatjj%YQ<_@Scc5nPP%uFyk{xu^`4lrop6GApuNEI&iZ#XWsM(NEU(0R`{cMweo7n z(4wyIft3G|glH)t6Q)9_utgS^nQd^k~7R zs~~?FO>h$QAlMUvi4HxKDjfj_Ztq9EN}DGaFQ+MM3DyyW%^lZGeMUr{gh6rqVPeS=8P6|dsDT;Hw{88G9M-{L5G;OEbHoWYQ9RxbyKU0fEa@# zk1^g!u#{f0!B*lUf4E0%T9$c@ba-;e;oM8p$>021f;9xRlKTj{2?7Luk#vXknqrj2 zLqQOu2Z0P;=>=)T@f@WGz|6LE&_rX~6xgT@w6#O~jU?JZLi?#jn)8ha_C(%G?Hn4C z?paT-eFPf_#_SP_gEO>39z&jrf*}`vFPtl;!zM~s@#u=z3=JqkB-CZa>9QpS>Rzu= z*kgueh zq{BkKl5Q%IbRTk4lTz z4F+*&C>8cc|H39RAKVYG|GhACEnj@OWTcWgQkU!PTuG0^afnRi6=IA zH)4EU^j4RWS!?ZPJ;g33yD+h`%0tc?CRS->X;vB0(pf})!B}J$x6@zaXm@$Io>rzn zlF8%tx`?~i7{tW``3gUz<#zuQV23bacB^6`FC0;$mltM`8vH&2d>DU_S~AiOiYK`< z@_u~Zi`v-49RIWmT_Ytlb@;{zMmoB?J?p=jLL_nc#B0m4N@OV#QgS#V$5ZkFI?IS& z>B=nz5q}8vc}QCrxE0!u;%Hx_EQD(9k-CwI;rTvVc8K611bf`rfR(Wt3R8KHl+i+I zUKzFb&}uzutJbB$0kLe|Gm%9iMwzl7**-^2OQo$zFTbQZ%roWxSO%Qu`B!uT6scz)SnL(B4NOq9pZ&Cvga9LCv%^Ebrd9ipf4PMe*gFgk>BEH<1iD!p9z^ zJF``^Z3-yHNO5{o9=xJ`zUespSU)Vyd>kQY^iW#(`4!Z&gkY>8sqcjN^Ok(Gv}Uw! zO{WXZnXuL095Uww-PKRgf?pFnO(5C#8EQ%DX{Oe*1eamzqL8U$`5$PJB;$|@zoovH z2_6@=ZT4aFw`@yOtdW9mD}>jz{o4|t@)er$I>JbplkPf|OA(TsU!cAh38X~1mE3gL zzvz>@wkN`s!n{2T-neXTUNck-H;M;_EO1&oHRK{bx9ynJCWCuCHa<-!2xH@5!lfmi zPW&?UWG2eHcwCx3xnmMI#albF;VmTRH6T{*v}VZKXdwm8y|mE*YVDUTv2SO^2<0Jj z!*224o$0ZX^5MM_N%AmpF@htB%6nEq2A*D^NJw;C$>2mqSvYZTknRlSIZcNn@ zt5+>KBx5ZdL|Q=_fVR^=1?U6mtLZqHPBQY76y13n1LrmHTkPA;uh|Y_MNL>; z5sDfV7M^X1BKHBC$n;J7VHsETeKT$x{isa1sC{+dEEH1ZV+Vd%|8r!V$y@RGR-=gJ zbBvN~j*2jyFSC{^K%J{+>q+wUI*K2oZND*10yHYCsOs-qMUA5lRY$2ttf~H$RAxlB zz=tv93N!GtTEc_zP<;+(1S2}!fnZF4JG}l5A734oE9K-)x(0($__ZIv64)LScAJIY zVd~+^%zD~HQWBO%F<`_maQsy!1OHgOaQJT}@-F)r#Ua=A;vfCXB1{VP<dL~BMbb^ZpuN)iom)hIdf4`}oZ!C8c$#lOnqa&T|4>Y{%Z`uy&I+lT*jpb2DF ZPSy#Lj5vEN9gc{L$L2tmnDtQS{{g4;B#Hn4 delta 5331 zcmcgwdvKK16~E_8vLOk1Vn{a0HVGjnOUN36F$94SiAV(V2F#*_O}=GW*hg}AlaRo& zk)o!SSG*boe2Zui9Pv}DZLRMawVeXhO1rgKaI7P?jMIls$IjSu?zd#u2x9%CVfOc( z`#SfYbMARuPW?_fa7-yWH#s?R;Dt|?pZtok4_VY-4AKkn=R13}h(iZsDlnIct^WbD z<}($WGnlJ@xeJ-KkO8%Ecp9@#V`>>wXR>XKS&Nuj&Qyz}r_%)W3Z_=DZNRL>Or6El zN*ogBATUK=K8sl^nR*>lmr(x}#XeQ{vhW@RXjT44!Ozn~zEfQD9 z*n`cIu4byAsjZTp!_+>ewoAH(sr^h{A?dkN7(DzkW?jbA0j6%0{Wr+|LFQ^??j~k! zLLrBkx=D^TGj%gl1Cnkb!P?AQJ|W#Tb}RH@T3Kw_d@WFKj)7G1*mYJoDwHMFkfqIC zG5}LPz)8MIG*8Ke8IY>*eC_pyd??8W#zF}WyT8-Hy?s6WU7W%*u#=}Fn3MQY>c|s` zOBaGkEL~cwe5{D=OADY_99jB7^5?`?9w3HWw7tvX6(|=EH8vTFiAA>fym2;I#Ehnz zoIL6{L(C+{0-~;!*m3w$#jWyfMB+6a6gix5iip;$1IQ@ODQIw26;8O2DrrTd#y7 zD@|BK;72g0gi>w&c5ZXn>ur7;ZtuhMtE(=z+(c72fsY`G`qipb zaEd>!%7u;Mi&Zb93ZB2IEUlF${Dn@`K%iS@`|6aWwNCEyuk>=&7ZCY3XTqSUy7^-; z3e)P@W?hlEGL-E1_B8nIZeQCv?|>(Y(dx*?+gE4jjF2%?a0eEXp=3jXVnmasDSR`C z2d>W6-di1nBpWU1BIp)t)|i4-L>{A4ttHAqP(@Aj9meehE!0GUIq)a{xD%U027g0m z*I@ZjeEFui6|>dAP^_iLxAfLJz~)5s0ehUt{;)QbTg-BA2*i#R=`lm@6z_W7ZRWMJ^2+G#NtLe(bqZhyJ$dKL8?CCd;~1>=B53R#%?8u zQoYNaVpcKIw0oN|>Rm%yUl!gPoi@gZTTg~UNJR(cW75po&V~FpHFqw?+5EZ*#$M4}8j*Jzw6ohzEwlZ2^fb zH81bKv?qncP!dKaXSZ*et;_DhXbZlX&@L>#Vz^%%4;VGK?-+nv%gP_4B(Ed_6)$*!4{M%HqmUK}gBdhGU8Dvq@Pt?pm;;BzK%f%o zLxEy=T00&XgUll|XF>|75}Sf0sXK5a6kqT3`E8ydzEd0xS~Exz=Zy-AtMKP&bZu2Xj#ESS^r@>p=j9Z_CtMtRt%x4io1{ZmO zkI$r@WdswINPR~{%I!sIQj?Q(O?KN+m36oK+9Rr*s+;(ETJQqF&j@5F{y9-H3@s<> z7X+73b6G^qN&HP(AVYKH_^(sn8wAgarV(>k_1j0XlrLk&qa&qoO#9793e>+vQ;s2= zzuF|XoyFsC$UoD}KGJ_I!NA z2oF1jWw$X&MnoKEe7`t-ze|S24dS)k&EGG+S51h4mx%8?90*58=YUnbJz5O!X`hW& zLhZ#=hxPeyNt#vysh~*J8&OcFu!-v29FJ@TJ}@Z!q^1cCzD-xdcUMH| z$|G~hyR>J-<@RH9ykHH*+FY*Ar~0kO=_DMKa?u7<>K+0Kce1T@!FUPwu;kZW(a9Xd!B=icWCW<1d;G9tF4pN z7eYWUj3`@f|LqAz+M^Rh8lNgE_D+HOME%|xIQ5;~>y>>|;;7hpL;J(NmT+0rmO-bv zkQ33&XeWxQNH66%w9yzYKdwD{?=xV!xX<7m(b1b9brqt@ipW*KX<`2{P+9T!I5C?q z`M)|4NkoOg8Ew{sA3-V=J7(Yy>Ys-M4jZ=NA-{|h3;Gr$NP(%!h!*bBHIo!>l0mHK zUvdrIyhcpP_|a>t^Uv)^u;=2pyq8AxbRxc2$wZP+@$*F-mQ0_eXByDQ@y8gwM1h~t z>K;voh7WN@C}yqGA4>3Zo5#1-%iU4ed=4F&vRx=1U-o`X66S=c#1);c_8hE-tHk>U zjc`_ce(=u~GDV%mr5K6&@S{H;qE=8}0rA%p6Fu6e$5T3LUR1Y;POIk)%tJAhhfKgf zC2p-^{^PrXSxAP`-8Mh{cZJK@W$76tDrC^VdWNo}EwTwp3GxYw2`mKH5;POE5m*V< z5cml;5)2Y-CZM+}--3XOu-m$VUhd($s4qw`{%GQYn$qFOZ6YNoKZwmxvaj3OGid47 z^KB?TOmJCV9Hs?N6Hr0LUnLmN^ZYn9Wr(D3l&i<1@c_h6(eMWZXT;f`YB~h diff --git a/tmw.py b/tmw.py index c2f728e..dfb7d1e 100644 --- a/tmw.py +++ b/tmw.py @@ -681,6 +681,34 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder): print("Done.") +################################ +# complexAverageTopicScores # +################################ + +def calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder): + """Function to calculate average topic scores based on the mastermatrix.""" + print("\nLaunched calculate_complexAverageTopicScores.") + if not os.path.exists(outfolder): + os.makedirs(outfolder) + with open(mastermatrixfile, "r") as infile: + mastermatrix = pd.DataFrame.from_csv(infile, header=0, sep=",") + ## Calculate average topic scores for each target category + grouped = mastermatrix.groupby(targets, axis=0) + avg_topicscores = grouped.agg(np.mean) + if "year" not in targets: + avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if "binID" not in targets: + avg_topicscores = avg_topicscores.drop(["binID"], axis=1) + #print(avg_topicscores) + ## Save grouped averages to CSV file for visualization. + identifierstring = '+'.join(map(str, targets)) + resultfilename = "complex-avgtopicscores_by-"+identifierstring+".csv" + resultfilepath = outfolder+resultfilename + ## TODO: Some reformatting here, or adapt make_heatmaps. + avg_topicscores.to_csv(resultfilepath, sep=",", encoding="utf-8") + print("Done. Saved average topic scores for: "+identifierstring) + + ################################# # save_firstWords # diff --git a/tmw_config.py b/tmw_config.py index 2978ca2..1b84ede 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -136,6 +136,13 @@ # "idno", "segmentID", "narration", "protagonist-policier", "binID"] #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) +### calculate_complexAverageTopicScores +### Based on the mastermatrix, calculates average topic scores for two target categories at once. +mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" +outfolder = wdir+"7_aggregates/" +targets = ["subgenre", "binID"] # 2 targets to combine +tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) + ### save_firstWords ### Saves the first words of each topic to a separate file. topicWordFile = wdir+"6_mallet/topics-with-words.csv" @@ -250,13 +257,6 @@ #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) - - -################################ -### OTHER / OBSOLETE / DEV ### -################################ - - ### simpleProgression ### ### Creates a lineplot of topic development over textual progression. averageDataset = wdir+"/7_aggregates/avgtopicscores_by-binID.csv" @@ -273,6 +273,25 @@ +################################ +### OTHER / OBSOLETE / DEV ### +################################ + + +### complexProgression ### +### Creates a lineplot of topic development over textual progression, +### but does so separatedly for different target categories. +averageDataset = wdir+"/7_aggregates/complex-avgtopicscores_by*.csv" +firstWordsFile = wdir+"/7_aggregates/firstWords.csv" +outfolder = wdir+"/8_visuals/progression/complex/" +numberOfTopics = 50 # must be actual number of topics modeled. +fontscale = 1.0 +dpi = 300 +height = 0 # for lineplot; 0=automatic +mode = "all" # all|sel +topics = ["25", "44", "12"] # if mode="sel": list of topics +tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) + From ee0eeff3018091398f159be513783b6d9bd5c5c1 Mon Sep 17 00:00:00 2001 From: christofs Date: Fri, 4 Sep 2015 18:32:36 +0200 Subject: [PATCH 37/56] Started work on complexProgression --- __pycache__/tmw.cpython-34.pyc | Bin 43742 -> 47229 bytes tmw.py | 139 +++++++++++++++++++++------------ tmw_config.py | 22 ++---- 3 files changed, 94 insertions(+), 67 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 2d0224f702140357fbc0b5e5ed375c9d3a83d8d3..182978792a8bc9a6be719c150f7c9ead6e91264d 100644 GIT binary patch delta 1616 zcmaJ=eN2^A9R5A$-uqs-ycf~zPv{vcXIj^~~**e?#z0Wzn_dLJz zKJW9pc$z)@g-yI`dOe$t-}aS{23?~e!=Z4tnxRbRv59StrdnztDqBpvj(EL|^NFt~ zzQM)?#2bi5Y`lbcBk{LvTu8i$c#(}~65mLCsf}MCzKM8=jb{_FA^u>pg<9o z6%lVH-eT_;Pqh=R6j(~tJ}UDO_!i=C+sP8*ZN#_Qc$q$3oY5{*o%6GhDMrrmixczm zn|KUGK&#zkou!CDRbEm%l^`BP+(XSkqd_&Qv`JO56dp$5IQ@PzQz>e66ZcZYO-VeK zk{Y8Z!YC4@P_#`ooFdW0UBnII!-&VJBMf21r4s6lA^;bQ<}Mb;Un*r*iA>JP1jdQ{ z+{74|NOp`wEIg3ax$ipggJcUn1^$ws<(&hV+_n(FaJhZaN(U0gA6tD`YZQ=7J2^F9H3W@UC|2clLubA%5Y4!m3tY!m)#Y0z;80v z-^TDj_EZXn(Q-=Fx4?9HcXhYsY^{C>tdPyYqYOWw zbJ}+TdGg0?&oTI9^o}nXHpuTf4Pcc#*>wtNlDoyr4s^*Y-ES})5xKkPh`rSbl6CaK zWt|3*II?}_lAcJ_!fyW+&6&f3%mE@g72oNmyDnH845JnSDrCN6cRxvX=#l-bprHtC5MhE@x# zs`0N;wSE>SyC$VvQ->;?idYY~nqDD9<#yNVz8B(w2jI zR4Gfj^`gok%3Ov&s{ytx**2Y zjY)qj%zv{!7ACLrKJ!i8 zP~2hf9HxkaR0oC~6m}AK>gB)f8b$T9b%W-vNE63;CXHX)K+9@8Ht!>Gb%jS#DuK0# zPxUVMSeMkTtjqelntNx!ExzgQOwUh8*Y{0IM+M7w_$1Xca|!FS2!pm>QZOeIMQgcF3%c6T$WvqNMh9@%g^3 zXcu$$|2q7TW>=F)X&wjbj+}SE3k-^agB>^_E*u<%d*b(l*Hygm$^4WvLw}Xm5e<`# zfpuS9A){1g9Xf*1+Kbi=9klk=Egf#_a7X-fI5}oe!+Sd17jgZQaa7Fc&qj)<@6Ssc hX_{udnP`qMlg%kMidQJzj8QuYW{Sy7Z(O|T{RfV5%sK!7 delta 857 zcmYjPeMpsO6hFW7-g~9%bG_!SCR}xaR_m{U5je$Vxma`gT`5x8@>w#`)vmf*>y~TI zaL$Z+#-ds&%%E>|r?A==1i`jGNX8%xY9+dATxqbi1-0#cU;3lx;hgj1oZtC)IM36E zeCrp^nN|UR<#+d27J5Mw^xUy*Tx3h@BWon5+v=#IkBRE?NWV{dg^OPy9VNZW#rdQ+ zldg4f0cj!)YA&U^Qqo&Uf8hGdT>n;TE~l0Xs;eOA80l?ptdjJ0(j6|Ya+0g(e|X=S zrmTS-L(=HUD@3XpSS<6}h+;nKgk2z!nv$q3NNvd!19h_F-IhRFQ7pj}Eee?vOM;n6 zq=R;VVJB*vm_W1K8{!cl*m8TF<^zXue<}Sm71D3 zW0Nec`wQ491*?mJ1Jbdk3pgU)`nwE~dv668KgpwpDDbDe`*t_uU-`SyFf!!Rri;Kr zDQo`RsX5v56sVVpji;G+c+A*v*qn~s#(UD&n#AalL+!J``<%c!lNKG|ogVYeNVeRJ zh8SPS!!17o-^+!q<%}Q1zrE7JxUAmM2~5kcA9XQ&vZ3pc^B~VBhk*t1WzTM)Mi%rg zWHiV-edih7lD)?VM9qo*Ia0p&7_dtoo7XM$%i?|O85hjp{#WHvPbEzGQ7_;1%BrK| z;J8CtL6OJlfb0u7!4;**`7$;QPbwg=v)k%E2BloGJ$h%WacQPLMlzAsIYoT QCE7MsWhhpG Date: Sat, 5 Sep 2015 17:35:55 +0200 Subject: [PATCH 38/56] Fixed extra slashes; number_of_topics --- tmw_config.py | 76 +++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/tmw_config.py b/tmw_config.py index 41d08c2..d749995 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -17,7 +17,7 @@ # 3. Posprocessing Data # 4. Basic Visualizations # 5. Advanced Visualizations -# 6. Other / Obsolete +# 6. Other / Obsolete / in development import tmw #print(help(topmod)) @@ -122,7 +122,7 @@ mastermatrixfile = "mastermatrix.csv" metadatafile = wdir+"/metadata.csv" topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" -number_of_topics = 50 +number_of_topics = num_topics useBins = True # True|False binDataFile = wdir+"7_aggregates/segs-and-bins.csv" #tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile) @@ -158,10 +158,10 @@ ### make_wordle_from_mallet ### Creates a wordle for each topic. -word_weights_file = wdir + "6_mallet/" + "word-weights.txt" -topics = 50 +word_weights_file = wdir+"6_mallet/" + "word-weights.txt" +topics = num_topics words = 40 -outfolder = wdir + "8_visuals/wordles/" +outfolder = wdir+"8_visuals/wordles/" font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" dpi = 300 #tmw.make_wordle_from_mallet(word_weights_file,topics,words,outfolder,font_path,dpi) @@ -178,11 +178,11 @@ ### plot_topTopics ### For each item from a category, creates a barchart of the top topics. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" +averageDatasets = wdir+"7_aggregates/avg*.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" targetCategories = ["author", "subgenre", "binID"] topTopicsShown = 30 -numberOfTopics = 50 +numberOfTopics = num_topics fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 @@ -191,10 +191,10 @@ ### plot_topItems ### ### For each topic, creates a barchart with top items from a category. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -outfolder = wdir+"/8_visuals/topItems/" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 50 # must be actual number of topics modeled. +averageDatasets = wdir+"7_aggregates/avg*.csv" +outfolder = wdir+"8_visuals/topItems/" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +numberOfTopics = num_topics # must be actual number of topics modeled. targetCategories = ["author", "subgenre", "binID"] topItemsShown = 30 fontscale = 0.8 @@ -210,23 +210,23 @@ ### plot_distinctiveness_heatmap ### ### For each category, make a heatmap of most distinctive topics. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/distinctiveness/" +averageDatasets = wdir+"7_aggregates/avg*.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/distinctiveness/" targetCategories = ["author", "subgenre", "binID"] # one or several: "author-name", "decade", "subgenre", "gender", "idno", "title" -numberOfTopics = 50 # must be actual number of topics modeled. +numberOfTopics = num_topics # must be actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 #tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) ### plot_topicsOverTime ### -### Creates lineplots or areaplots for topic development over time. -averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/overTime/" -numberOfTopics = 50 # must be actual number of topics modeled. +### +averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/overTime/" +numberOfTopics = num_topics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic @@ -236,9 +236,9 @@ ### topicClustering ### # This function will create a dendrogram grouping topics based on their word weight similarity. -wordWeightsFile = wdir + "6_mallet/" + "word-weights.txt" -outfolder = wdir + "8_visuals/clustering/" -topicsToUse = 50 # = all topics modeled +wordWeightsFile = wdir+"6_mallet/"+"word-weights.txt" +outfolder = wdir+"8_visuals/clustering/" +topicsToUse = num_topics # = all topics modeled wordsPerTopic = 50 methods=["weighted"] # list metrics=["cosine"] # list @@ -246,10 +246,10 @@ ### itemClustering ### # This function creates a dendrogram of items in a category (authors, titles). -averageDatasets = wdir+"/7_aggregates/avg*title.csv" +averageDatasets = wdir+"7_aggregates/avg*title.csv" figsize = (10,80) # width,height -outfolder = wdir + "8_visuals/clustering/" -topicsPerItem = 50 +outfolder = wdir+"8_visuals/clustering/" +topicsPerItem = num_topics sortingCriterium = "std" # std|mean targetCategories = ["title"] # list methods=["weighted"] # list @@ -259,10 +259,10 @@ ### simpleProgression ### ### Creates a lineplot of topic development over textual progression. -averageDataset = wdir+"/7_aggregates/avgtopicscores_by-binID.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/progression/simple/" -numberOfTopics = 50 # must be actual number of topics modeled. +averageDataset = wdir+"7_aggregates/avgtopicscores_by-binID.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/progression/simple/" +numberOfTopics = num_topics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic @@ -281,10 +281,10 @@ ### complexProgression ### ### Creates a lineplot of topic development over textual progression, ### but does so separatedly for different target categories. -averageDataset = wdir+"/7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/progression/complex/" -numberOfTopics = 1 # must be actual number of topics modeled. +averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/progression/complex/" +numberOfTopics = num_topics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic @@ -297,14 +297,14 @@ ### 5c show segment ## To read a specific segment, better than looking in the folder. -segmentID = "rf0166§0118" +segmentID = "rf0166§0118" # indicate here, manually outfolder = wdir+"/9_sel-segs/" #tmw.show_segment(wdir,segmentID, outfolder) ### itemPCA ### CURRENTLY NOT WORKING -averageDatasets = wdir+"/7_aggregates/avg*.csv" +averageDatasets = wdir+"7_aggregates/avg*.csv" figsize = (10,10) # width,height -outfolder = wdir + "8_visuals/clustering/" +outfolder = wdir+"8_visuals/clustering/" topicsPerItem = 50 sortingCriterium = "std" # std|mean targetCategories = ["subgenre"] # list From 22b24daa187d297c66a1ff3f606018f05812b6e6 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 10:42:52 +0200 Subject: [PATCH 39/56] Fixed issue: https://github.com/cligs/tmw/issues/10 --- tmw.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tmw.py b/tmw.py index 847f721..a16e0ea 100644 --- a/tmw.py +++ b/tmw.py @@ -216,12 +216,12 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs # segment contains words assigned to the current segment segment = [] - # go thru paragraphs one by one + # go through paragraphs one by one for line in infile: text = line - # remove special characters and space-chains - text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text) - text = re.sub("-", " ", text) + # (optional) remove punctuation, special characters and space-chains + #text = re.sub("[,;\.:!?¿\(\)—-]", " ", text) + text = re.sub("[\t\r\n\v\f]", " ", text) text = re.sub("[ ]{1,9}", " ", text) # tokenize text From 470ddb161609ad3f91144f541df1313cdf6570e6 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 10:49:29 +0200 Subject: [PATCH 40/56] Fixed issue with ambiguous lemmas, https://github.com/cligs/tmw/issues/13 --- tmw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index a16e0ea..b620786 100644 --- a/tmw.py +++ b/tmw.py @@ -448,7 +448,7 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors): elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) elif mode == "esN": - if "|" in lemma: + if "|" in lemma and "NC" in pos: lemmata.append(token.lower()) elif "NC" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) From a8d0a31555f2aa2c04bb10efcb320645492c072a Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 10:58:36 +0200 Subject: [PATCH 41/56] General settings at the beginning: https://github.com/cligs/tmw/issues/12 --- tmw_config.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tmw_config.py b/tmw_config.py index d749995..1ed87c0 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -12,6 +12,7 @@ # For information on requirements and usage, see the README file. # This config file is structured as follows: +# 0. General Settings # 1. Preprocessing Texts # 2. Topic Modeling # 3. Posprocessing Data @@ -22,8 +23,19 @@ import tmw #print(help(topmod)) -### Set the general working directory. + +################################ +### GENERAL SETTINGS ### +################################ + +### The following settings depend on the system used. +### Path to the working directory. wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash. +### Path to the TreeTagger file (language-dependent!) +tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-spanish" +### Path to Mallet installation directory +mallet_path = "/home/christof/Programs/Mallet/bin/mallet" + ################################ ### PREPROCESSING TEXTS ### @@ -64,7 +76,7 @@ ### Perform lemmatization and POS tagging. infolder = wdir + "3_tokens/" outfolder = wdir + "4_tagged/" -tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french" +tagger = tagger #tmw.call_treetagger(infolder, outfolder, tagger) ### make_lemmatext @@ -83,7 +95,7 @@ ### call_mallet_import ### Imports text data into the Mallet corpus format. -mallet_path = "/home/christof/Programs/Mallet/bin/mallet" +mallet_path = mallet_path infolder = wdir + "5_lemmata/" outfolder = wdir + "6_mallet/" outfile = outfolder + "corpus.mallet" @@ -97,7 +109,7 @@ ### num_iterations: How many times the model is improved. ### num_top_words: Number of words to save and display for each topic. ### num_threads: Number of parallel processing threads to use. -mallet_path = "/home/christof/Programs/Mallet/bin/mallet" +mallet_path = mallet_path inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" num_topics = "50" # string From 1c598a6218210f2728ad9e757342afa6343c3684 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 11:54:44 +0200 Subject: [PATCH 42/56] Fixed issue re numOfTopics: https://github.com/cligs/tmw/issues/11 --- __pycache__/tmw.cpython-34.pyc | Bin 47229 -> 44424 bytes tmw.py | 59 +++++++++++++++++---------------- tmw_config.py | 57 ++++++++++++++++--------------- 3 files changed, 59 insertions(+), 57 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 182978792a8bc9a6be719c150f7c9ead6e91264d..cd15355dc422cef5d1e5baab998eea1fc4a3f51e 100644 GIT binary patch delta 9520 zcmc&)33yahmVT$IQkBZSZ$*U)AtVI^!;YXV5;lQ=kPtvHR8p0sl1f#$RV)b!fq4(0#eI89O9p!gOj zOBX2a4#mAe*{mo_7bCS?m8J@1X@vsrPQ|@Rwk=ZJn-%xXvRui77dmfImzs*> z4KD$%9FT$glADspL0gQ{5UsQ+$|g`smlvWv#Q}g5Q!*h57o~)vo0YbBYN*H8Q$}OI z)D%d-t5Qqi61hLPjWlaAHl{wG&}zBS8XprIw_I3BXmxJdH2~vqYkEnMF&)y4aS#uM zAZAlr4Qs}Kx%9-+GnJm;eMS_IJ zU{{x9?}yaZtl<;INwDYWyz84q0fo6~P>wZtD-zu+}zqauNwFk{MD7 zG;=6WMV+0wqCeL;WF1f8uNI5BW(h+ALkhzje8cuHrXnih)_#LvH6H30j81|WNCDBJ z4zgDRRN-y8N5PF#`tMC!M;)~&;i>bi6@#hl1ks6~_s@mJnCM7{ZEAsI7r-`rI&Tj)ndloh9M<9e1D*YLQ$>x5fdA6y z$)bmkDfk>8O9p;Dux{W`W1CrNF!plAW^A`AA&b&xRoo`I*Cr!wE*KF#!O#{7IFlgxPh&h61 zz@7M0=KHrnLk$)qrWFqB_z~i?85DP_(r8dbOph+4*UJVs5mVRkWka_v$;t(~gnRTX zJLCs!gW6R%2A~}?om-$4_c@2aaXjwak+Pl6NQ(+K`@A7Bj>=*hHVv+TnRsk)AN}X=-wXJZnRHaPpAyj4a52f#5Ie{Ieac4t{L78_u!e~Mer3SkI08(IC{heGL9o7<|R7} zB#4;>ffz%sK^#{zMiv-s&rwC079DK#HjBWTpdfJ-@jSyGTr@g2Yd#n6CD6<+YOZ^$F$Eabcuv4UGoUSFU_RB_SRN^87`e;I8v*Km6! zCXUIC5>HYYCyf~jLG+E;3WfOPm`U)dI(F<(NT197PBFa2P|olgt{vw{`Xv|TGAsm5 z`^Y#)`g>f=BhZZG*)$`axj2RIj~fc}Fy`t!IHwN1Itr@fgglqite^049(Ru6QwFJ4 zRFg;4d7M^pmaAT3_!EJa)Z_}1n#3sKT`LcQ6%a2oe9mwNcZ?qjEAZ*@*|x8!qJ}nJ z{;_+p&@+{u;1oPJJ|A-@Oc*OS`vs~Ke`7e$@CAdE(4!-tw~q+|V01#_#i)bkU{!K}}WbFZzQ#)42A0mV-?pn6wxuW}Bq{|7^_n(|aM z%c`K~I&$mcYi|4!zqqy#I!Jy?B(Iz7!dvV0h#--|Uni{G7Q--D!Li#Ca8qeMsQ7TH6B5*ur4tOJNyMG1=#(^>5-IMm9QWB2pJ= zCv1b_O@?ax?TkUwrGjxsA7J6k z@o)mGXO5-7yJzN!*}6Vq{iNa2EZv$(ZdhHx8js&iTU9>YY0~X2M!J!ZtJ>$$Ou>K% z3Bbd%mZyH(OL2n@21BAj2v9=K-;6NOhlwF(^Lb>Gm^xj4#g4hba7Pb{@H;_&K-3e%w80VSE}{ zagao!Y(bQ$U(c}^%vbS@d(k{^TI>oQTFAe1aM8Sx@g=-MRLC15f4zvq2p*W1PKgY4 zkX}P>q=N?y=eh|Dr}3kC4YZka=MRAyYRh~-P;@(U!z}Z!xZg}1yI?+y#9J5S!vcJE zYc@W=;82ym=`>oA#;4QiHTqdD9cT9mN*!sVQn}wnMEZ>CoJK97)=MO2HUuRdbP}K| zn2Ki>CZ^A3?PI8`Voag%nGJRWS$ACdiDKQ?P(ztBT6bCDRdjQwHq9g{T8YsWg``wT zMQM6P?~H~7ij7%@6oXi%-d2$Y)!`|155rv4;V8Z!GuaIJhdA zpghoAOpmuF=%_vRlo;<2ui|w}Mi1e$b0+Jmv(ag$W?vvACi2oITDn;?uMK%?8iQgA z{(Q+bS6|1oUPT>A8W)$BFjR4OYi+_%>2@q|lCFl1RGY{3c3Y|w(0%YJj6PC6WPo4?eTP z-XUr7D)koiTpuD zkYY%wbZUK4lR(-2KWaupM~{b_tJ90->f6JKh17IK*QU!A!L(C#@%_V0`+Kh+3VXH< zYNy8!9lgX0`&P;?LvGy{Ns)9y$_(N>i^$THIg>8W&5hJSEaa{=4D6uvEJTmxlw!6t zdt`>96-9adp)sSx2JW+*fE-^hnRUyRv1BIhs=LY@v5lwdGT}S?vaSHOsP_6AfU&sI zTUZpP;|;WQYm%F?hT?jUE7as_mMrWhAHL?z>mMeYc4gJB6w>RVgs*g7$0F>iXjRPia;cm$K#p-O5L~`) z!$i`VQ{uD37%cM@!kxsNk6ho2HP(n@k}0;#tT-U*cb!faOeTw)SUTe+OnRj$E3?8x z&VsLtN)LZ1meB?AgWcI`$0N}ZEYkC1-8Edu7-xKhB)vgXk7#Ku1d~HtD2}*`8zhM& zH>_c9XmNa9BlpgJuAg5|-J}6*<}tT0M9}4xrUa*(f<<{?J_W87tWR3EKIHt2*`!Y# z{ z3)v0gipQhIg-O~SZXLXyWKtKGBuP2=@g<(4`y5xb$0wuhP!PB+SOEL5WPP^!$KY{@ z+C)Qc$NN@|j+HsP)T;*HUscqje(dTZ2dgjkGSH)lf8vrfzI(Z(Z$SUcZGgCsI^Dyn z&zz>KZ3s`(oo8rKuI6T6%jLs@j0N51+OrdvnyZ7QyJ#owXA#+42jYP>(S|K1eCklX zK|G|szorcW^2p6@PmPOW>o2HlYm|ItQbrzFo(;e75W9GeeGCr~IO8t1YH846Tb7x} z2p-tQAWi#j9<7`9yDe#OAAa6a3=d;&>$&cHL;B`D+`o@|hOp6ax?yzR@@2-`k7tzb z7Vw63_36(r&t*nS?`QaeL00>?`s%usP$e%I(ns?lf6ASl3A%p3vxK9eG|p763tzJ2 zYSk>+GzMv*?{JB86MC)i7~bEWpTX4EOl)@g?c_u5@hHCCJ_26CnDqsqswM01hMWUD zxo$pOlG%i`w7J;TF(7dpRce+}^5iamtJsdo8?GOa$Q`ez`699j?_#>TI1S<`w;jOt z4f!w;AJ~xP2>bMlG7EWs)YFN~unk$n6FkFVhF_y)V-e`Ced@+6co&y#EP!XQbL08U z$GL;b@C*S3u3vKbcX&sqE&U)DA7hA|i*FhyI+Nh9*wvW{$JDPn*TFT<@vcx$Mf{N= z9FS!g?KMHo!uLkLfXiFGWP}Y5Zq6JMK6;rM%3-pPWWpOXDP@RjSe7MtZgU7MSbK9m zysU1$`2dX3wdBSl1e)2$o*^iPaZMRRU*~cF{af;rZy96^eu_)d|5tM9X@>7}Xc6WfD_-JJGBAgAdYn*svY9?a0oNL5kYxl?`F|uA+ym+$)Bx zcFf;7|0e|MlCYEcUjnu0j#1}!jsqV)x~~A%VELOo_HBkm3@dng8BgBfpYIZoGYRjr zT>i>QfjkNqk3Nm-cNf4xd~o;GaO(fTJ|pDrw#h7#>LP-4XV>R;rs0Xb1K~Ysm|Z`@ zFr9rdgSMe>3%znu31)oo{?cpTqTv@1>BpqiKipbZ?zCB4QfLwlL1 zAfMDw7h5;;-0kM*fh!iTXz?Yp>yM^nW6h_E;rkR_LQPH&EGvfroaQ-{)07K=c0b z9r`<7ErC8iR(k|}fs}V9aaoH$O+$Zzu`|;nFNk<>e+G=k7xs@deL^$isY4IbTZAo7 z6eKS*lQ*V^zKGG6RuG@z(@&g+d-3OLeuvEc&QgEPSnCaG){t=dgS7$C6ycx7@lu>| zQ8HQM3Q>|mxq3KnipZ9zk8n`$p4X|R-X@Pf=neRTnz6aYC4xND1wxk|PPNhhDCmwB zbll;IeG7DY$i z+@ik^InK48GMr-we`NfQf5!3-B`{=T*^&MoGLN0gzjGJ{GvqN8GVrGoF@s?NLj}V! zhH8e@3@r>7zIhZKR7XeUaa9L|U+mx>@GcYy98HUfW6N27P=}u!U6)!Mmll^nk1ehs&JY)ulol8NKUz(!Bme*a delta 10270 zcmc&)33OCNy1sQg>1>^SPbbhJWN9EkkRSpMD3L`lAS7%9#!k8k9g^-)-C+sQuqxm? z3j7KJE~B!@$Wu8j%8ZWs+?~Pa=(t{W#O0v4jtf3if6qtb0TM@gI88XST%nl}{b5b>#`|JuTv6;qn;%28Qe}dd$+)HO46S!XiMRlemh{~+<}Fo-(7KiTw;8JZJvzd%I`#Z=y>`lAx6-!%)w4QQ}92F zO(cp^At!T#2J93*1^i9m?^yl@7SZ9p*|fG#9zC2XMJ`ojIYnORw^?OE*h6cwt3^dH zL+cQlJ4;)wYuDvzR|u^mUu()#mD!p*heZ@(Cd+u~EG1}uXi)BMK_BK7@Jg-rGI0qF zviBC_Xo~&ZVd*@lVGa808Ul-*1+q6k$}&|~@AUQ|9<^YlaX@E=UDf>ZMonI_k zX>Wc&m+hgx1@%JArK@{AAY3%M_l^wSV$Fz_o<`47*^kRkAy?C9y?bTX3SOm+YqY2| zk<8nXZ5gP`>q142twLN!4;Bs-*U(3W(ZHM(c>v+)-+$ztw3~@sQX-dPf2iq?8^imL=coZkyJTqAj#)lFJEN zN3ymsMwJu8WqP4BBl>%N(N8+HlnfXW(JAnua3}w)0p2eepntDZTIs;NGfy~EP1vDp z?sRRTuF0QHh{vKntasS7CYCl;YqDvpl`@D$JRURa>=zUIsC1YRS5l$#YO$R5I{S-f z=w;`|w2i#0Ms%RH*&CF@xh%)gl76*fB0blyR9sBQ`i&HKQbqq9v6U|FUz26y)9fo8 zdLPyghh8X(m=f~7(0%i?kf!snX%N;q24`8d25cDtf`07j+Zlgbxmx#l3xZExdhJGmjK%d&<_=@LhjATp) z1(&G+HDiiAfRW?8zSg!N$R-_lj`y{-)CFm1#U!Eo6xE!Ni09Zn+>!Zr6bl&)OTg1;SUgLE@*R3(_y94PV$UlS zUxoUe7cJ`4fEdeZHIMla`h5lX7@*b|-Qtmr9%r2V3{`&y7z{>Aiz~ouZgB;rcd0s* zH}LK+fG+_b(X}H6h{8hGh5`8|Rh;27(4a_{HjvRHaI)kUV zh5Q;;?vG6b-~@brd>GJ$4Wc zMp1xhfE5q}$iv%p060vwSzXe}SiEciC8?7$dUO0_95Hcly^a_;87R&ias?kM|CIAskD89(%* zFcFl6h+tCv1)E zgH~#xX;@x-cawm87q2D(jsOl*_Jk4Q5KWtKuE?Ve6P}-Bt|+`hrNTT)*V+_4;#`3S zkI&5;RWr^RWoll$vMxbaeY3}i3ixGET4?9QdFkJQ;<4Oc)9s?4NY-VaP{zec;(XXD zIUT?Xr8%QzAIKWV9I9)S{Qk^t63#|-Eqy*CFBwZT zV%XbPx!4$`QO?YP9C41HSs)rho|*eZ;c&EtqijoMsPVU!^Oqf&WB%h0$q^3uJPny$ z*&pZTOf{Dd^)sScoBcsK8dFE{)UAefY0%rSFd#3a&9g_HHx{ERGQ`Syl;;30NAtM( zeqS)q;9{MyF7S987X(dKICxk&UL8i>oIK&9$~mQCNod}jp`vphdLy)zOBkFe#q3JaA73rc4FSifsa9G)+td@$T_Q)6{wC z**AhJD#B;-

    *lhj<}7sD0j>3nI9p$e|d+kzG#{Du!S}Bbs^ck=)Fr15d6DIC}fC_`EwH6hb@AlBVZ~I6aqHoTAH4LC=lM3EAs9MI9%P z>{kHu8LHc(c=H`?iQGp~$Pomcm`I9f&vUi(qHz;IxwS@=763p)wKk})`p9NFe!0C; z(Jw7bKj}kruuonHSOh>%=?Am|g0x~w!H9u8j103e&OB6gYU;F);C)YoFqs%~<%<3E zoU2r<2px0Pi~5-u+J#OXD0Ko>FgTOtQj{Hdr4D)-9@_y|0#2DD%r|8wl^9B@dUXhg z8j?6Mc%z|Gtb_@?+Ncn(r-52(xS&-RR91 zqv(LQFfU9Y#U8~YlS0g%KK;YQi7|vu`#TI5nE*&B|dvvRD^|6?vWju+!{LEQW(J$ zGg>w^`(#!)6C)()_L`xXVc0ZPX_PFhtji(w5t=OvIF;DGIG;$CSAZDM0+oB8m9hFl77P#5t;=gV zFt*}^vJ#4Ja9qgI;iMOIX}PIGo3SNJIqC>FPcwgui7(QCN#I$G6MnW4gPWoxlu zBg(~bDc!QD_ldRsqpi%biQmYZ0sPhE?I@|0--434)?VgX@g8o`g#IYq%8hQ|)laN4 zSFVRg=?WW+Xjf}%^Rm-h?jYvr(mmdcf7U$*COTy^euqw`?#&LQ(oJo#?4xg5QY`j_ z9$V5OlGNE+-JS*)yTV;Gd0EcHZRm0vU=ITatIBTIqC(A}lu=F7z1?yMrdCIw)bd6& zHr4XcWf@{Sy}Ybk>?KS47hO?-vYRRy>LFE7CXx3D~5=t=<_Rz#eq=fm0LvK zy%_uyZ>|)nnx5?Jlf02%8!=U!%ej2*aua>id2ydA42414qXxEedWqfk&eC~uUH$>X z-AnZ=iiW~M8&=LO+N8&b=bY3n{f*Mq65;7j2>}W;TDc`ZJFHAG@;;1rKi~oSWJQ@U z8B?}0N4!QCtSlCfQRB+5va!MP0N^nO_BD5*{3tD2WzXD?;@yDAv2I&cl=}&e%8ZfX z^<@5+$aitQgnTme)~e;A6E@mZhW|z>>`_-^ux`!d6pHghq}eKWSG39J7UQbrx!jKr zM46H_j1gvx3ii}Ms+S^=3N~X*R?iXi>Y9LvqDgCu#G#OX?cHLGxeb{7AcJ9Th7AeG zL8zGm=;;`7zU>7F9Ub)7&a8Bf=hT_TnrGVJ;X}RZ>ISzuR457`YG85Xp=Oy2cn~u@ z1o#sGrz;-@DC0j1rAGjMKr5if(~YnW3Gy#!rkrwkZBL`_8Nflhew~v~e1BsGy|^xm z^UkCE{9-75eUhj;gcdI{oUjAf6Vnc;5tMs=9Cc3s)PQOxHD?#2;3SuBNEQ{eZbO0i z+yAk9{^1&X_QdF(T=PqGxFt9A=C<;)bI)D2BfXnqXX~S1!qzAQav7KSdO7Lj>`c}c zp~Z~=WkgTsr8B7CwM85=RbSg%yux03y~ykK&~iJv-vr@ys1l`a9DTxpv9lfu2WBMI z@J*?!KG3kh=MD1ZW|+(TR+G0dp6bBQ?9|#RcGJ{f=FSI?g`AN|umdtE*P^x|>|ere zZ1dJw`eIXo@Y2Rxi^c1q%FQvNsoQ8&a{#IAz|TG;z7=>G;?x4vNx-z57h>Uf2|g`@ z{zyd}%KV*jg%O&2Tj-33;JjmS#35v=DB@Mj5#IS*Xuy`-F<5N1Gu;$lF$RW8UG1Gd zCJ0-BAblZoHt&p{6yDSXGb9ghs#=>`R(NOAcv>S_MN8wlFMnj@ei!STqN#IdnRqKS zb?g0N-Yby)bwJpGsvAcGb#F1StqAX>n*Pi&gWB$T^gcoj+ls|LTEFc)@y`E~c1FnF zrI9%-+38d=ogBAC)4?4M@v2hGcYmf@I(uRSZ$;0#*!zqD<`2=d{+6myuW|Q?nBixK z*}q6H$$M*w_*g0BdvyP;pU>{rNeRDREoU&E__3`g?X z*}~dw(Bt{ar58tmX+j?Pnd;cm)8?OS1K&7xGk>*)j{XH2D&J%=BD9ib?ko|RbmdN` z_<-)*c@AB-wUSbHWky-FU=%rb^`h^$I;e5imDab>@i2Y7YeZewFP*2=-y_`g*QoCG zyG!_5`oZ1jh!0P=cop4Mo8et|C?{WFDwtQ(=Dp0X$-uF^Q*te&#IsC(AnZJGX~QW3 zRs2U*=y*%)+4=i_Z#sUGdmq43hrJ4HaQHGcB5T~nFB+`$tlu75bgfUN~^l;zX7?!{BLIuUDohwueH+ZwxQ`^0;)aS#jPXv>ui(!Guz-D*ro{=n%?}MvwXQ5 zGM@IrBZ#-AQ)ho(P!-;@uKP6y4zCTZ!V!|`tWkH$b|HV?V5OUv6w#Nv_i)^}eNS=f zozM*a^3TV7RW0Nv^w&M_i<{}doki6uUHX)J8J79ppb;08E?;22U$#WJ`r(j3C7m(+ zBL!ke&b`7JMr2-5ve&^s^^f0MB%Y;(dvnFtw07^|!78QrnkV6qYbm5wIYdN1)HyJV znMmNiH!g<%`k-a%*b82v{lCxbJci!0q#cc*;^rR^%|%J&eym=a917?Kr~sS?m<(tK ztOZ;L*bLYXxDRj;@C1Vq&A*nkE;FzG4x#pAz*m}~YomrB7}~)ha;u?Wx>? zM>c-fMBGhp@5>Wz`flHH edw*MoEsMW)TbV7vmS)Se<=b>yVoHWB@qYolMq$SQ diff --git a/tmw.py b/tmw.py index b620786..520c467 100644 --- a/tmw.py +++ b/tmw.py @@ -236,7 +236,7 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs # Binning # ################################# -def segments_to_bins(inpath, outfile, binsnb): +def segments_to_bins(inpath, outfolder, binsnb): """Script for sorting text segments into bins.""" print("\nLaunched segments_to_bins.") @@ -326,6 +326,9 @@ def segments_to_bins(inpath, outfile, binsnb): files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1) print("chunks per bin: ", bcount) + if not os.path.exists(outfolder): + os.makedirs(outfolder) + outfile = outfolder+"segs-and-bins.csv" with open(outfile, "w") as outfile: files_and_bins.to_csv(outfile, index=False) @@ -498,7 +501,7 @@ def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_projec # call_mallet_modeling # ################################# -def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_interval,num_iterations,num_top_words,doc_topics_max): +def call_mallet_modeling(mallet_path, inputfile,outfolder,numOfTopics,optimize_interval,num_iterations,num_top_words,doc_topics_max): """Function to perform topic modeling with Mallet.""" print("\nLaunched call_mallet_modeling.") @@ -515,7 +518,7 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in output_topic_state = outfolder + "topic_state.gz" ### Constructing Mallet command from parameters. - command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ num_topics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state + command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ numOfTopics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state #print(command) subprocess.call(command, shell=True) print("Done.\n") @@ -545,7 +548,7 @@ def get_metadata(metadatafile): #print("metadata\n", metadata) return metadata -def get_topicscores(topics_in_texts, number_of_topics): +def get_topicscores(topics_in_texts, numOfTopics): """Create a matrix of segments x topics, with topic score values, from Mallet output.""" print("- getting topicscores...") ## Load Mallet output (strange format) @@ -565,7 +568,7 @@ def get_topicscores(topics_in_texts, number_of_topics): scores = [] ## For each segment, get the topic number and its score i +=1 - for j in range(1,number_of_topics,2): + for j in range(1,numOfTopics,2): k = j+1 topic = topicsintexts.iloc[i,j] score = topicsintexts.iloc[i,k] @@ -603,13 +606,13 @@ def get_docmatrix(corpuspath): return docmatrix def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, - number_of_topics): + numOfTopics): """Merges the three dataframes into one mastermatrix.""" print("- getting data...") ## Get all necessary data. metadata = get_metadata(metadatafile) docmatrix = get_docmatrix(corpuspath) - topicscores = get_topicscores(topics_in_texts, number_of_topics) + topicscores = get_topicscores(topics_in_texts, numOfTopics) ## For inspection only. #print("Metadata\n", metadata.head()) #print("Docmatrix\n", docmatrix.head()) @@ -630,21 +633,21 @@ def add_binData(mastermatrix, binDataFile): print("- adding bin data...") ## Read the information about bins binData = pd.read_csv(binDataFile, sep=",") - print(binData) + #print(binData) ## Merge existing mastermatrix and binData. mastermatrix = pd.merge(mastermatrix, binData, how="inner", on="segmentID") #print(mastermatrix) return mastermatrix def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, - topics_in_texts, number_of_topics, useBins, binDataFile): + topics_in_texts, numOfTopics, useBins, binDataFile): """Builds the mastermatrix uniting all information about texts and topic scores.""" print("\nLaunched create_mastermatrix.") print("(Warning: This is very memory-intensive and may take a while.)") if not os.path.exists(outfolder): os.makedirs(outfolder) mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, - mastermatrixfile, number_of_topics) + mastermatrixfile, numOfTopics) if useBins == True: mastermatrix = add_binData(mastermatrix, binDataFile) mastermatrix.to_csv(outfolder+mastermatrixfile, sep=",", encoding="utf-8") @@ -788,11 +791,11 @@ def get_color_scale(word, font_size, position, orientation, random_state=None): return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background def make_wordle_from_mallet(word_weights_file, - topics,words,outfolder, + numOfTopics,words,outfolder, font_path, dpi): """Generate wordles from Mallet output, using the wordcloud module.""" print("\nLaunched make_wordle_from_mallet.") - for topic in range(0,topics): + for topic in range(0,numOfTopics): ## Gets the text for one topic. text = get_wordlewords(words, word_weights_file, topic) wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) @@ -897,7 +900,7 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item, plt.savefig(figure_filename, dpi=dpi) plt.close() -def plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, +def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder): """For each item in a category, plot the top n topics as a barchart.""" @@ -968,7 +971,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, def plot_topItems(averageDatasets, outfolder, firstWordsFile, - numberOfTopics, + numOfTopics, targetCategories, topItemsShown, fontscale, @@ -980,7 +983,7 @@ def plot_topItems(averageDatasets, for targetCategory in targetCategories: if targetCategory in average: print(" Plotting for: "+targetCategory) - topics = list(range(0,numberOfTopics)) + topics = list(range(0,numOfTopics)) for topic in topics: firstWords = get_topItems_firstWords(firstWordsFile, topic) @@ -1018,7 +1021,7 @@ def get_heatmap_firstWords(firstWordsFile): return(firstWords) def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, - numberOfTopics): + numOfTopics): """From average topic score data, select data to be plotted.""" #print(" Getting dataToPlot.") with open(average, "r") as infile: @@ -1076,7 +1079,7 @@ def plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, - numberOfTopics, + numOfTopics, topTopicsShown, fontscale, dpi): @@ -1089,7 +1092,7 @@ def plot_distinctiveness_heatmap(averageDatasets, dataToPlot = get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, - numberOfTopics) + numOfTopics) create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, @@ -1176,14 +1179,14 @@ def create_overTime_areaplot(dataToPlot, outfolder, fontscale, topics, dpi): plt.close() def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, - numberOfTopics, fontscale, dpi, height, + numOfTopics, fontscale, dpi, height, mode, topics): """Function to plot development of topics over time using lineplots or areaplots.""" print("Launched plot_topicsOverTime.") if mode == "line": for average in glob.glob(averageDatasets): if "decade" in average: - entriesShown = numberOfTopics + entriesShown = numOfTopics dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics) create_overTime_lineplot(dataToPlot, outfolder, fontscale, @@ -1191,7 +1194,7 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, elif mode == "area": for average in glob.glob(averageDatasets): if "decade" in average: - entriesShown = numberOfTopics + entriesShown = numOfTopics dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics) create_overTime_areaplot(dataToPlot, outfolder, fontscale, @@ -1450,12 +1453,12 @@ def create_allSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, def simpleProgression(averageDataset, firstWordsFile, outfolder, - numberOfTopics, + numOfTopics, fontscale, dpi, height, mode, topics): """Function to plot topic development over textual progression.""" print("Launched textualProgression.") if mode == "selected" or mode == "sel": - entriesShown = numberOfTopics + entriesShown = numOfTopics dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, firstWordsFile, entriesShown, @@ -1464,8 +1467,8 @@ def simpleProgression(averageDataset, firstWordsFile, outfolder, fontscale, topics, dpi, height) elif mode == "all": - entriesShown = numberOfTopics - topics = list(range(0, numberOfTopics)) + entriesShown = numOfTopics + topics = list(range(0, numOfTopics)) for topic in topics: topic = str(topic) dataToPlot = get_allSimpleProgression_dataToPlot(averageDataset, @@ -1584,7 +1587,7 @@ def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale, def complexProgression(averageDataset, firstWordsFile, outfolder, - numberOfTopics, + numOfTopics, fontscale, dpi, height, mode, topics): """Function to plot topic development over textual progression.""" print("Launched textualProgression.") @@ -1598,8 +1601,8 @@ def complexProgression(averageDataset, firstWordsFile, outfolder, fontscale, topics, dpi, height) elif mode == "all": - entriesShown = numberOfTopics - topics = list(range(0, numberOfTopics)) + entriesShown = numOfTopics + topics = list(range(0, numOfTopics)) for topic in topics: topic = str(topic) dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, diff --git a/tmw_config.py b/tmw_config.py index 1ed87c0..dea8274 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -32,7 +32,7 @@ ### Path to the working directory. wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash. ### Path to the TreeTagger file (language-dependent!) -tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-spanish" +tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french" ### Path to Mallet installation directory mallet_path = "/home/christof/Programs/Mallet/bin/mallet" @@ -61,15 +61,15 @@ ### segments_to_bins inpath = wdir + "2_segs/*.txt" -outfile = wdir + "7_aggregates/segs-and-bins.csv" +outfolder = wdir + "7_aggregates/" binsnb = 5 # number of bins -#tmw.segments_to_bins(inpath,outfile, binsnb) +#tmw.segments_to_bins(inpath,outfolder, binsnb) ### pretokenize ### Perform some preliminary tokenization. inpath = wdir + "2_segs/*.txt" outfolder = wdir + "3_tokens/" -substitutionsFile = "./extras/fr_pretokenize_subs.csv" +substitutionsFile = wdir+"extras/fr_pretokenize_subs.csv" #tmw.pretokenize(inpath, substitutionsFile, outfolder) ### call_treetagger @@ -112,13 +112,13 @@ mallet_path = mallet_path inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" -num_topics = "50" # string +numOfTopics = "50" # string optimize_interval = "100" # string num_iterations = "1000" # string num_top_words = "100" # string -doc_topics_max = num_topics +doc_topics_max = numOfTopics num_threads = "4" # string -#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) +#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, numOfTopics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -134,10 +134,10 @@ mastermatrixfile = "mastermatrix.csv" metadatafile = wdir+"/metadata.csv" topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" -number_of_topics = num_topics +numOfTopics = int(numOfTopics) useBins = True # True|False binDataFile = wdir+"7_aggregates/segs-and-bins.csv" -#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics, useBins, binDataFile) +#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. @@ -171,15 +171,15 @@ ### make_wordle_from_mallet ### Creates a wordle for each topic. word_weights_file = wdir+"6_mallet/" + "word-weights.txt" -topics = num_topics +numOfTopics = numOfTopics words = 40 outfolder = wdir+"8_visuals/wordles/" font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" dpi = 300 -#tmw.make_wordle_from_mallet(word_weights_file,topics,words,outfolder,font_path,dpi) +#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) ### crop_images -### Crops the wordle image files, use if needed. +### Optional. Crops the wordle image files. inpath = wdir + "8_visuals/wordles/*.png" outfolder = wdir + "8_visuals/wordles/" left = 225 # image start at the left @@ -194,25 +194,25 @@ firstWordsFile = wdir+"7_aggregates/firstWords.csv" targetCategories = ["author", "subgenre", "binID"] topTopicsShown = 30 -numberOfTopics = num_topics +numOfTopics = numOfTopics fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 outfolder = wdir+"/8_visuals/topTopics/" -#tmw.plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder) +#tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder) ### plot_topItems ### ### For each topic, creates a barchart with top items from a category. averageDatasets = wdir+"7_aggregates/avg*.csv" outfolder = wdir+"8_visuals/topItems/" firstWordsFile = wdir+"7_aggregates/firstWords.csv" -numberOfTopics = num_topics # must be actual number of topics modeled. +numOfTopics = numOfTopics # must be actual number of topics modeled. targetCategories = ["author", "subgenre", "binID"] topItemsShown = 30 fontscale = 0.8 height = 0 # 0=automatic and flexible dpi = 300 -#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numberOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) +#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) @@ -227,30 +227,30 @@ outfolder = wdir+"8_visuals/distinctiveness/" targetCategories = ["author", "subgenre", "binID"] # one or several: "author-name", "decade", "subgenre", "gender", "idno", "title" -numberOfTopics = num_topics # must be actual number of topics modeled. +numOfTopics = numOfTopics # must be actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 -#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) +#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi) ### plot_topicsOverTime ### ### averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/overTime/" -numberOfTopics = num_topics # must be actual number of topics modeled. +numOfTopics = numOfTopics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic mode = "line" # area|line for areaplot or lineplot topics = ["25", "44"] # list of one or several topics -#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) ### topicClustering ### # This function will create a dendrogram grouping topics based on their word weight similarity. wordWeightsFile = wdir+"6_mallet/"+"word-weights.txt" outfolder = wdir+"8_visuals/clustering/" -topicsToUse = num_topics # = all topics modeled +topicsToUse = numOfTopics # should be all topics. wordsPerTopic = 50 methods=["weighted"] # list metrics=["cosine"] # list @@ -258,29 +258,28 @@ ### itemClustering ### # This function creates a dendrogram of items in a category (authors, titles). -averageDatasets = wdir+"7_aggregates/avg*title.csv" +averageDatasets = wdir+"7_aggregates/avg*author.csv" figsize = (10,80) # width,height outfolder = wdir+"8_visuals/clustering/" -topicsPerItem = num_topics +topicsPerItem = 40 # can be set sortingCriterium = "std" # std|mean -targetCategories = ["title"] # list +targetCategories = ["author"] # list methods=["weighted"] # list metrics=["cosine"] # list #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) - ### simpleProgression ### ### Creates a lineplot of topic development over textual progression. averageDataset = wdir+"7_aggregates/avgtopicscores_by-binID.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/progression/simple/" -numberOfTopics = num_topics # must be actual number of topics modeled. +numOfTopics = numOfTopics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic mode = "sel" # all|sel topics = ["25", "44", "12"] # if mode="sel": list of topics -#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) @@ -296,13 +295,13 @@ averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/progression/complex/" -numberOfTopics = num_topics # must be actual number of topics modeled. +numOfTopics = numOfTopics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic mode = "all" # all|sel topics = ["25", "44", "12"] # if mode="sel": list of topics -tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) From 443b7ad58a57685dc60474b1be7f5ebc8c203ea9 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 12:01:11 +0200 Subject: [PATCH 43/56] Added wordle font_path to once-for-all variables set initially. --- tmw_config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tmw_config.py b/tmw_config.py index dea8274..2c6ce7b 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -35,6 +35,8 @@ tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french" ### Path to Mallet installation directory mallet_path = "/home/christof/Programs/Mallet/bin/mallet" +### Path to the font for wordle generation +font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" ################################ @@ -174,7 +176,7 @@ numOfTopics = numOfTopics words = 40 outfolder = wdir+"8_visuals/wordles/" -font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" +font_path = font_path dpi = 300 #tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) From b812b671ae3c67a71560c46f94cb3ff71a44b97c Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 16:20:15 +0200 Subject: [PATCH 44/56] First alpha version of complex topic progression --- __pycache__/tmw.cpython-34.pyc | Bin 44424 -> 45250 bytes tmw.py | 98 +++++++++++++++++++++++---------- tmw_config.py | 14 +++-- 3 files changed, 76 insertions(+), 36 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index cd15355dc422cef5d1e5baab998eea1fc4a3f51e..542f1e316d8e1415d6620d15dd64ad309c0b2a72 100644 GIT binary patch delta 2001 zcmZWpZ)jUp6hHUA{C#~%)23;fcKu^lg^Cm`PK2RztV0KGOgo+XFlUw*OVTuHZ=Mq5 zg;3*QAN-(Rkj)LIo11Q^a}0)p_(4DTK^O%=B%&ZH3Zibp_HD!8xk)-Wv^bSF!EwJwpSVpj; z-ebJO3(nfKAjyrA8-Z(4ce)~n{GQuIzC-hS;r+>H0p_P|z2%*k5(3GoO$?p^m=5ci zW5YcI;Hn(cF}?~+--EHy$eD zy^Y-r>&D5CQxVr2Ge_HGgxjEipl~2a4|W=Yq+s3u#n3=+;Jy%z=>QydsEE6avn{c* zLVG=%iapdM&|-{=aNNsw>Iw1^ zfmb1sa=$c>ZRORmvu3d?x!H*u^ago!FiF3v$KM8Lu*4UmgJ zSgCbZirm55SZz~Y1CAj|lRHFihTLIzkj6~f1U1AS-4V*T8#rDzI|{R?BkCq;(IPb} zIeMHO>kJa-T}FcpQ*nff8_1Vj%<2}&oY4+xG!nGC-6M4*0_)?H=F+U+ZQ7lnwBLse z@1k_Q-$Vp22x5{avYYX^DT`}ikf!uUC$o?4Qs1Ff)f&Jab)3;-3@-8LTxRTJ#*Q!u zFnN@*w*gwgQubiJr0+YmFTpd|617&ae6Ucis!e+J)Wn$Th1Sb~ndS0=+6;7B-ZD5n zq0gU66&~VqRyFS_!Dc5J{K;OnGkAo_&lx<=q=|DclSu}n%v)kC&tNNqWd~pQ5pM^~F{TO-i#STn051q^eas!Y1|7uP7ds(L%@A7eFmE7q18k^ox+W~%G1v*P zh*{;Si&E`5J~_942d^65KaLRqhcTM%dc-tiwFQk+tQk0knYLeq1ZmrKgoIu=T@1I2 zT+`p5p4Y*n!;`#5P4?3<^)2Ij7)&5(^%;Yj&`W1_ZhMNW&AmfW8?_7JvG7DYUrWRc zH`V#FjHOY)kQk7*P*?RoXV&^&VLN!Suuy%Wu3cPvKPJR={nH1piIaNqY)0JBhtB>6 zxOi^6IIl09ONpBozdmx}$ar2yBrCRU`m9o0d3O5+s V)SrI%Rcy$4z?pHR<0J;0*ninglSBXj delta 641 zcmYjOO=uHQ5T1Gamuz?AnzXSb81N8la`2{j5EZO=s6Ui8)PuR$wBm}Uyrke^_YiVX zsHcI(;Gy&&8XL6|@G5xr4?G!9LP~|!BBTe8&RY?D%Y1L%`@WfFzW46Cxc^PqdyZ?L zD*r6aCSKc{YiS++E{ivp<2*;rr$!@1zb38H6ofj|1^62HIs`gYG^j8dP%)v2A~c{5 z5Ez^T^5!#Pse^AqXhxjU0nV|0`WnMbAx4RZRabuCKV6X}#)3u1A7>JX0DbeOtS&Fr z_^LrUyYEKFg~|+lIypuk*NaJq3qZ3mi8-NA4tl@ooG|29E~hiAEg}1KX)RAvPjez; z(nRZ6a*L_?xGLnO8d!98C0z(Cs0g?%KwgGALKs(*20RWz5QiWJO&vlT>Kb)h<8zCk z^jf6vK%hl>Hisu>XEbOEHE0-vd#6?HWO)L@5vZFGj6?+I)_>jkGU`t4XFjbzN>RFV z(d#GfF5kJaAg?aYR+nzg*Qngd;DGY&Y5LMhB~F<9I7KytZlgo1?IWgV*eJKD@ci?^ z>x$(7U|T$tJ3V{DMck$F?n*XLq^C%BEsiE0P@g!8a!7BxTR5aAFD@K4m#x2w{jf_()=YIIX%Z*I7wPS(jg!g0J&$NdB5 Cx|db} diff --git a/tmw.py b/tmw.py index 520c467..475bbb0 100644 --- a/tmw.py +++ b/tmw.py @@ -1540,26 +1540,31 @@ def create_selComplexProgression_lineplot(dataToPlot, outfolder, fontscale, plt.close() def get_allComplexProgression_dataToPlot(averageDataset, firstWordsFile, - entriesShown, topic): + entriesShown, topic, targetCategories): """Function to build a dataframe with all data necessary for plotting.""" print("- getting data to plot...") with open(averageDataset, "r") as infile: allScores = pd.DataFrame.from_csv(infile, sep=",", index_col=None) - #allScores = allScores.T - print(allScores) - groupedScores = allScores.groupby("binID").groups - print(groupedScores) + #print(allScores) ## Select the data for current topics - #someScores = allScores.loc[topic,:] - #someScores.index = someScores.index.astype(np.int64) - #dataToPlot = someScores + target1 = targetCategories[0] + target2 = targetCategories[1] + target1data = allScores.loc[:,target1] + target2data = allScores.loc[:,target2] + topicScores = allScores.loc[:,topic] + #print(target1data) + #print(target2data) + #print(topicScores) + dataToPlot = pd.concat([target1data, target2data], axis=1) + dataToPlot = pd.concat([dataToPlot, topicScores], axis=1) #print(dataToPlot) - #return dataToPlot + return dataToPlot # TODO: Make sure this is only read once and then select when plotting. - - -def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale, + + +def create_allComplexProgression_lineplot(dataToPlot, targetCategories, + outfolder, fontscale, firstWordsFile, topic, dpi, height): """This function does the actual plotting and saving to disk.""" print("- creating the plot for topic " + topic) @@ -1567,11 +1572,35 @@ def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale, firstWords = get_progression_firstWords(firstWordsFile) topicFirstWords = firstWords.iloc[int(topic),0] #print(topicFirstWords) - ## Plot the selected data - dataToPlot.plot(kind="line", lw=3, marker="o") + ## Split plotting data into parts (for target1) + target1data = dataToPlot.iloc[:,0] + #print(target1data) + numPartialData = len(set(target1data)) + ## Initialize plot for several lines + completeData = [] + #print(dataToPlot) + for target in set(target1data): + #print(" - plotting "+target) + partialData = dataToPlot.groupby(targetCategories[0]) + partialData = partialData.get_group(target) + partialData.rename(columns={topic:target}, inplace=True) + partialData = partialData.iloc[:,2:3] + completeData.append(partialData) + #print(completeData) + ## Plot the selected data, one after the other + plt.figure() + plt.figure(figsize=(15,10)) + for i in range(0, numPartialData): + print(completeData[i]) + label = completeData[i].columns.values.tolist() + label = str(label[0]) + plt.plot(completeData[i], lw=4, marker="o", label=label) + plt.legend() plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20) plt.ylabel("Topic scores (absolut)", fontsize=16) plt.xlabel("Textabschnitte", fontsize=16) + plt.legend() + plt.locator_params(axis = 'x', nbins = 5) plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) if height != 0: plt.ylim((0.000,height)) @@ -1581,25 +1610,32 @@ def create_allComplexProgression_lineplot(dataToPlot, outfolder, fontscale, os.makedirs(outfolder) ## Format the topic information for display topicsLabel = str(topic) - figure_filename = outfolder+"all_"+topicsLabel+".png" + figure_filename = outfolder+"all_"+str(targetCategories[0])+"-"+topicsLabel+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() -def complexProgression(averageDataset, firstWordsFile, outfolder, - numOfTopics, - fontscale, dpi, height, mode, topics): +def complexProgression(averageDataset, + firstWordsFile, + outfolder, + numOfTopics, + targetCategories, + fontscale, + dpi, height, + mode, topics): """Function to plot topic development over textual progression.""" - print("Launched textualProgression.") + print("Launched complexProgression.") if mode == "sel": - entriesShown = numberOfTopics + entriesShown = numOfTopics dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, - firstWordsFile, - entriesShown, - topics) - create_selSimpleProgression_lineplot(dataToPlot, outfolder, - fontscale, topics, - dpi, height) + firstWordsFile, + entriesShown, + topics) + create_selSimpleProgression_lineplot(dataToPlot, + outfolder, + fontscale, + topics, + dpi, height) elif mode == "all": entriesShown = numOfTopics topics = list(range(0, numOfTopics)) @@ -1608,10 +1644,12 @@ def complexProgression(averageDataset, firstWordsFile, outfolder, dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, firstWordsFile, entriesShown, - topic) - #create_allComplexProgression_lineplot(dataToPlot, outfolder, - # fontscale, firstWordsFile, - # topic, dpi, height) + topic, + targetCategories) + create_allComplexProgression_lineplot(dataToPlot, targetCategories, + outfolder, + fontscale, firstWordsFile, + topic, dpi, height) else: print("Please select a valid value for 'mode'.") print("Done.") diff --git a/tmw_config.py b/tmw_config.py index 2c6ce7b..272ee90 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -154,8 +154,8 @@ ### Based on the mastermatrix, calculates average topic scores for two target categories at once. mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" -targets = ["subgenre", "binID"] # 2 targets to combine -#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) +targets = ["decade", "binID"] # 2 targets to combine +tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) ### save_firstWords ### Saves the first words of each topic to a separate file. @@ -294,16 +294,18 @@ ### complexProgression ### ### Creates a lineplot of topic development over textual progression, ### but does so separatedly for different target categories. -averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" +averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-decade+binID.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/progression/complex/" -numOfTopics = numOfTopics # must be actual number of topics modeled. +numOfTopics = 3 # for testing. +#numOfTopics = numOfTopics # must be actual number of topics modeled. +targetCategories = ["decade","binID"] # two values, corresponding to averageDataset fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic -mode = "all" # all|sel +mode = "all" # all|sel ### only all is implemented ## topics = ["25", "44", "12"] # if mode="sel": list of topics -#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) +tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics) From bc27d043f15f4eaaf39d989ddfc40928e50eaafa Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 7 Sep 2015 17:10:44 +0200 Subject: [PATCH 45/56] Remove comments, add link to tutorial --- __pycache__/tmw.cpython-34.pyc | Bin 45250 -> 45234 bytes tmw.py | 2 +- tmw_config.py | 46 +++++++++++++-------------------- 3 files changed, 19 insertions(+), 29 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 542f1e316d8e1415d6620d15dd64ad309c0b2a72..e9407ad37a3d9472034fbe6bd950030a2fdab8ed 100644 GIT binary patch delta 79 zcmV-V0I>hU;R3SZ0txO64GIRN?Nynv2_&Kcg0n871OXLl0CWrhWC8$u3jl)x07P9` l0DTVtlU1YP0S}Wmq$dF}vtOhS0|7#lZ>J&w0<)B-E(r>&7@YtB delta 95 zcmV-l0HFV};R3?p0txO64GM^y?Nyku2_&Kckh3nL1OX#>0CWrhWC8$u3jl)x07PA9 z0DTVtbN~Q+3;=u&02zY;00EOnqu~J^lPRPp0Y9@*qz?lDM3Z5sA_D;q0kedsE(wf4 B8(9DV diff --git a/tmw.py b/tmw.py index 475bbb0..30bb384 100644 --- a/tmw.py +++ b/tmw.py @@ -1591,7 +1591,7 @@ def create_allComplexProgression_lineplot(dataToPlot, targetCategories, plt.figure() plt.figure(figsize=(15,10)) for i in range(0, numPartialData): - print(completeData[i]) + #print(completeData[i]) label = completeData[i].columns.values.tolist() label = str(label[0]) plt.plot(completeData[i], lw=4, marker="o", label=label) diff --git a/tmw_config.py b/tmw_config.py index 272ee90..d7f5224 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -20,8 +20,9 @@ # 5. Advanced Visualizations # 6. Other / Obsolete / in development -import tmw -#print(help(topmod)) +# You may find a tutorial explaining the purpose of each function +# as well as its input, output and other parameters at: +# https://www.penflip.com/c.schoech/tmw-tutorial ################################ @@ -38,6 +39,10 @@ ### Path to the font for wordle generation font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" +import tmw +#print(help(topmod)) + + ################################ ### PREPROCESSING TEXTS ### @@ -51,9 +56,6 @@ ### segmenter ### Split entire texts into smaller segments. -### target: The desired length of each text segment in words. -### sizetolerancefactor: 1=exact target; >1 = some tolerance, e.g. 1.1= +/-10%. -### preserveparagraphs: True|False, whether \n from input are kept in output. inpath = wdir + "1_txt/*.txt" outfolder = wdir + "2_segs/" target = 2000 @@ -62,6 +64,7 @@ #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) ### segments_to_bins +### Assign each segment to one bin over textual progression. inpath = wdir + "2_segs/*.txt" outfolder = wdir + "7_aggregates/" binsnb = 5 # number of bins @@ -106,11 +109,6 @@ ### call_mallet_model ### Performs the actual topic modeling. -### num_topics: Number of different topics the model should find. -### optimize_interval: interval between hypermarameter optimization. -### num_iterations: How many times the model is improved. -### num_top_words: Number of words to save and display for each topic. -### num_threads: Number of parallel processing threads to use. mallet_path = mallet_path inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" @@ -228,8 +226,7 @@ firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/distinctiveness/" targetCategories = ["author", "subgenre", "binID"] -# one or several: "author-name", "decade", "subgenre", "gender", "idno", "title" -numOfTopics = numOfTopics # must be actual number of topics modeled. +numOfTopics = numOfTopics # actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 @@ -240,7 +237,7 @@ averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/overTime/" -numOfTopics = numOfTopics # must be actual number of topics modeled. +numOfTopics = numOfTopics # actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic @@ -278,37 +275,30 @@ numOfTopics = numOfTopics # must be actual number of topics modeled. fontscale = 1.0 dpi = 300 -height = 0 # for lineplot; 0=automatic +height = 0 # 0=automatic mode = "sel" # all|sel topics = ["25", "44", "12"] # if mode="sel": list of topics #tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) - - - -################################ -### OTHER / OBSOLETE / DEV ### -################################ - - ### complexProgression ### ### Creates a lineplot of topic development over textual progression, ### but does so separatedly for different target categories. -averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-decade+binID.csv" +averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/progression/complex/" -numOfTopics = 3 # for testing. -#numOfTopics = numOfTopics # must be actual number of topics modeled. -targetCategories = ["decade","binID"] # two values, corresponding to averageDataset +numOfTopics = numOfTopics # must be actual number of topics modeled. +targetCategories = ["subgenre","binID"] # two values, corresponding to averageDataset fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic -mode = "all" # all|sel ### only all is implemented ## -topics = ["25", "44", "12"] # if mode="sel": list of topics +mode = "all" # all|sel ### only "all" is implemented ## tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics) +################################ +### OTHER / OBSOLETE / DEV ### +################################ ### 5c show segment ## To read a specific segment, better than looking in the folder. From 903edcb4c9e30ad0013d8403e32e64ba8868a9bf Mon Sep 17 00:00:00 2001 From: christofs Date: Tue, 8 Sep 2015 14:34:50 +0200 Subject: [PATCH 46/56] make_wordle: Simple default for colors --- __pycache__/tmw.cpython-34.pyc | Bin 45234 -> 45195 bytes tmw.py | 3 ++- tmw_config.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index e9407ad37a3d9472034fbe6bd950030a2fdab8ed..1df52ea958c10136a77ff1df64dc927dc84dc080 100644 GIT binary patch delta 13359 zcmc&)dw85xnV&N=naSkdHffqBZF6bbPSX4RPD@L%C4`owx6hVwGWn8BJGY*1LfW)t zxGYkj%VRkftRO79ifEUj<7IIb(G?dE)YS*w{R9M`RkyB>$m4=854-I9d(ULPNt$Tg zKUR49%bD|?%X{AQdoSmF-{)U<-~GBf@SC#ok}F^O&F+8sT;Q*L!ch17<`6#R`vKt@ z|N2EnVBfVJJHyg4rCpT5U!%lI%V;W;4)Ktsig8$^@3t2pCpp(e6eRR%u1ZVD9#iwVB@=<&6Sm-n6=_AX2xO-nyj z1%0+ZX)R2J(*qLiY+qke#=~h@ML%S%dNnY0RUL};c<7HL2NP)$)EiIsn!1DEGW!$Z zxG5XatSJx8s=cIDw5weu2Y2&K-rSgIfvMyif}mHguW)wJe~c1YbHV`%O~z@v7EkJbfFrwf)kJK@Ud7P*V0l_N0Of!R0Pp4`l(2re7(nW1cgZCvE(O#O*uHQo zWhQ!MJN*dO$Z2@MOJA-YuCB|h!0$2AunO`s8_+{vG-mRa+P-Lle}Fy`dXg&vP^s;W zMH8l+iDD7!Y>cYxKyH6b`4ly>R=d=ynss7h)~Z=pB<8D+PrrH_Xl)m;SN;ctfthnC?`s;Um+F_F9II7Ds>W0Q3Xm zP!;t@l`q-XXPIej6wsVel+K`+QN*2$fF2_hNt3S_jEAT;(PQz`M2`_*E2TP&$YonZ zjUpc0fVrk*GG&zI=M7oX2h@_K^B)pESedNhHV(_YzNmI@)IA z;gsz&tw=a!+TOz^?O%;LKdWvb7-JM$gFV(ZYHWK^l5h`pku>{r78ewt01R`rsk!>%62#R3Zn9pEo(At@PPt@XQI;n_RxMsOv@T;)R@QiM#8aJ zu*@zdLrcb>N=QTW4EOib4~Kw*5DoTh__a-1+83-tJ?-&St^YXf`>3tEWA;Wdoc;Ff zjoy8{+3->Ch`>b;{k`ZWuswaUQ--Nzcsj4Ub~vzpFmX7M97=37y!2#`;n{Q5^@b0{ z&6^F+=B+ybRBc+FhP!hM`RoJ1O~Jfwf^K;OhV{ZH1chB}rIV>xl;U7cbI%03MX{RR z)}U23S*wiQqqAJzJ!j+C0ZI&8WYJ)0xroY=4ZSFq93oBe*1`w!{^=lqa1g9^ESw{h zT{$99CPe*`(qi@&sG3#Ejy6vUBf!&gl=f3BC!Z9S>uLDh1Gqw?h#%OVRMN7G*r^+( z33F(~P-P>egA$0`H`5;Hn?NxH;UcQE?6e+%D%hcm+o!j zip3J~Xk#{t#j290Fn=JLNDZd3d|9W0OS)R}y^LCrBSgfAMy!zbcOf)1)Wb`r?KCDO z2h-G5HweLZlm%4Qla}P&JbL0yJ6r=#hF(56BG6Rx3CSg+-*$H{^cnjbp zf#FNaD5WTJ2WqY+u*q&yv2eufmdJ0SZ@%~5dmQb!6n%?+2CMX@uilUL#c*f{Qbuu< zR>p#W@v;j~z0p`QBCki$GfZNhrIMlkdqu5p58CWd(^t-_K8?z9t!4^^lw(o`dhN=F zsR!^NKtqOjDYl1-a<0m(Tu^S5q)f_UIkPgZQNSYr ztWkc9pgUNim%(arxXZEmRaEqw#h0^fYl=jM=f~7RpPuv_#`-eg0synvMU>w5o56tmJgV*o{4MIM;$e$4t7l-I zLSgbFfG+_aN2{{JR)xNm9;(OJ&JjgSU)MM+K#nagr*2RTS&!eu`usy>>b`;Z(}2Ag zzsMS(AUv}EI@Es^kk^F7x^4fUWnPJt;p=z^$qHpaizyM}WZDdAjVS-d+2UK-`5T`T zb6_zAW&%BQ%I)amJ!Xa_+lhRis^2tId_%3>v?@5E?Ck1UPoT8CPknS#b1RlXw1U*O zUnlUE_H^v5S=Qz1^-UW`k+p5}wc`6~{>HlOnay7mtrN;9pTSHpWfJl5mLQ3EcFT&1 z)jL*1bo@5pp8(GSo&!7&m;yKqmZ_(O&W((Y`3zBiTTr zUw#)4zXvFAaVoX-diM`p*>7!K>&`Gc;G`s0+o>0YePFM|XGY$pcf^8WcjLTwL{M&{ zr*=tSlqT7WqmQA>I;Ag6xpJo&XZ1{EUgX1=VTa3=LF$s^Xuk`?df=qk=6UgJ(Cm7^ z-vD+K7-do)v_j#d(OimOhKCNDS!HFm_G1tccB1uGkCv>wmoQezX7+Ww4Ebq1+WfzKA^8_RulRP!-qB(}vuIc92op+_>MQ z(2w@8@C3MLpTldR3re6Sd@;YiFs^JF8_m9G zzrG(Sldrlq)10Re?`sfqL8$Zqyly{(zv$Q>Y!thRYFf(p2?y{)QK5x=~eG`eQyDK0S;?) zc0`MU`E{VecyuR1u6l?1L-*VwOlqjvyPHPs+|J$AHGA^2xGU&3Y?^zrTb5*x?XDI* z&NNyNaBDih`U#b`Q4DmJou5{I9Id=$Z`vF9Px_a*nQI$3KenhOm9#N{Ar0kn6V=fx# z^lJbte-JF|PNo*nKg#qQ`~LB(a&4o0P%YgbTy1#L-66WmOi^HiDFi2Jy&_WB-W23` zlZ&(lx=El5l*nOA{?4V;{<#OnGr#|&*4pZdWhK^H3@9g{TN=0qI%}J@Z@^?ev-Pnt zzKwpoT83sd)tD)yT4Br;E!l%co6rt~t*8V;RsmWFf)#cp*^VS4Ly;t?B|Ug}8iH5p zL}d#(nSPuG=mr4KngGoJM+Dv)7B_$gkXbLG39?OR)LQ|)z;|nno>Gkm>ct$j{=ipq z*29FM?nsEoqsk)~5HW*-6^nWv6%!grEewRF+nzS#R^C#2tp`EjGV1CgLzyPZ+~q~0 zM9>jp_~6Sr3DGGavzDn$C))AkVd_F2yNKMMDXE#}DXoD+KF{1m&N!icC5u`?PYV>( zgf*~!Ag_AqhU)2$Lcj_wpnIAKxa-K$s@$?os2A5w%hrc7A~Xr3Oa@FL=p6QvQX2=6 zcra5NY2DEX@s;`fAm*5g#( zqB6k@{G17>2Q&a00kZ(((Hv1mI$RHEJ@CFWgLp$nNY+ruvP+15Av{BTQN0+RE84Sf zh4&Q_y$I8D%$$o-8=#$l;>2u}*>dt+m#wG^0!C)eP3$n6^D!2VWn~L(!de)3_bSUr z=*_$Sg8E5h1x2>Yhz-(BbN|txpxHhl(Wipw19ZBfYs&(jzYQ9ZyC+gO5n(}=k_X|? zTxSGe1RY@0z1m-sJT9r}KLy1n0g$qjj`!a{1m8M)kp_Hg3*7_oy&biHq-k5Np1 z7C%9?<4}v3sZxiU#M z>(tlpdhSo9Ihx9W0**N6EaW%(N@)?~1P|n52?RKEi8XH=4x3@&ftJLX7(W?9KTw0=~-8Ldx}!nR(504LEeQ%Wzjl zArg+hJ)zHTY^zJqXDMKO%6>N4Aj0a| zu6F|KaiTbyk{fSRAHA_bDY<~{0Orp0x?IOUp3i6HJG{^3H_S?=Jo{GWA`%N*pX99N z9pN4`Cg08rnwQp|mb=N(l!%)l++f4`TFSCo#kTA@>m^Y=dWjM`IjwFTTv6?$=TNon z4bzENJ#=K&h?ajlIIj^}mRADkA6lnVqkS4wh`rTs*?GU&?QB5ZShwR+UprPS z9#StITOc}>XZX#~Wn5)&u%b(h+(qhs%=j{ZgAnewY%jU*-k{g6#2IWa=h6HU&rn^* z+txLqF#`wlNx9Tm5RQB+7cT?enIzVFRRVBw2Fx8zvVaY=dA`cn6Sr2^{T58rr-KrACH-G6 z$^{+z`m(RwdR%aX){f=@^ihBh12{d)J3o3GPnUbRn0*cp9|3U2*U7JVh-$nJ&MO>q z0t@v(+>5P?4`hCA{>4K5_^(J&QcFbRt_2_y7Piu&LkkqomC zPKQ`%-Kfv&nuWPb;e#}gya5mb@cHHgDDA-1d?69WPsb1cIXpBA&O4`O6wX~h$f^_f;tY1)}>zj#Z&ckt9J2JP{gxS?hNFPLx(V6oM6O1=Vy>S zE2o9MXk-Fd2$qiJbVO`|CK%W~RzMPm{VV9k)T*W?E>DRlcL#8!Ru9}&A`*Ymym305 zIK2ezm!|TcuByEjVwSO(xeGEl@uYe!UYC9O+=9!?9doXH!b8x631kuqW`C{|NV{V6 z?ThrRMGKCJ93n3#g`~yUEx0!L?U|{i&Mnt|)p^e>`tl@k&z)j=KB1IXV1g6WM{YuC zvzqjwD@L*&$}N|3ZduVyhn;Yb6&XlG)0gCz5(&`Q{PMCgp_^3knVo;BtWxGuA99LiLBFgR~`5I^}Gz>x&6Mn@t^$0t`6c%h8L5~#!$NA02nf6@6L=)=0IhQ>#e=0;Q@ zT`M30U0dX*4K|MJycS^!`uYa1s%e1N@hV^PZPOb3cJjF{{N&|vQuibLWS7D2Zwh7~ zo!QwH^bJ5bO|NU|>-ccoDo6n*bcgVS8T~3}irxp_)l2*#P3AZuHq#*pkRdQC5`%I2 zEov)jOa0H*r$^^s}}6sRA9)IN0;~<}|IAzATNV?LbQZ*;Az0O0N)4vz~%C~Bz@v?`tg~3VIbc^HGdW1qrK`)#^r0M`5EAsfZqcItwz6O zE7>jLZpL;!sN-c20SWCFm+nPZXpghrmg)02Rkgsa{q^TsLfK;Tk delta 13391 zcmc&*3w)H-m7hDAOfr)vB!K`4On?AGc!oC-1bHY3F+dPdWSC68BqQ_keiI-;9I>`^ zwZ-Zc1!)+N!P3KJ2ws?9W}BKBZl{U2B&;|MN}eo5aM@-QRD^ z@0T<8yU%maJ?DSUx!=dFZ#lpCmecdw;u5d>AAhs;_+y?=x`kGEyQdKSy89KPw-;PA zTYI)|+|Uw~rXh8H0>9d8CS^DgNCrDiU22t$h!HhnNmHx$3wBf&I2;aLv_BDOi?{bA zjr~dOVydErHDbEDr7-25L0zO*m3i)L8cW6QLL%QfePHRF`RxaLU9*z=lhQ*~ezz`& zo3j$ZWS2xg?e2)nXfP=&=s}KGZ+J#6@}W2pjRm22cPvSax}x!R!x~Uv8lACV)R2C3 zo1P^Ns=-??=BrKKYp&##T$wr1153%-MEou}2jx1DMv%eNpl&>o0R5S6A`UT9j1p9* zmLIB4*AxX@AtvZHzVPY z5stMR`*Y=jIp(KdE&q3?7^}Wo(UgL8dJC`iU2|aWj3oyp_R*WaIvz7-$s#KIM_Ft! zGvhG&5)gY|C9z0y&#Yv8j}gNjR-ttyNEt{u$OsTV(n^%Df4KyN#L?i8OHphBsUf1> z!9>D{wafYR;IEPs(BK1ErcPCkNnMQMAZ}O$*_Bxi9pQ+<=c?V|7(YOA5^|C&Kp;}> ziiBf^oPuIL%WROSv?p^uhWt8pG8a44qN*lwdD^U+l`j^k`^H|n*2|+=&}&&_Ht|=p z@JL}v!*!n7V_KX;KDsCrk3@`6G8~VYa)X*Zu4*icjAgV2J@TUAn9d8wlCoCq8aFL9 zP?*}Cj0ZyIUjKMoM%vvLj+wDGxt+$#9Uv?!K9CKV)0t?ef%}u;c2m1)bNKZ@K2RL0 zqVe1m@8~d%q-7bbbNR^>M(KPWq&?Ogfl!hFqdOX)+E|;(t+6)kVN<0#%*bUe!aAQP z*Ra`yj3;z)cHMv}wZ~c-#ZokwfncmXfc?m6*vNDU+B%;Gb)@E~=t)nmohha)Ks8$) z%-`?S&RKJ`OUCz^xreFtjep$D_|Tx{PPk+V)KTY=__SLZL4p$rW3c zbs-sAJOWXI8`831XD2<_3aG#u>{%CBHfdT|a2e|9j7M4WkJ7mhIJ(1=my6@+XC^Oq zZRf)VM!hFIeR=fT=fr$@9kN*-Jz^8A=6c zTwI#0y*Z+0kuuO`m4dJg%d)F3Acszv6qM`0kJSJu)cM??^Ad5gWd?O!6f^eq8>;Oe zNx4tcSSM-s!iIw)oqoLG7LmGxs#<&9>^{8y0!u7>?6yAJs(dH*=Fvq_(jOxk;4t7Q z)mhq13gOV@{TmIC23|{ODYLWbBIYk(8Qd|98=&3aojFf@CH?-)cbsd7WQ#fC*#;dH zvsEr1$NHXdEYY3B_N7nx=Wdyl9i>M@IRc~@;gA`y@GgLkTAi9(-J(atyOT83>JR{S zWUJcaq0BfvazHOL8Kn4zx02OG*#JH&)u!2%YFy)F@swK8c)-n}LWlZhvEV0B)u8_h_3PU$8tQlpZ!9!C9*!Su@ zD#|%3wQy#M_9hGpWjV6a>WPIl1B7&DVXNp%FIY4~tl5Zl-URs(2=*vnBGT&jTH9c3 zwsp4>9UcVXU0{#$JL-YOzS%!T5w=Ed-smJ`p%51nYY&|^LfY_vFif3WJYSqmPhXNR zQoKHv_Peb`pT}HhLHa;glKB+lb{c+<{57hc1^EdPT^8;fJ_qtH$S=^V zIJZ|Wwo;>dWa$*~C-uXn`**T>JcCic2l)fYe}FJ|E{i+f^vkF{N5r2e|C4@n5jpDB zgeYra0R@?d+?CXWO%uhR)8A`~3b(b6x_H^T6f3eVU>h9%gcbe_(vJD_%`S4n{rmT! z{*NG8Nk~?df^O5e1R=wJp%I+r@<5NeB1AfwGy;}HRKfDe;(U6>@@K^qSWJ$YKn~6F zN{n&!7(vN)BDbn>E5?iW)zTG<{KN8IftgrNP-*#Hb`H9D)s}>9|-@!bB5rHY3 zAbB7zkbDsKOo!2h)03R7NH^LCZKD8ng+%n!@^CvrNVp?R21!N`>y#cedO>n9P9;`v zc9x3tw^uK9rdU0IQWCpuwl;->U|8bJkzoY(evin@&Q`%2Ow?eQY@`ZD-hft z&cyhGoC=GqI0Iy05r$&P4dJz=10x_Z?@9(c-iADbw)G%ygS(4%loJHWLl>bvVt|_R z=iuT2kbeVVd1Zw5%;gK`Po*V8`P71c_;t}AGE1|PNV{+(9gb^-quuFf&vSgl=@_-e z?^3=ElNMHXnUQIYjdN%C7A~l-pW&--udny_Ax`g`M1;~3x1_i>9;NJ*e#w)O7#>i8 z4f6+>`^gRCU3qejNI$vZc|mtHpJ<-hpuOa!<1*0E9Sh0XkjV*}Q5SZ^>H2_j?!Ia~ z=26C`)!WU}ER)|&brQRgV~Oa(PJV(mZaHWI0oUEM0;7GS$Rogn059nHg5e&rM8S(#!dtPlB?5LyA)<-L&oKlHJM&& zaO8~?Rbq^&b&^wiWy^~*86%#dhGt4F*|*uNS$+)D(Ul1S`8M@J>(qR#snvU}b@Kkrw>6Q?WJP z5{YNBJ>D7z;VenJIiPB{)rz)s)3#$`ERK!B-8eb9##rFNlT5~Lf1ZJt-;qhRi?Mgv&1IL_gM zIMbz&0&v|rt;0|pu;Fld>E5+O6pJ#VRiZ)29-1SB^dJ9#(?t$7dFMZEvMh@AhzOq~ajEsTRe~65$oHSG8R8(>1o`NLE_bWJ-UB zT9-8!>*{<0K)Sdt$f&8hZdb)cjE~I3?%^afnr$ndV8aU7>iB?qa@T0*L7~3iRGmJ% zDDvI~`A5Qdu$lsZ9L&`JeIC;jn1xktUYdt8duHFM9w zVNev^p{=B!L=qs^fy6-KASMWA%el?61AMp-X)CbDQU^XZ))N**zbY-w)uqAOOB3np zOju8vV+p1$C_`OAnS5M*E;v=(kbXY6U0AM>wfai1wHqw-qI3}C1|sAE_Mz-U6YB;0 zvi+#L9;Bb63`4eFpTP)bn)krwI7qq6O8T{%d3;_y7Fr-u8Beg&(py?%Xip%wP}6WO zfCf<}#{3Sg!X?JIs&iQswxR#V^fDusgTl5GqC_^?ZsohXhelyuaX*x0uy(pgoqJ{J2jp%a?5rX|JcuNKqJ_hVfdkC+$ zaP+BFu+!H#thh(PQo1PHqift8Lph(P(w&pV;q+^r6{1SJ>8=ZJmAn8<3TuIwcZ8dq zi-cMht}7p8Ae&SyTw__o2$E3D)J5<(AsUM}VFj7xax8<5g3ZIWjA4y}P0Kb4iWMOd z762OsYhTl3RQe#)p-f@CWeV!uJ=LOCt-ZERsC3u0r6SczBYMkxaBWG7^?mfRZKr7~bhk}lD4nlim2an@L7m%EAr>l6WU9E6bpGeo z6&0h6t##J*i=}p~@SYq6%ZYigdFYY62`q5B`Vp}GVMajRHKS*o99V0(G)j?VHe=kX zJ{zqPN7G-9*5#Okbpc?ZS8(0*GKT(j*swFQPo7X6*UfY@ z{2x&-AFY{l8%?LZ>w{gg*XY7KRa?uc#nFdY!}b7WLY3}a)&Ski+>Bc|HxS+4^IU`^ zV}C*}zey$b)~FR`p}KQ#mHL=0XM+J)u%=z8>liBH;X|+uBop@dh&2nD&!7Zn#Q_NJ zh;ctVYD2Kihi`9o1Ta7*$dG~2Ced-sAO&VpDKTE8%# z>7M3;Fy#pl$WDG1CHAPFM#&Oh{a+ED{0#LyMq*o+5tY-PEE6T_e0nh&>5;=q&%|;z z#$15(GVtO9+icHg9FDq-_)cQBq2Q%jUB9Zr+o#R zchvZd`a;i4aa#SP=WHg;;y~>lOvm9L&6aZw!m%4&fMjlAIm@cSg6z5I_M_y$I7-R= z2c{25*>@cnHTFqYe^TZ2ig98(WjOL-8X9t{mzyfn_Z`@hdlRw+%R-_0yhP|L0hW=# zQg&Fv7*c^c`>u&u!iAG{{mek}8Xgf;(%tAo)V*KjTjN{lV~%oUYgcq^BcyM)d?uT& zb|G%GZdu;K1Yh8he5bSVgO}Bh92_N{RZkzR7ikqY$E001928?80dF>LaQMg&vr(N$ zO)NWt84Fg^^1)cIyiI*CwccNbA?sD(4*k}OC2`0y4e}%ridr5-`RgD)+NUmv8p*CW-AAGzNjK%AXv0Ab z;wm>C!^kp5u2qq41SjOJe+LzVO*`_u&N642hZ2wyFZV3i36O zxH)07%dr`-Z%p4M! zM^QA5@26P0S;cX7j#1zExL>@K?)$h$MEG={!#Xb!=?CHL!jhTcQ16@Q^eqtPjhSZl zZO#WBZYdbdF5Ed^?YgB_y!>|y^~NInrYwg)^g{De06WMQoNErethkPg?Jfc zu7LKlFQ*LT9?AMMQO+suw>_7u`{id*GDYuo?W#=SsJ1uR^@7Abfb7?+*&o zgJ>OO8k0KJ?P}t*G4$}T`G!Zdc$fnh=?kcA=^8LH~;`Q{DyC2Kk?fd|1 z{uty+klo;eS$qwTuM;82v(J$E{P4JwkC~5@cQJ^!5oSL}9)cOuq`TioRp9s%@#fzF zWpXx_K^D3iz948JG3m(!mnv~`wED)qUhz|gnKOSQ%=m{=N|O2i0b7LHb#hJWM>I)J z%=rJENsfW?44DwArc=|!y9_gLr9W}1EeB>K;;?2bBx4gr4A%cm)KJh; ze|fJfZ#jpA)(a?n1!Q@ZFN3!i@c0Ny>~36mwA>BV(Whew)YQqNxi<|tFoo@Fv0vxM z_?ylJm}eO1l0QKs0=Jw)ryL>6fZsPHk}bv(^!*LqRnr6$1-;9czXF?UKGS+#ILNQ4bTPh9rPrSP@)ZZ>mO#TZDx6~Cf%*9hx zaC*8JuNqHZF5XSwa5^A7KSPf(U>8yk9QlIrY-#-%jx-ZpAUWx{W62b9Z`|Zrds)avULW-t$M=FnwVfPSQuu z^zPn7jGg0NpmeNSlv zs&84f@;Z1?6zR3>e|AIp)w30DCg&~%;@($)~&>95@+-dGd)ngA=rT!ap zZ6Ewa>DAzkgWwq`%?H8f4LP6lUyjEfkefk{fZPsp93&0$6_BTh5Dauj6Ft_=?enPZ z138P&Z{(j)Bzwsv zPt&6=F+<@*&#aJ@%hN{!RzW_2z5{O%6r z4hZa1o(1_m$T=dq%G^WWg3H(}x}$@-n_;}Ame6bo)sZ;!jOKYEa#oNA_2Q#Jk+0_d H^Hu)?bLD;8 diff --git a/tmw.py b/tmw.py index 30bb384..aa15584 100644 --- a/tmw.py +++ b/tmw.py @@ -787,8 +787,9 @@ def get_wordlewords(words, word_weights_file, topic): def get_color_scale(word, font_size, position, orientation, random_state=None): """ Create color scheme for wordle.""" + return "hsl(245, 58%, 25%)" # Default. Uniform dark blue. #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. - return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background + #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background def make_wordle_from_mallet(word_weights_file, numOfTopics,words,outfolder, diff --git a/tmw_config.py b/tmw_config.py index d7f5224..bbd06ef 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -176,7 +176,7 @@ outfolder = wdir+"8_visuals/wordles/" font_path = font_path dpi = 300 -#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) +tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) ### crop_images ### Optional. Crops the wordle image files. @@ -292,7 +292,7 @@ dpi = 300 height = 0 # for lineplot; 0=automatic mode = "all" # all|sel ### only "all" is implemented ## -tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics) +#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics) From d64754223849f73d37f8957adbbff8b455daa558 Mon Sep 17 00:00:00 2001 From: christofs Date: Tue, 8 Sep 2015 18:13:22 +0200 Subject: [PATCH 47/56] Fix stdev sorting bug: https://github.com/cligs/tmw/issues/15 --- __pycache__/tmw.cpython-34.pyc | Bin 45195 -> 45451 bytes tmw.py | 24 +++++++++++++----------- tmw_config.py | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 1df52ea958c10136a77ff1df64dc927dc84dc080..0e49b07b889ca56e94fa3393fe1c9f93175c0fb0 100644 GIT binary patch delta 13896 zcmc&*3w%`7nZIW;naNC^kOV>qFySRbAjm^N5CnNt5MqERScPFSa}x&U<+&3efpmP- z)m5v-qgJIqE%n!0ZL76Y+qK)R+lTa_qE_2(Ppw7kZkK8ww6)vX?z-RqyOX&yV{q;6 z@3-YgzMOlW-~0Q{cW-Wb#dXuqT>f9x)cSufetpUF&5!zT_6V&$;+aYCtDf%>JmNiX zp7!n7ys0}ZEmP{sG=6Kpm65S@C=>3tbe-Kc<7UE4W-P7#;=QcJYZ!*EK9~;mrlNxx z^I%518LC88o0zR`tjc=k&=l!c^}gGew^DPUion+&IJE5SMbY6QcW35cM*65L=+WMk z)tL@w_DIarp1zb!gfp^{9^_>8vTxjy0Gd-V=#8WXk{LqOok&GZdx6(7`;*~>DTA0c zy8t>=hku4xq_+5XUdkuA^Lt_jj*{~T1l{s%wA%p|0F-ZozR3ha^kaDl7-Fm#C#X-Y zyt*aVRvmIT@OGf^lpr8|>ZW>Y!xCx+>*QSYFliNWGbuaqIG=zX8yS!iJT?27wXq>n z)?joJ0FIP(0QKs94U@(*mH|97Cp?+3>^C!79cq}dkdI$TW7;dtIGCa94y0t%ipL^m zEEzQq7V`yrEXuvm@NX{Bs=nN~JPYm&Rb3v~d1(HebB8797^+;GN}8Rrn%co}Hd>6# zM6A95z@e))8PDwN%%t|2NzkwcqhkT;0U7|t0I*1#&;tGPTmWK6has1txg4O4fcAvb zX)_s>i|8TPET>=~0I)*c*EAt}KAL6NFbfJNGave5ag*h$J+UM|fO8UjlB)o~Qtgh% zlBS%AW+n5ij8)o~myapGN|UUmhFa3RTwIj1nma4SIqKuB?_2NZ)y(Mi%rXZATbOyI zu%zL-!Wy(}%)uXB9ZALGW+W3!B`vv0&70WV$}D3Zt;39pL@cQ*V#$o0q;^f5l`Una z_GD6_h_ydB+2N7)^v04_vRCe)_3|cHeLc-TL5sQFVaI2Y=uD~NsCBh4_m6A9&pk4dDVI=0Ow!fEX> ztw=a+YWF^q)CX z$cQY6$lF)2#x^Y$4&&nnD)JUAVzD{;;E0}e9sVI$Wy}3WOb-CC8v|+*I;3kOaWl-i z?|8^+WAI~HT&THvwXRsb zb|V0Fn^u>0b*&+veMN9pu)v$3OAcW(H+(`+=qf9dO2=aq4D+0OGT0;hs&#go&9bpJ z%h)|Ss7Eqox$&kh0z2f-%C!Z}Ro`VoY(AzGJ` z7He-2tC^+jXbVa~z6{55sP}}N5Gvwj$;SYLkwy@Y8eRfp&cF^Y2)d|ucGZW%8;124u zwV4#c&cEGrYH{Dw&>=%VZZ9!+Y?ye2hq!F2HrwQ9j352p%!>T3%YmeBn6f_Cv# zu6^MQSN5B2%?@UofWkLvr&4qn0PaU83D3-CLD-vcmqZW}u$`g!!eNg!Au|AT(%Y6{eA z$)l`?1$grh#V4r;mroUc&ON<6Aw2du>iiWOvaHAsgB|DaN1X6yfGGB_wDwRC9-%*g z{yzW|I3Zb4yaSeb0aAuPVGx)V<3NwPF+w_-F+(;-ly~KH@pf*`%I}Dou$Uq<0Ux^L zrC8%0G{chZL|&pMu9__VqL!^%5zCfi5X`^QW|98BRK9`6C@n(SaZ&q`CE=75E0nI z1yBLt2B-w!$aFoX@b#pK6zRcu85v&mRT0oLiLSlU}GDtFlWWV%b&<{}L<5YU> zR#%B6L4vYcI{Hxj7F&XkC#x?@H2JkWg-QbY9%_I*!k{@FN zv$R94xNwtef>1YIcw&>yLWiY4z%)81&rH^DU)YM!m5Zd)v!3q6&|K|j*PRLXa(mVz1B6$Hpt!7C(pvU##D* zcGSqpVl|z~5n>}y2ms6l9!mgN$7W)o58R5xQv*?57d89B1Mv)U1M}4MWRd1k59NQ`sfPI+gW0Z`rfu@=d#5er(?S4 zqQ;TJ*~^X!pn(byS-_G?#xSOi5 z>nXU5a)kULNZkTNUqEXcT9XK%wvn-OC&I7?-QDP}BL?(vR=bU~E=gHh#!SFJokjXY znL47pv^`annmwuRcq)G}${Uhz`+Tu0w|x6uq7~ewEHFihE!8H&#=7_L#B$JsDf|1HkM8Ok*PQl)Q1-dLeTQ-#=R{U=gwk z@4&2G01!j4PS=y-%5c94&JciM5Mv zZ=;>S+JqByC3y&{yL$7gqpsZ5IE5VuA9uy5^xVH|k!w(>XO_3*PVdT!>JVn_1_-M| zyC*GS)i^=B%P2Zc05;HGv?2iA7^Q~MPe1%m_N!-iPt3CB;>2dkOD{ojHX6#q0-SMpDiDj0Sk_)9g%HYv2fp^NY{neiJ{z6;T^&b0IWz`aqw;& zvLCHMfFT0p1TC}!7-BK9$IhVdAi&6(`6C@wI*ApG7Gue}U!+eos@2CMYsDdWL(;6t zH@EJ-)EE{<#%1iSFb)eNT46-p#u+qTmlTXqoG7OYWMMwexW#?K8|+X{4Md^OAc5CI;PoW%2KD;R zJ0<8q*GOn^l%Nnx?LbGOLlVE6cJa{7 zc#sMK71X6GsdlhtVx={v*z0YLGfLcfof= zQ}7&_J!Syu0NepHhO)@9MI~cx#bJW7&18UI3dGphYZ#w4jJhnbkE5G$^!)nTu-(u z6zVtoT56}cNey5ahQ}{d_ulanZ-ur%L+}rYwx51jTkcWU?7dap-#bRN?Aw*?q3)r& zxdF;#_knk*PGea9c^ddMB8cq#_%3FWbVO>bjLnu#5jc% z$VsZ|g{s@Xq5}$@e;r~c7NzOLTfXyfcRiSvD{oNg{cUQMRi$p<->m*lHn6F6jTD66 zYc@7oBE~{+tSX<>2M9Gg6*`F)E9RA;(XIhl<2HqR&GoJa~9Ruf7J8BS8cMZ%MVdn9HSphIpegpu#lOIKklhnJ=vY9vIe_)=xo95n2 zY}=4$mCM;IALw)?y$z2K$}@5=h2tE@^d7jE_elYu%@%#qFxvC%y9c|C=95~~;r%V* zLrNZ)A&%wlI&fHISeGXB#r^6>gQxR%eaQ4{5}sQ($D2P-V2=Ij|%77d+xBMg{Rt z(ex3O?@3)kU+^@bsBm%L_LN)}&F3ty(Om6DB4(G?Ucm+r@JcS^ZGPt)iid~CiD%W5 z!!yJqDrHT`xj!&0S|0*#j^BvjkzLGYbu>G*{zmNRT}#J@lSA?*^>lV)upV{8byz*fz{2aie1avKz zbh;@9ptd0cU`Sn*F^!mc@(Y^pwW|Of>aDBV#4Bq2)icCb)QYQrWnbPPWcdidR|rtn z_zASX48YrZ6J};lii*%^NP0aeKaDX6stl{*VgnGW=i<|v_yE!xirQ~N#bDEvP`c_} z^*&dvsHF^2K9S2^b4YM7wSC4jG)FpF^)SvVxG$;$QURCtvrGDH2AouP@q0W{A}1Yi zEP${2ypbEbu|d2*$Iz9ZjrbCwKKsFGb-FrV{@_g5kknpx^lb#w1PEFKM^j>PI{ zbGufxUpGx0&#k-eVX?R%C;53gOMBvQ-Shzm4&Debn$A6>g4a)LxDgAgY@Lgkq;ywY zAIY-Pog`#Q>8$;BS;WLeI;_2L}8+;

    CpR^?wpOZRi6(I(clX`v-&=~8_soqjC^m-on&+JW+lul$T~}Hcl-&z3<2C@O z=6lQZ{`{d?VwZa4=oIlBWqN-vS_@g8yd68?ZG*f6tvl7vZ#{n`N1=po5hr|gJ#?=L z_gaxXNqqcKWJt8ekVsd`lfL(qw_K}U{&4r-D#ereM1@>qu?toIc!a)7QGNH#5HIAa zZ}S!loNhO427k3&sfPS4*xL=j!b31E7v4^cmYK%1esx4mop?hfIdQz$=BfZy0elq` z2zp%d#oP&%dsm4bz2od6iDo(Ld>%(R{;))X+D&ze;#zuFu+gTQwG-}J;-%)96H3S7 zrEpG3LYd8f%UO2It+unX0QOB|5wnW9R7lG0s)-%n))($yBXXoQWxSi7e@&0FODvI& zo84s7>h(Jt#69XScLv1|b2C5kXug>G9L{_JU@O3G;K3-qh{qoiAeZOJknw!?ypzSu zB4sMda2sQdVB{k=V_EcO<1rPw`&{wkzk!!2A}sG&=*365c8&ofMpM!0* zV}9`}yP22%N;eZ6O)2TT|ATMws$KW4%YKhGDaslDzi*Ngz_U>=L~8kcv&A3S&HN;H z^L@QVZbl*x>yk`9n<#Rykq_ZUb3L5#BWMI`%O%OG$5FXZ5*Yb#Wd2BF0cHMSsS?Mk zI=DsRT!1V;e<8?c|K*ZbA3GKlHR`*^rih>AesRoQv653ly9$F(zwD^;d2rwa9v?!B z!;Kq{cDSKF`d$!;n!5Ln;z~o2n*#Y-9MqLb{*ZS*_Bn&=lHbQ561O6uQ|Y0AKrk?> zCtHdm=%Xuqpgv@618_3C-FRd|IlXJdY<3fTxenuT1wi4lq3h_&k8p-w z?$cjJgcDX#GWbs4n0Jl1fn@D>kBzp)u>=D+`(1sb`~l8BNkG>n2NLu*8JSp^U$*KR ze3VRINb(Ctxq`OR)r8T=xU{hOYglfpD|lG!PnGw9*|kEusL+njw1d+jLyjQDck#!2uTv=#y2+w-Eo z;kXEog8&}{xCP({z})~jfG+`jg8;dKfkb-HuG~I_-ctal@y(U|Bbxld-`i+a6GE~M z0FGFXNrW?~_QYeoo#{cesQS;pZ3{miGg~p!L*k0268w#k{f*5WjI9Ca0@wnu17H^b z6hLMN083xP%0twLzg7tMrespyj2ZlT=~}dI1^6()5dZ~HsB_8t=~35Okyv`LGh$!o z>Dv^$As@rs(jr1uK7r1{9Zf!iN6y(C!+a5aNN41a0bT<52|&R_3`0V>CA3FE>@cI+5@S-&J~Aw(sm-5T`QQD%rh@=Ubm9E#V-n84YBukcUNnYFXg7-yO*w_}>7-1s;c452I>Mlub_r|M|7g?rR z4ok345_qhEusD;15SB2VNd_`Q$ee`9LLecB%%qYKU@~ExFv)PpGQ(shbH7`yu9hq( zBtIrPN1tB3^}c)W``&%8N+7hNB?%WlXrXg)#7Jv0}(~^m7#EK4@dbN`_l19o%Tc)ml7uY-9r)ipAaWot0 z&%}>e#!<^I1L`UjY!PkhwqVXbhpNb+stMiMSx3p?Ai+O;>iCB9R>e<@lr6Q6S~5Uc zVZR;7m`k%!Ye=G=?H|abhZizNI2kR?FR??2I6g0d=y*+`g2O;c7V-IjCTJlSW$3!{b6cPK^yq37r~) ztlGqgAeRD!ojGV{BzSe}{^QFRuRkF%j}hP2Oxjo~LzE6TIAXCd z(^<#_;7M~5btBe?=lPhk6_ z*{qR{%MN-7*URZB;Hj@t4^3;zt;X{NX;=ltk#*={AdxiKuC_ms<_GwYgr4LY090z1 zB@<~w&O*}1I-8&>J5;cbA)lj4<~mKSt?v|Fd9!}0Pb^lan=k5yZ`(drbT_Nah46G% z9yu&|xa~EMnU3Vpk6jVVB$GzWN@UWeyg)6RQQyoeV;!wW4R0!uw!Mk8B^%U%84Gjc zm1+B}OeAI=4$pFRWc&LQX*1m~FQN8wCxBJO2Dt!zda`ldPwOucH+4Tvj{e#q=m(BN zS=1g`{>;FDX;{w6faY|cl|w7t$Bp!m(?=v`5n&9cB9xo%H@P(3uZMW4QXYEbv@H_4 zk2}|~X0tMr)hmnRMog)PoS~7dMu8cLrsEOJN6yAX+IV^J+H6-JYPn17&(EE?M9g1| zLSA_&3wvzO((~*xnK@#X8dDva^*Mi`@$%Uhu7^9?UQ(a!mqwK6M8dhjF{vzE$62$f zXx8={W-OXD?6N}!*{@zbKD%itJVp)OJRM5E1jjq#$kENAd0*Ojo56iKe(o6Uk5*ggO1(PsLl(7X$+!C_>f)| zOBzx3eU~9CSOo<=w6(7gX1=dIF36RinX^8}3qDk`;Mq~hzWrpcm|>a|7d6JQ%VN=F zGF)i~X`yA3P$i@x_lyn>(u3=O!w?PYSr0gC(sa0>6ZvH021oxXvimsG-7>dJjO1UK z+f}xQjSY@^PlV2R>F*g2f$bfTJu*ru-P?27r6Zy9htr4BnIq{gy^Kn()V)`3yIl7p zxp}kh-Mn=NfU*sA1Z{=|&3!f8CxWVZ zQH!IpsgBB+9^L7(^<2X-0ZMf1GpRF6E~B*Mh3-pcj*zC06pK|WZ^ihbL^?ZcVfwO3g_rlvDYnuZ;T#cij6}?gID8jDMO{6#eEKeZ zN@m!ircQ+jxTBaA&%_Gt^r_=&nO2nI8=gwh7iBx-n5K3wo2F*0m@95qn^qk6GZ*ev zUtY0t+?;>EqD|bFZ(BLfle>eO^^oTtN|5G=3k4LU0LRfGO^hM~k~=A}rp_=4m7#bG z-~@s0&&ULYC~^mKE+VjL-DZ=~n9(N@-z47o{qKLzyB((zzoy4U4awNSYB<5X|($)X0ZuDP?Iy+SJn%UD%A+yR+Gi`+wXi^w@ z>6(^?eJBV~ml2+d?WLq#pmJ-LRO#hegF;!3taSCvnwD`&dVNiwcq+eo?P9TQ6UO-z z;1K}kC_hHf7cO_E!D?}Rm-pryP>W}QIm)}#C)c$u`vj6$YqZT9gSMCCx{sD3Wdpg0iOpviCUGVwMw~_3e}Sv=8Nals~e8)WB0fdt^OA94B%M+OIPMd$CG{# zxnCp*d*$ELAG?A!>aD~m-BeHR=;`#iGohjjW#!(k++@51cb`|WB!*dwn zdB9cZ-)9cdCOkI(CglGepr{FndE0?u)3^{J!&8ornTX7%Fa~JdV)*KyVXZGwa>#e z$gLo?9dH78(_uO$YnFADdSg@9IA>k7`BL$1wYaM(fBWXY5%bO|qx>pH!cry?4{ZsP zh<9ySeQxzm6cG(y0Q@81Yk;oPz#M8R) zJrS1ORBD$GBxsPUaP-lgv=h>ohFpcyj5~XFWS-n;ZoMEWUh_Qp z^{93^;9Yh;`!VZ3b$x@?J$><(rx7cpCCi zlx+k&4)`Q0)VkB`HHe_c!edlmmA0#m7w`0ZUsES9erczpLRY2Ca{_fw0NA}lQ8Pv- zF0xKvHyw%LxOO=+w-xQ@@HQhY@(%Th?Q`4oI%qXw#xjyZ^g)>!&Kl#xwV!PddtTS_ z^*e$h#}3IOZNqQ@g0N4%juAq1f_2(UR{cMr!uL?2!kO}G$iTeyDXfx6$-sI&Wfk_; z(rk(eXg}tfru?zCh#FU*JZS(;`55}l0iFPqy5I|!tns|5ssFuXZxGoeW==(Rt{S%* zAKcmMc}vSbzw?`dE>KSOEa}k8X$#HB$iQ$qCVz@PM+xlUK!(otbRa&1c-QvQaY|P| z=~?J3$X?WY6JT@YL4!`AL`r&)j&#H{(PO;OC=5$lbbd{lLzyEu)$&mCEwu4j35z_4 zm&IiCIS-WrX7sQzkQj6>J1XUC+GZN_6b5Fe;&uCPc>E=ReQXvQhM=ujGBX_4tK-H% zbU0}tRG>gpS1(>Rdz?;wbJ@k7-)i|yyEfq%asBQYTSC-DVFR_jySDF?|B2q+t&~?^ zCGraA2RaPW8H!{L*AyaM_2b>syKh9532hf5kN4RiMJI!abTsMQ4te`CNA*CAx|-;Z z8!ulagj$|zQYSBedW*9lcm=TX7Nf(D5%h5kV3)H8vA2*5Ohyz>t6XnGeR0K(bE3z3 z7l<dmW{iTP^kzGrtkZp#*@ng|sqv-Klr3pOPhhcTTcJ-MsS z+`mFBP`md3aGUF@k{yY6S312-HUfJSMzDQE5r(8{j_asv4%E$l9OLtF)fk<1--bMf zcJ=WCt2~Q@`c~)k{5uD7qGAeaO$F4c;~$v0mR;ikG-yC(BLFTWry*4j;NuA;v_bmi zA3B-6{DB!c_ME~=oOkMGREOh`s}Rh@<1D}oKntK1FdHzL8-YWm>q{YLK5PRv9M5*l zwk@T`JHY4#H~<0(io4|!q0*qDVO zU033GfyH7eC+eSivDN9=R^sUb9q#)hbSOQL=Nvpp-BI|@0#5E?tBux9tQ*ZcRn&_n zG1DWLP=7I;Gx3A6YWjsuO?Zr@X|#RDg?6Jy`|y9MH_{!sc$pUWX*X)(gkOvMwNbwo z@oSdQ4hFPQA07uiTA_{*Cn(3K9Smym04gJ&(jhHgPCa-FrS#|&QMY4FsM@GUyUC+9 zouGaL+QD+|V1;H?YI_b=Q7vRe$~CK68ztFBE&1ScqLWvoCs;7A?;v| z7U$;9WA9b*YHjx(g5Ge=9fO@Y>7|J~hjP%mvzx=;=@4nf9cm}7XoY22hvYK=Sfz)K z+UH!MOQ`}!#qvtj2j4rP{0>Ud6}xF2%NiRw&d1%gwPqDtMpjTe+DDuvjXs#a*!^H) zJ472mFNc1*Y94B>pVZCNj-l4}7OJk5P4!F{AHp3tbS0y>T!=Y>V;7Rvgm<)!CSB{Y(_>cqhham-fGT~yFPAzUH$Ua;gfCB29kj)8bRKM!0|nRW>)q zr)rJll<1@eju@}gMj;+aPgyhq^&8j7*3)`)KV7KNF`dpO6c=bB zOcpRY3>hj}itdr}X~>NgCznCYuOW3Fg#SaVNtNtIoQ*ORRrDpQQwCitjvZ{TR1c@? z#rgTqr`yKGgRW%bNg`ZTBG_FR|84;9;(XwmL^^psgk|1fItaORU{~Tkkj|V44Lj1# z`INbTIohlMOil^cXIn&6J(!(IFV9|n;2NQp$VD~0{hyBsdy+@ws2Y$<{BJ|Pwdy-} zx16_*y0go7M2F;vF@!h2phhPYpc3s2whlK6_^zg^k8JFK4;Sw5o#>|xjTqV~^%A5B`lqa`R-TO#w+Y74lxSbV(cH2jvbp4gec_vom|v7`CpMhvg}c2L8@d$=Bo zZ(N4yLXX{modoobZUW6P!?;8U`#GNuuuYk3HX`pVt|3me>zHi#glS&J%R6N}s6I2Y zMD(cdkGxYjoGKXTIN%Wg^;U9Q!?6I}(-iKC_oDEJ0N&&2lBQ~O2Hj1BM;BGJA)BFH z9T;tB{xY}?Gu1rhv>z`L(>=16dXIV3cRHu#PmlJNUfAGHjU$pXXhPpIu#R3pt{b=( zFKtS$#8{*&P;WipO2EefU>&-_y_|*|bNRl^74*qTEl$Cg4O~ThA9tb)Ub=P}4w%kW z#i!B1Zf?m}PnVv3rT*l#4dNT>tJlsGdsN11%9rI%h~_?OQow&E_d^9~G*Qpn;+Y$aQhT2F_vfn25c$il9p*!?R@8rtW1=qKD-qFl<^fN56~a z?qLH<>fA>Rmm>o5D)hY~*(4PGta53<*<}LTscxpnPDp5s|pB%{NZ#*t|Gj$yLe)PefekW2Kuomfy zz7e1+M_d~lfK=VJ85jM{f=EXPg<3L&R64?FiDj?)#ctoPF}@&F(L;B)N< zklKOK_*N&1$3M>zb0z283YkDh9Pzo+97Ns_026lHaVj)4HW5cjdO7lVBQqx@<_3B- zhp)U?yG6(G3^uDTeK0Ii`7<93i6mQ>&IFoF5sZ;=abU^9aKJ~W22C0OmW_pG@mB9p)GquuW>#49lK zInRRUjPo*38`HBB=h@ZcFS4X}1#0ki#=G15iuC^bLkq=T_0$8i>64e2ANZhXE=GCs z0(3`*I!$guYO{LlBNvRtC=~Fm;()KZkIpjDels?dPFQCJeG=iy#Gvne#VyyW*KXhS zmkRM@Ay6S|ETvHO#XHF7)xg8^gq5#Qfl`Ch35S=#pKMpU^vpuNd^BR?Jr0#mFy2}5 z39;CmL3R6`?K5ul5s711B60fMkxua5CPKeQ4(DIGGyexIdjH+$l~^>#Q6~t$gb&AA z&@Q7Jh0@F4o^YKrZ}v{mx7-alXF%x^9*36W0?J(B<3rwr-MzNLiO#V1Id^HSD%Mgl zD0i;7o&Bx5c>Wr*BN}pR0Mp9c!h2<0kR(#sq_KtNSc&Am*`SuA8J%zI$bV)#58l_Po$afWKKysr@Ljs zKZ{#D>cB(Wawat|i5dUDPm)bE{N(+K$K4p2y+Y3 z|3q?951$T;fcnPi+2XqV&rX+lyB>k8A0y~{NkL}|? zO+9#b=}kikO=0@BIBNUS{N?G-(VF!#4cZ;=Acv8Kz^%mSboEdaAly1BlaaggS3-ED zP93~~PYxts6|Kj!8`<^Z@eu%rci%&bX@a@14x_(|yqQShrq8aXZxx~zy^f|&`J*Yb zBp5uK8w=D3R?Io~eRU2kvIYM*H_9UzHb=~x+&BmgR_1e%MgL4injp}!8+5B;J=!qu!R`I4Mez4mO=5L2lg3e)#tuV>3&bfmn zn=nQIfjjm6S@raj^|?FH*Pe;rz4kz?y?{A@4nQMdA>cef7hpeN2yhL+1dIS~0o(?- zoq%XyIF&u-+}z%W+@}Cv;3fMUlFtLa0{AxI6~Oli>?x_Jh1;HFqJL@j7*e+1`RlCk zU!x{}{^2Kc#WN}SI&$6wyao6zK+r6mm-ppJ)dJ=Ka`S-;0E+;4ZzvZN;Pd)ue@3R| zdSu~!n_LOl0O$np{qFdCF4;o`dX*VVWRES4ImdbWn!-uQ{byS3uQw0TN9!`Zl& Date: Wed, 9 Sep 2015 21:28:12 +0200 Subject: [PATCH 48/56] Fixed double loop bug: https://github.com/cligs/tmw/issues/17 --- tmw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tmw.py b/tmw.py index 29bcd25..d862c96 100644 --- a/tmw.py +++ b/tmw.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- # Filename: tmw.py # Author: #cf @@ -347,8 +348,7 @@ def perform_multipleSubs(substitutionsFile, text): ## Load table and turn into dict with open(substitutionsFile, "r") as subsFile: subs = csv.reader(subsFile) - for rows in subs: - subsDict = {rows[0]:rows[1] for rows in subs} + subsDict = {rows[0]:rows[1] for rows in subs} #print(subsDict) ## Create a regular expression from the dictionary keys regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) From 033804ca18755f9365ec37daeea6caeb65fa90f3 Mon Sep 17 00:00:00 2001 From: christofs Date: Thu, 17 Sep 2015 16:46:47 +0200 Subject: [PATCH 49/56] complex progression: 10 bins for display --- __pycache__/tmw.cpython-34.pyc | Bin 45451 -> 45362 bytes tmw.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 0e49b07b889ca56e94fa3393fe1c9f93175c0fb0..c7fc6dfe1b6c46b70f878fb8a6693237a831f651 100644 GIT binary patch delta 681 zcmW-eO-NKx6vxm1zBfAa@?)Bk8FicwwNQ#6TUZOju%RMKq6DR|(B~jZW;$lbGs;QH ztR${vso4}xn4&eFYO573lr}-oDudicN$#YeJF__d!~Oc5d;Vu=O)UHnj$av>M(p!u zO<%&XWQUhm>>a=vSz)}v1>P{$Y-r`L&N>~7yf>#K9qV$+dnaTgqD&Q?}+oGy^W-fkR}#gP`VuL*;z*_{1Nt zozk$%?!GAShsUG+0tN>M9&5_&#lcCSoa={%A*yZiY1?+z?gj+T%9j&yaZGM)~h(gTMlxA5(R{^UDeKRh{dfqyrlqjx`E6c;#6+P$z;@mnHf@&l&ze^xfetT7<6p7;q$P72$eoEp`z%8hEyikDaNQAO=Y?h$S7Ev zq9@cOALJu5GHd5YKM@3xL1hp~KSBCi`jPe#1f4rye(c9$FV@=ooEOWYb455;lT);H zteyGt-1)|VO#a}A0H#1S{{nV7 z7Ra+jZYn+bKCsJO38kK_5Lw+wLkc`>&#oyGQON2cuc(6j)fbeHYs)m#-ex*MuGSvg z42wg1Hp6S7yj+v)NRU+tyGTz>?C4A!;YB$T*&F-gVP3?Rs&D;P0 diff --git a/tmw.py b/tmw.py index d862c96..732cbc7 100644 --- a/tmw.py +++ b/tmw.py @@ -1603,7 +1603,7 @@ def create_allComplexProgression_lineplot(dataToPlot, targetCategories, plt.ylabel("Topic scores (absolut)", fontsize=16) plt.xlabel("Textabschnitte", fontsize=16) plt.legend() - plt.locator_params(axis = 'x', nbins = 5) + plt.locator_params(axis = 'x', nbins = 10) plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) if height != 0: plt.ylim((0.000,height)) From 52270fcd926171d9513e959cdb0988f05fcb2573 Mon Sep 17 00:00:00 2001 From: christofs Date: Sun, 18 Oct 2015 18:26:35 +0200 Subject: [PATCH 50/56] Add normalized and zscored topTopics. Fix https://github.com/cligs/tmw/issues/18 --- __pycache__/tmw.cpython-34.pyc | Bin 45362 -> 45567 bytes tmw.py | 207 +++++++++++++++++++++++++++++---- tmw_config.py | 72 +++++++----- 3 files changed, 227 insertions(+), 52 deletions(-) diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index c7fc6dfe1b6c46b70f878fb8a6693237a831f651..229b7dfa66541bcdc1717a7162d6bd5735272263 100644 GIT binary patch delta 8604 zcmcIp33OCdntr#cQb|=-Qb|Y=LMmjVz#xl2h|qx$2|-VwO$eY6sH7@Mg`_InDo6+c zM8y37`Xg@GF4HO_ve@N}xQ>pVnRZ7VYaeDD3Jqw|wrHwhMq{f< zb2=N`qTX%MP2EA=8VXB)pj}T0g*`H?n|%SVTj-W+ zs&;C$O|#3gEf1Pk@1e%h@`-UvLZ1q6`cGK?YH7K6kiIIt(D{S3GC<2bVA8w}ZIwxr ziPTnB4py_yi`J~@@@%zync15&xG*%w>>Ykyn_Lv#Bv);+C$`BC3#wTyT>Tr3|1b7B zwa!%SI!$X6nsZ#h@FW#bO-8;k%svrA`c$A#u@2Iq0$40urqLpsu}~C=R3Q&Vc8|MJ zY2IOw{g>L)04YBU0uHLiXKn6L+&Dfxhr=kCZ#dx z$>HECUtopwwfp+yopgVBY5obJX`xIW)J%3JoFXzshRJFwH93TQCUU&|4<_6Y3bpwA{|i*Ly?sYv{q#!NaqtklH&TCuONn?{-B;fQT= zi7D@8?x@$3LaY2eGT0lEOkI<&0Pd!`DMjKQ>Yfr3uhB@PzH}q^}-!vtIS26!DfG+`G z0Tde+gAH}QchUPbgWD`mQ`f~)92>EB68AvX*=F8TqvxBQk=HM76joznDx2AI$v0T3 zvB;&DTO9j3JU!6^xG?xvfN>PGa1uRnO~_a64}|2m=sK;m{n5;^IJ<&O>tz$8ZdvE^ z$o{0_o;5A>WtH0`1QpF%YJCk0UZ<;Pxt%z_p2#VIZ6hRI7?=k*O)t#~irsY4>~SI^ zvU2ttLRs=T$;bq%n==`kZ=ADuld(DWutM;OfFytoU&dgxC^}o z8)&7VFO@;h@OyoMu)odklSVp|X&6ffjK$Qk8l7%WqGdJ3;z`RVinpWGT$`7@A47^c>Xy|A2K5OUG_;6O!r0OAEyX^vR_jv6pHuYe*dd_SpV8YRZX`hc8Ps zSto$PZhE|azV!k$3&^ryvY1E}3#Jz+>MATK$+$pj5+=<6oS+>GI)#h=x?r4`7b#m9 z5TcHD{k+b40Q2hTlbK2fPnx z!c?WQC1{Oi&}|_Wzdzvh^|_PeQgkn(eNAIH`@P#VzMu_!7-1*Vlq+l;OkJi7k;~zS zt)w+lwD?gWD(LW%0`U%gzGTWc$k;v12%xZG-O|$)49hBzvM{M0-P#xSx2_7wYFfB- z#`HO$cp*<@>Mdwr0a%IYwzgm(9BTD+`E+ZC&)?n=HZ#LMju?6C;I8@E1#-1TwLjdCn2SsOB>1!p$NCdU`V;I ziaLhPY;R60j9JtJ&;qItjjbnFq?l4U1<3VOvHbm%ICF53t2t+GECwo_gNmNOp68QX z!War>&h%6cBXwb4cc`Hw*c&k7hho4JJKXl>T<1Q}PBUmL3vjBUMQ)5d-+YIt^;nuGh=oo16T*MlGT9K0AvUm1oQyHfDlz}&aE5| z$}wBB8Mda>q|~b9e=g0lo%QC?I?piiG~MJW6PqFtPm5?-jICE<)AeX=1Z-k(r^`OH zT^Nd*np}sz4S*puVuqzcxne_cWmwmZt!d(Iy0dkPxQc_(AB{jXb1)ErL?dVpmQ683 z8ycMqPNL@JqVYca@Y7MF;#xEslr~2e_+Aj3qE=-jDwVQS=KPYWa*-GI#-$P!cN8o2 zXI($W%Mf$i3?VGWCRR@k&=4>1ZSD8nd*@E~_#KpBP7zb&SJJ%x)g1Z^XHDeqARz7aumn(`)S#DLaYCCP%HS)q8yoYX| z@9B{JzK%fH7Z!oC$ngWoG(t$dorJIFh=%{r2h&fQGTdoB5b4HongeU z^9fyuS~Hc$#msInL%Nul5l7UimD#NYpZvnfm0{>IPm8Y$39`2s^u>DmbErhz7s(4B z7YWz!;+bu{a{?s6FfaVUF*I-c&m_YI3FZ1VTc3<9nS@7$|YhFxEH+P&=>H# z{;0kZ@ld3u?*<`$NpGynt-AxG_W>SaaHkA)H|1$DCxbZUBv4TVl+WFcKEvm#`?JJD zw79=q?4hmwCkKyG<+uDeuX}UW2a+A8yD^{Viys0N_{VeUz}VCDC*~UqnVpeY~5sy<|*cmyvaf8S{fZbHkH=-pTE2#Ravh?lT)Dvs`e3KXG zmpiEMs`;a{Fg%~Pi_6viC0ww(P4W=N4$$$diYgFib!(fyeU0?#c7B=i%3!;df$Gjj>MzH5U0RE8zo5u(v+&;27R~)AoH^Jft#`Xc?DWHU(x+TN;C8C~D2-!{P{4bTSibe_fQe?%J z^P1%Bj4B@!4MV;{(9*3TkwLF-EfTLrPHx>VrWyhQ@h2E`YZt;oNM4AZ1%RRY zMc)BBa$Qk|QgV`^GTE)+AMQ*zxP1L-K z8E-M1^>~Ovsb8uc*5UD=0WSj7hKiD+IcQ=qfUj>%7cS;aaP`p%J42ZP|=ZN(7KyO{i5n@g5j(>S!LC zGbFQqZ_>GzZoFmT|18H&8IoasB*)6LX@+fL)vYCB4c)?cH1fo)iJ~*=^YvWMDL4Mf z>fpEFe}l+kT2P60h!%`2bn(t&t}<5cbc$n<{+*E@R6y;n32_R@RvG?X5Q+tXduabI z=PV<0I}G-fZ6MCbX2AK@EvS|+jRovvqgr-hi#5^1sdIsv@veGyw%T|%{6WlU&*5L_1U2{NL`-*FE0!O@&PM^UM5)rrG4x>z4hb<) zyLh>}6)-rA@f9lqluxu^^$B$=`{kXP;t(a@HC=o-)GyY+4X?Me38oNM|hh?sY*B%2A^OF7`Gn>&{FY{%ZX@weB;4saqej~WKZP}z2Lpn955zv9D<7Vo zj8A9r@!(evmx?kv_VA_RWTa%z3Ss>K6MjcG{{8Hh?{)r9G3O-1cXX^6QEvc>FU8W2 zM}X|iV0TZKuWuo$SYIgQ4+gHJ#z$t0Khds74iCG@$^nUg{Fw+JYvd{F-FuuP*s@29 z>Qy=S7iL2@xB0`mEi65OP+L%T#|0~>%eXd8P1u;YxgJ=kqRFM=(s=}>8LI{GO zwn8`CaY3LzwH*;bTH%fD=%=){dy6dX+D}_(rDtRmMeRk~?$0^D`zom-gEsabGlBYj z_ub{(<^0a_?)&yb%l;26u1^OfyY4;ux4CPw_qm)&B2mvudMa~LkESiwG@#Jdngt6A zH(Rv4yxAU6>#-X4a6|@zEru-|@ydu{^#^?(VOTmSqv%z!hTbn4EjH-s?z@E8Ko1pH zigol(aX@U=tB3qU2%;(Hg$wT1G_5&-0e@Ni%jKVyf8pn?bojiLCFRy0ySCVhm8=z7 zO+&tQrBjPIv>vDCOVHL@w2cWADjkzJ(bAK|&0}bHsVsF^eFLJ%5=&3*$EEo5*N)szhYF}knl&sm-+*SA3eW9IOyB+N929)Txpog+GEjtnOd7g zlP+o>T7jizozq-}YWR}3HU3}k%hnbrYd_YsMMCop)OFN@eh8%O0aPtw<9CMua}^55zCQLS86F}hQjqxOx?7GKdzqf5kd^zrB;>23>i z$Ieq|!etq3k+13p$CO&~pXZ8NBQe|-=#-&lVJY}oz5v)p11>BPJ80yEVeu0E+l5oa z6ZFRmpA6KMMl;56VPs+w4yzYs@~#?PA7Wf>p_h*TRN zjhzqYD_T9VYn(cU7t#Lh0{?k{k7Z0rYK7ZN+db4g0GD!La-q zjeljH3}wgi#Knb{Z#Al{&ZW~A@1_5&P95veg6yPh8ryoCRg)*Q9tSLhA6wW$5;dQT zDKA#4Ztx8@EytHkf0$h4PSM!d*9wa=j5M^sN|pq)`UqP}x<-*pXOWyzsVEf38yr}z zyqeKSpckfiEPvPN^ptDtFJYxG)67dedDxNRVBgr{kA$Tg4VMFWAD4v0og}KuM3P=n zeMo$_aH-M2nlTW5MNR!GQ#fP@X0WsXtbhc74KN6;>=`s!2*7E3T;}20(e6gAxeSyX zs7qup(gHqzFcMf4@JrLLODEb~fZ^ymT%+QtqF8PmdDXH9dH8h zJ{`P#f;dc$Y2!p5O`dkxwCSi3qsLnSmV&uG(?@WKxn_Dp=0ExvYRMPFMXE&>>#s~t z5f|-1&shK`gU2R|u>vRSs1f+b^`vWeA=oRX~IUEW)!nK*+0W8c2d@rvy-zpz=e5Pnr4yZdgMxn#a;%f+v&!c z)9pi197IQFjuGW#tE(zfGb+ngr{RY93UnF=I6)0{i^U*%xvorHs{g4jC`1i4&zff6 zjea$>e^#BifUL7iL@n*`<$;~e=GCC@1s8dKZHT+5+8Wl?}x zp%il&9E~DSI@Qe?l05^vad9`zn!$l0PZV0@RrKRIhllhzmRe7u=|wWRSJYdg^fGnL zU6y@yp8!jW$Pl@f42ztnCtRH+nqm@6#eyWJp7XhM856s|M2LMGa22{L70yPfkil?< zS-F9r&)>xhnuF>(>aHIq(&_&C%A$o>M;4FOtQnSE;bdRwwxo$%kp|W*&`;MtEW{Z4 z*)>Jt5Iud(*fN~IC*9l`IBg_!wud5e0;WpfsX7gNS0vEf7M2%L;rvNC7h}c|+>vK* zKzT0UTI!silcaR%<~CVPTjo~?Nzcq5%F*_t`4@=xdFZzQfMf8a7^&Xt{nFdwukl8_ zJW*KIp%EN0(oBDIsW;+p3CVyzEa#)G6R-pjn@OF3T0=b=t_3u(PP1sobt&Rj+H+mL zSV}Kl_hhw67&!iwlRIf~qbu)0 zs2NF!7FBZ|lP=97m+L=m+$xe9F=8RWOC!C7tHz*shARW^bmh}!P3ht>n%6W`bnB~|nncqqh+YVx z*Q2x?(9PgUm0^_KXzC-FjG(Ry&~J_yosMIKT83Ix1J(G7#X(x+8!J|@6FN1^X3T$f<@1Y3;GCW#wevn z6)?Z#{+zKzd~s34q~uIWZygvHcoh$Ou>E?Bv2zNxywk6uuwe@XBNtvEo4G-*M3>np z!S$H2`5J7?72~aGn##3f7lxBF=Go05$wrHVGcx$aD(Le{AMI?-5e;;pb-2*zWNVpN zt7ivpfj8Q*c&OcEBE7P>Sd62O7oT4g9ky38V zS5#A1+i2mYJ#8f!8(CCyA}<(!N&Iu^$J<7V$^25GNi=e-RLoJNadQWgG-{)2hM4de zX&Pu|$4F5{t2;bmlL}QSKY7J_PE2ved5kkjqLd_Wz`|6Xvb!Lpor3B9E0 zjbR*ygW`w+y)%4T>v@7tRh)Q0S9E%dZG4Pp(a7iK=}iA|93{oGB`h8-!iq_+J?gH?SlR=c!Mz^BWyS1as zGHrKr#dhA_v>J13me`1BTsLgq&d&Db-|eE6Up8&4ceiD+-A1j-jddu_A4f^;b{iIh z4VLr0$j4o!;yyin`E8=<9PjG#*tt4`(>%vRDTyyXdz!lngw<4@QEI(;qV{{{wnFNJ*EfQzqW0O>0WUQ}oL< zU*vg70Ocp?nYH=Zdr;g3h?jGEZAso2(1e-dxGbssFPU#{1`GKsz5J#X zV$kzAEFP!H*8qL~O|GP8Z_c?=y-O+%s0dgq$8j${`4|;r>Q^34g*r7#vEg-ecbN2? zOQp9&gp0P_QX*c|58Uz#F~!_3mc5_Bu(!k1!*V!krUUw`3w660x(kuwxvA*JfteHB zavfJ0iKYUZ{j9+^Z*6W>8a&PmU=4<^H*+mhg9FX=KL%;P1UwFaD&>Cwl!u;;(i4Ei z0F`-trz+#@JW0NQ{wi?B_V+C6eht`9AKvC+^(LI^cmamB844Wfr|EO9~9NICP>NY<60#}>eGNI!cR^=}*}uBD%CED}dJ zgx)3w^@UG)JBDt@&^u7NlNN5C+#7B&RHV`6tm;MuX0zt@$*C3Pp&H-CHVN*KZ<6-(}tA+XvR3|q>99YCEh+;aaDZlFKkK8~}9iaYYe8~Ts#(7*R^v+f!hcevRqmLG-u*okhX6?f%L zF(Y=SY1N7a@H=t=;9PT)vHg&ZuMlm z&HbBklY~FGN|*lV)(1qx5y*KRFblAdTg;oeH}Hs2Z2GZdbff+IL%BNX28=$TE}34~ zmLc}jJKL(nyZ=u{7q|YXq48&&{ya#>iW}&&9pl9@WohsIKua5THZri#e`a0@^!7bd zC;gUt#)b8NDUbc(me#&2pPspQu=rA0+V6GSeN7@M7I0r9`Sa4TF>_Pd54@Qtx&N6e z7OHSydN4Zp^NaYE@UtDcG~|AFvD408cp#7DOMZUs9n9$ByEy)rz@LCD@Q7G8KTRu1)Fe5A7`R0C+6?o zgj7W4xbx?Xs&7V*`>a+x9@6rZcOgnd7AwJQ+*wrZPzeA%z^}8==`c4P&A^Lp6a0VIFB9wzrYf{UJIj2~{-``6l z@5vN%sd&#N;uJOSIg)-e2*W@Aa>LhOLViv)51-)MInUmbT9ws(!804yMS+Onj7V=V zyeK3);(QpiV_tPS!hDt-^EgF}WkK<|&$hia;sH9gH&1*`U++CWQYAfKV`GVCmZ)|T zHv$`1`!rJ>az2uuvmfhK_c~1NWUEsQuoMdUH=6iJ^}L&~mJNW-fV%-Y;Bmln3{26b z9i7X~O91ta?IfB$2YdxkpQgS=DG9Tu0P^U~M~YS{o=wK{AO^$EXWp8`H7=f3" not in lemma: lemmata.append(lemma.lower()) + elif mode == "enNV": + if "NN" in pos or "VB" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) + elif mode == "enN": + if "NN" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) ## Continue with list of lemmata, but remove undesired leftover words lemmata = ' '.join([word for word in lemmata if word not in stoplist]) lemmata = re.sub("[ ]{1,4}"," ", lemmata) @@ -468,19 +481,67 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors): + + +################################# +# substitute # +################################# + +import csv + +def multipleSubs(substitutionsFile, text): + """Search and replace from a table of string pairs.""" + ## With code from http://stackoverflow.com/users/735204/emmett-j-butler + ## Load table and turn into dict + with open(substitutionsFile, "r") as subsFile: + subs = csv.reader(subsFile) + subsDict = {rows[0]:rows[1] for rows in subs} + for key, value in subsDict.items(): + text = re.sub(key, value, text) + #print(text) + return text + + ## Create a regular expression from the dictionary keys + #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) + ## For each match, look-up corresponding value in dictionary + #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text) + #print(result) + +def substitute(inpath, substitutionsFile, outfolder): + """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional.""" + print("\nLaunched substitute.") + for file in glob.glob(inpath): + with open(file,"r") as text: + text = text.read() + text = multipleSubs(substitutionsFile, text) + basename = os.path.basename(file) + counter = 0 + if " truc " in text or " type " in text or " flic " in text: + counter +=1 + print(counter) + cleanfilename = basename + if not os.path.exists(outfolder): + os.makedirs(outfolder) + with open(os.path.join(outfolder, cleanfilename),"w") as output: + output.write(text) + print("Done.") + + + + + + ################################################################## ### TOPIC MODELLING WITH MALLET ### ################################################################## -# TODO: Concatenate two stoplists first, one for errors, one for deliberate ommissions. - ################################# # call_mallet_import # ################################# -def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_project): +def call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project): """Function to import text data into Mallet.""" print("\nLaunched call_mallet_import.") import subprocess @@ -861,12 +922,21 @@ def get_targetItems(average, targetCategory): #print(targetItems) return(targetItems) -def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): +def get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item): """From average topic score data, select data to be plotted.""" #print(" Getting dataToPlot.") with open(average, "r") as infile: ## Read the average topic score data allData = pd.DataFrame.from_csv(infile, sep=",") + if mode == "normalized": # mean normalization + colmeans = allData.mean(axis=0) + allData = allData / colmeans + elif mode == "zscores": # zscore transformation + colmeans = allData.mean(axis=0) # mean for each topic + allstd = allData.stack().std() #std for entire df + allData = (allData - colmeans) / allstd # = zscore transf. + elif mode == "absolute": # absolute values + allData = allData allData = allData.T ## Add top topic words to table for display later firstWords = get_firstWords(firstWordsFile) @@ -879,15 +949,19 @@ def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): #print(dataToPlot) return dataToPlot -def create_barchart_topTopics(dataToPlot, targetCategory, item, +def create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder): """Function to make a topTopics barchart.""" print(" Creating plot for: "+str(item)) ## Doing the plotting. dataToPlot.plot(kind="bar", legend=None) plt.setp(plt.xticks()[1], rotation=90, fontsize = 11) - plt.title("Top-Topics für: "+str(item), fontsize=15) - plt.ylabel("Scores", fontsize=13) + if mode == "normalized": + plt.title("Top-distinctive Topics für: "+str(item), fontsize=15) + plt.ylabel("normalized scores", fontsize=13) + elif mode == "absolute": + plt.title("Top-wichtigste Topics für: "+str(item), fontsize=15) + plt.ylabel("absolute scores", fontsize=13) plt.xlabel("Topics", fontsize=13) plt.tight_layout() if height != 0: @@ -897,12 +971,12 @@ def create_barchart_topTopics(dataToPlot, targetCategory, item, outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"tT_"+str(item)+".png" + figure_filename = outfolder+"tT_"+mode+"-"+str(item)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, - targetCategories, topTopicsShown, fontscale, + targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder): """For each item in a category, plot the top n topics as a barchart.""" print("Launched plot_topTopics.") @@ -911,8 +985,8 @@ def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, if targetCategory in average: targetItems = get_targetItems(average, targetCategory) for item in targetItems: - dataToPlot = get_dataToPlot(average, firstWordsFile, topTopicsShown, item) - create_barchart_topTopics(dataToPlot, targetCategory, item, fontscale, height, dpi, outfolder) + dataToPlot = get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item) + create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder) print("Done.") @@ -1034,6 +1108,30 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, allScores.index = allScores.index.astype(np.int64) allScores = pd.concat([allScores, firstWords], axis=1, join="inner") #print(allScores) + ## Remove undesired columns: subsubgenre + #allScores = allScores.drop("adventure", axis=1) + #allScores = allScores.drop("autobiographical", axis=1) + #allScores = allScores.drop("blanche", axis=1) + #allScores = allScores.drop("education", axis=1) + #allScores = allScores.drop("fantastic", axis=1) + #allScores = allScores.drop("fantastique", axis=1) + #allScores = allScores.drop("historical", axis=1) + #allScores = allScores.drop("n.av.", axis=1) + #allScores = allScores.drop("nouveau-roman", axis=1) + #allScores = allScores.drop("sciencefiction", axis=1) + #allScores = allScores.drop("social", axis=1) + #allScores = allScores.drop("other", axis=1) + #allScores = allScores.drop("espionnage", axis=1) + #allScores = allScores.drop("thriller", axis=1) + #allScores = allScores.drop("neopolar", axis=1) + ## Remove undesired columns: protagonist-policier + #allScores = allScores.drop("crminal", axis=1) + #allScores = allScores.drop("mixed", axis=1) + #allScores = allScores.drop("witness", axis=1) + #allScores = allScores.drop("criminel", axis=1) + #allScores = allScores.drop("detection", axis=1) + #allScores = allScores.drop("victime", axis=1) + #allScores = allScores.drop("n.av.", axis=1) ## Sort by standard deviation standardDeviations = allScores.std(axis=1) standardDeviations.name = "std" @@ -1065,14 +1163,14 @@ def create_distinctiveness_heatmap(dataToPlot, # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd plt.title("Verteilung der Topic Scores", fontsize=20) plt.xlabel(targetCategory, fontsize=16) - plt.ylabel("Top topics (stdev)", fontsize=16) - plt.setp(plt.xticks()[1], rotation=90, fontsize = 12) + plt.ylabel("Top topics (stdev)", fontsize=14) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 14) plt.tight_layout() ## Saving the plot to disk. if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".png" + figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".jpg" plt.savefig(figure_filename, dpi=dpi) plt.close() @@ -1302,6 +1400,75 @@ def build_itemScoreMatrix(averageDatasets, targetCategory, itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] itemScoreMatrix = itemScoreMatrix.T + itemScoreMatrix = itemScoreMatrix.drop("Allais", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Audoux", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Barbara", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Barjavel", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Beckett", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bernanos", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bosco", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bourget", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Butor", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Camus", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Carco", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Celine", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Colette", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Darien", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Daudet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Delly", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Dombre", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Duras", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("ErckChat", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("FevalPP", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("MduGard", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Mirbeau", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Ohnet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Perec", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Proust", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Queneau", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Rodenbach", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Rolland", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Roussel", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("SaintExupery", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Sand", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Aimard", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("AimardAuriac", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Balzac", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bon", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Echenoz", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Flaubert", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Fleuriot", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("France", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Galopin", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gary", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GaryAjar", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GaryBogat", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GarySinibaldi", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gautier", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Giono", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gouraud", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Huysmans", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Hugo", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeClezio", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Loti", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Malot", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Mary", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Maupassant", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Modiano", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("RobbeGrillet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Stolz", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Sue", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Tournier", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Verne", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Vian", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("VianSullivan", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Zola", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Malraux", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Simon", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeRouge", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeRougeGuitton", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Toussaint", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Khadra", axis=0) #print(itemScoreMatrix) return itemScoreMatrix @@ -1315,10 +1482,10 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, ## Plot the distance matrix as a dendrogram plt.figure(figsize=figsize) # TODO: this could be a a parameter. itemLabels = itemScoreMatrix.index.values - sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="right") + sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="top") ## Format items labels to x-axis tick labels - plt.setp(plt.xticks()[1], rotation=90, fontsize = 10) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 14) plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20) plt.ylabel("Distance", fontsize=16) plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16) @@ -1328,7 +1495,7 @@ def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, print("- saving image file.") if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+str(topicsPerItem)+"topics"+"-"+sortingCriterium+".svg" + figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+sortingCriterium+"-"+str(topicsPerItem)+"topics"+".jpg" plt.savefig(outfolder + figure_filename, dpi=600) plt.close() diff --git a/tmw_config.py b/tmw_config.py index e31bfc7..bc557bf 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -31,13 +31,13 @@ ### The following settings depend on the system used. ### Path to the working directory. -wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf10/" # end with slash. +wdir = "/home/" # end with slash. ### Path to the TreeTagger file (language-dependent!) -tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french" +tagger = "/home/[USER]/Programs/TreeTagger/cmd/tree-tagger-french" ### Path to Mallet installation directory -mallet_path = "/home/christof/Programs/Mallet/bin/mallet" +mallet_path = "/home/[USER]/Programs/Mallet/bin/mallet" ### Path to the font for wordle generation -font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" +font_path = "/home/[USER]/.fonts/AlegreyaSans-Regular.otf" import tmw #print(help(topmod)) @@ -58,7 +58,7 @@ ### Split entire texts into smaller segments. inpath = wdir + "1_txt/*.txt" outfolder = wdir + "2_segs/" -target = 2000 +target = 600 sizetolerancefactor = 1.1 preserveparagraphs = True #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) @@ -67,7 +67,7 @@ ### Assign each segment to one bin over textual progression. inpath = wdir + "2_segs/*.txt" outfolder = wdir + "7_aggregates/" -binsnb = 5 # number of bins +binsnb = 3 # number of bins #tmw.segments_to_bins(inpath,outfolder, binsnb) ### pretokenize @@ -79,7 +79,7 @@ ### call_treetagger ### Perform lemmatization and POS tagging. -infolder = wdir + "3_tokens/" +infolder = wdir + "2_segs/" outfolder = wdir + "4_tagged/" tagger = tagger #tmw.call_treetagger(infolder, outfolder, tagger) @@ -89,9 +89,15 @@ inpath = wdir + "4_tagged/*.trt" outfolder = wdir + "5_lemmata/" mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs -stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # in tmw folder +stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # wdir #tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors) +### substitute +### Perform some preliminary tokenization. +inpath = wdir + "5_lemmata/*.txt" +outfolder = wdir + "5_substituted/" +substitutionsFile = wdir+"extras/fr_argot-substitutions.csv" +#tmw.substitute(inpath, substitutionsFile, outfolder) ################################ @@ -101,7 +107,7 @@ ### call_mallet_import ### Imports text data into the Mallet corpus format. mallet_path = mallet_path -infolder = wdir + "5_lemmata/" +infolder = wdir + "5_substituted/" outfolder = wdir + "6_mallet/" outfile = outfolder + "corpus.mallet" stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder @@ -112,9 +118,9 @@ mallet_path = mallet_path inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" -numOfTopics = "50" # string +numOfTopics = "250" # string optimize_interval = "100" # string -num_iterations = "1000" # string +num_iterations = "5000" # string num_top_words = "100" # string doc_topics_max = numOfTopics num_threads = "4" # string @@ -129,21 +135,22 @@ ### create_mastermatrix ### Creates a matrix with all information (metadata and topic scores for ### each segment) in one place. -corpuspath = wdir+"/2_segs/*.txt" +corpuspath = wdir+"2_segs/*.txt" outfolder = wdir+"7_aggregates/" mastermatrixfile = "mastermatrix.csv" -metadatafile = wdir+"/metadata.csv" -topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" +metadatafile = wdir+"metadata.csv" +topics_in_texts = wdir+"6_mallet/topics-in-texts.csv" numOfTopics = int(numOfTopics) useBins = True # True|False binDataFile = wdir+"7_aggregates/segs-and-bins.csv" -#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile) +###tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" -targets = ["author", "subgenre", "binID", "decade"] +targets = ["segmentID"] +#targets = ["subgenre", "author-name", "subsubgenre","decade", "narration", "setting", "author-gender", "title", "protagonist-policier"] #targets = ["author", "author-gender", "title", "decade", "subgenre", # "idno", "segmentID", "narration", "protagonist-policier", "binID"] #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) @@ -153,7 +160,7 @@ mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" targets = ["decade", "binID"] # 2 targets to combine -tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) +#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) ### save_firstWords ### Saves the first words of each topic to a separate file. @@ -172,11 +179,11 @@ ### Creates a wordle for each topic. word_weights_file = wdir+"6_mallet/" + "word-weights.txt" numOfTopics = numOfTopics -words = 40 +words = 30 outfolder = wdir+"8_visuals/wordles/" font_path = font_path dpi = 300 -tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) +#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) ### crop_images ### Optional. Crops the wordle image files. @@ -192,14 +199,15 @@ ### For each item from a category, creates a barchart of the top topics. averageDatasets = wdir+"7_aggregates/avg*.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" -targetCategories = ["author", "subgenre", "binID"] -topTopicsShown = 30 +targetCategories = ["title"] +topTopicsShown = 16 numOfTopics = numOfTopics fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 +mode = "normalized" #normalized|zscores|absolute outfolder = wdir+"/8_visuals/topTopics/" -#tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder) +tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder) ### plot_topItems ### ### For each topic, creates a barchart with top items from a category. @@ -207,8 +215,8 @@ outfolder = wdir+"8_visuals/topItems/" firstWordsFile = wdir+"7_aggregates/firstWords.csv" numOfTopics = numOfTopics # must be actual number of topics modeled. -targetCategories = ["author", "subgenre", "binID"] -topItemsShown = 30 +targetCategories = ["segmentID"] +topItemsShown = 20 fontscale = 0.8 height = 0 # 0=automatic and flexible dpi = 300 @@ -225,7 +233,7 @@ averageDatasets = wdir+"7_aggregates/avg*.csv" firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/distinctiveness/" -targetCategories = ["subgenre"] +targetCategories = ["protagonist-policier"] numOfTopics = numOfTopics # actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 @@ -242,7 +250,7 @@ dpi = 300 height = 0 # for lineplot; 0=automatic mode = "line" # area|line for areaplot or lineplot -topics = ["25", "44"] # list of one or several topics +topics = ["190", "6"] # list of one or several topics #tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) ### topicClustering ### @@ -257,12 +265,12 @@ ### itemClustering ### # This function creates a dendrogram of items in a category (authors, titles). -averageDatasets = wdir+"7_aggregates/avg*author.csv" -figsize = (10,80) # width,height +averageDatasets = wdir+"7_aggregates/avg*.csv" +figsize = (15,10) # width,height outfolder = wdir+"8_visuals/clustering/" -topicsPerItem = 40 # can be set -sortingCriterium = "std" # std|mean -targetCategories = ["author"] # list +topicsPerItem = 50 # can be set +sortingCriterium = "mean" # std|mean +targetCategories = ["author-name"] # list methods=["weighted"] # list metrics=["cosine"] # list #tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) @@ -302,7 +310,7 @@ ### 5c show segment ## To read a specific segment, better than looking in the folder. -segmentID = "rf0166§0118" # indicate here, manually +segmentID = "rf1246§0048" # indicate here, manually outfolder = wdir+"/9_sel-segs/" #tmw.show_segment(wdir,segmentID, outfolder) From cf3c02cb30c264f85a2dbc618670221bef2df9f5 Mon Sep 17 00:00:00 2001 From: christofs Date: Mon, 19 Oct 2015 15:30:57 +0200 Subject: [PATCH 51/56] Add normalization to heatmap. Fix https://github.com/cligs/tmw/issues/18. --- tmw.py | 25 ++++++++++++++++++------- tmw_config.py | 3 ++- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/tmw.py b/tmw.py index feaf48f..34e72f4 100644 --- a/tmw.py +++ b/tmw.py @@ -1095,13 +1095,22 @@ def get_heatmap_firstWords(firstWordsFile): #print(firstWords) return(firstWords) -def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, +def get_heatmap_dataToPlot(average, mode, firstWordsFile, topTopicsShown, numOfTopics): """From average topic score data, select data to be plotted.""" print("- getting dataToPlot...") with open(average, "r") as infile: ## Read the average topic score data allScores = pd.DataFrame.from_csv(infile, sep=",") + if mode == "normalized": # mean normalization + colmeans = allScores.mean(axis=0) + allScores = allScores / colmeans + elif mode == "zscores": # zscore transformation + colmeans = allScores.mean(axis=0) # mean for each topic + allstd = allScores.stack().std() #std for entire df + allScores = (allScores - colmeans) / allstd # = zscore transf. + elif mode == "absolute": # absolute values + allScores = allScores allScores = allScores.T ## Add top topic words to table for display later firstWords = get_heatmap_firstWords(firstWordsFile) @@ -1154,6 +1163,7 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, def create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, + mode, fontscale, dpi, outfolder): @@ -1163,21 +1173,20 @@ def create_distinctiveness_heatmap(dataToPlot, # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd plt.title("Verteilung der Topic Scores", fontsize=20) plt.xlabel(targetCategory, fontsize=16) - plt.ylabel("Top topics (stdev)", fontsize=14) - plt.setp(plt.xticks()[1], rotation=90, fontsize = 14) + plt.ylabel("Top topics (stdev)", fontsize=16) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 12) plt.tight_layout() ## Saving the plot to disk. if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".jpg" + figure_filename = outfolder+"dist-heatmap_"+mode+"-by-"+str(targetCategory)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() - - def plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, + mode, outfolder, targetCategories, numOfTopics, @@ -1190,13 +1199,15 @@ def plot_distinctiveness_heatmap(averageDatasets, for targetCategory in targetCategories: if targetCategory in average and targetCategory != "segmentID": print("- working on: "+targetCategory) - dataToPlot = get_heatmap_dataToPlot(average, + dataToPlot = get_heatmap_dataToPlot(average, + mode, firstWordsFile, topTopicsShown, numOfTopics) create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, + mode, fontscale, dpi, outfolder) diff --git a/tmw_config.py b/tmw_config.py index bc557bf..1cc4cc1 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -234,11 +234,12 @@ firstWordsFile = wdir+"7_aggregates/firstWords.csv" outfolder = wdir+"8_visuals/distinctiveness/" targetCategories = ["protagonist-policier"] +mode = "zscores" #normalized|zscores|absolute numOfTopics = numOfTopics # actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 -#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi) +#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, mode, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi) ### plot_topicsOverTime ### ### From 86a400f048401b933efa32f2272783b0648d0f97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= Date: Wed, 21 Oct 2015 15:42:28 +0200 Subject: [PATCH 52/56] make_wordle: add font_path; Fixes https://github.com/cligs/tmw/issues/14 --- tmw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index 34e72f4..e1e0823 100644 --- a/tmw.py +++ b/tmw.py @@ -846,7 +846,7 @@ def get_wordlewords(words, word_weights_file, topic): wordlewords = wordlewords + ((word + " ") * score) return wordlewords -def get_color_scale(word, font_size, position, orientation, random_state=None): +def get_color_scale(word, font_size, position, orientation, font_path, random_state=None): """ Create color scheme for wordle.""" return "hsl(245, 58%, 25%)" # Default. Uniform dark blue. #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. From d6db645490117ea729eb7f5689d8e2065d52c7cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= Date: Wed, 21 Oct 2015 16:41:59 +0200 Subject: [PATCH 53/56] Added function save_topicRank --- tmw.py | 24 ++++++++++++++++++++++++ tmw_config.py | 9 ++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index e1e0823..3d5ef97 100644 --- a/tmw.py +++ b/tmw.py @@ -808,6 +808,30 @@ def save_firstWords(topicWordFile, outfolder, filename): print("Done.") +################################# +# save_topicRanks # +################################# + +def save_topicRanks(topicWordFile, outfolder, filename): + """Save a list of topics with their rank by topic score.""" + print("Launched save_topicRanks.") + with open(topicWordFile, "r") as infile: + topicRanks = pd.read_csv(infile, sep="\t", header=None) + topicRanks = topicRanks.drop(2, axis=1) + topicRanks.rename(columns={0:"Number"}, inplace=True) + topicRanks.rename(columns={1:"Score"}, inplace=True) + #topicRanks.sort(columns=["Score"], ascending=False, inplace=True) + topicRanks["Rank"] = topicRanks["Score"].rank(ascending=False) + #print(topicRanks.head()) + ## Saving the file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + outfile = outfolder + filename + with open(outfile, "w") as outfile: + topicRanks.to_csv(outfile) + print("Done.") + + ################################################################## ### VISUALIZATION ### diff --git a/tmw_config.py b/tmw_config.py index 1cc4cc1..86ab0b5 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -169,6 +169,13 @@ filename = "firstWords.csv" #tmw.save_firstWords(topicWordFile, outfolder, filename) +### save_topicRanks +### Saves the rank (in the overall scores) of each topic to a separate file. +topicWordFile = wdir+"6_mallet/topics-with-words.csv" +outfolder = wdir+"7_aggregates/" +filename = "topicRanks.csv" +tmw.save_topicRanks(topicWordFile, outfolder, filename) + ################################ @@ -324,4 +331,4 @@ targetCategories = ["subgenre"] # list methods=["weighted"] # list metrics=["cosine"] # list -#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder) \ No newline at end of file +#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder) From d50afaefabe280fbd57b8fd5d85e8dcb9bc5d0b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= Date: Wed, 21 Oct 2015 16:44:34 +0200 Subject: [PATCH 54/56] make_wordle: added topic rank info to figure title --- tmw.py | 18 ++++++++++++++---- tmw_config.py | 5 +++-- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/tmw.py b/tmw.py index 3d5ef97..ba33d45 100644 --- a/tmw.py +++ b/tmw.py @@ -876,21 +876,31 @@ def get_color_scale(word, font_size, position, orientation, font_path, random_st #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background +def get_topicRank(topic, topicRanksFile): + #print("getting topic rank.") + with open(topicRanksFile, "r") as infile: + topicRanks = pd.read_csv(infile, sep=",", index_col=0) + rank = int(topicRanks.iloc[topic]["Rank"]) + return rank + + def make_wordle_from_mallet(word_weights_file, - numOfTopics,words,outfolder, + numOfTopics,words,outfolder, + topicRanksFile, font_path, dpi): """Generate wordles from Mallet output, using the wordcloud module.""" print("\nLaunched make_wordle_from_mallet.") for topic in range(0,numOfTopics): ## Gets the text for one topic. text = get_wordlewords(words, word_weights_file, topic) - wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) + wordcloud = WordCloud(font_path=font_path, width=600, height=400, background_color="white", margin=4).generate(text) default_colors = wordcloud.to_array() - figure_title = "topic "+ str(topic) + rank = get_topicRank(topic, topicRanksFile) + figure_title = "topic "+ str(topic) + " ("+str(rank)+"/"+str(numOfTopics)+")" plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) - plt.title(figure_title, fontsize=24) + plt.title(figure_title, fontsize=28) plt.axis("off") ## Saving the image file. diff --git a/tmw_config.py b/tmw_config.py index 86ab0b5..d581ab8 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -185,12 +185,13 @@ ### make_wordle_from_mallet ### Creates a wordle for each topic. word_weights_file = wdir+"6_mallet/" + "word-weights.txt" +topicRanksFile = wdir + "7_aggregates/" + "topicRanks.csv" numOfTopics = numOfTopics -words = 30 +words = 40 outfolder = wdir+"8_visuals/wordles/" font_path = font_path dpi = 300 -#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics,words,outfolder,font_path,dpi) +#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics, words,outfolder, topicRanksFile, font_path,dpi) ### crop_images ### Optional. Crops the wordle image files. From f88f68a362d57e9dce2ce7d126572d39dd21be36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christof=20Sch=C3=B6ch?= Date: Wed, 21 Oct 2015 16:45:45 +0200 Subject: [PATCH 55/56] make_wordle: increase title font size; Fixes https://github.com/cligs/tmw/issues/19 --- tmw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tmw.py b/tmw.py index ba33d45..991e79b 100644 --- a/tmw.py +++ b/tmw.py @@ -900,7 +900,7 @@ def make_wordle_from_mallet(word_weights_file, plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) - plt.title(figure_title, fontsize=28) + plt.title(figure_title, fontsize=30) plt.axis("off") ## Saving the image file. From 90020188d2dd97dc57f2ba57bcb082d098651e06 Mon Sep 17 00:00:00 2001 From: Ulrike Henny Date: Thu, 3 Mar 2016 11:42:43 +0100 Subject: [PATCH 56/56] Modul aktualisiert --- tmw.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tmw.py b/tmw.py index 991e79b..d71a17d 100644 --- a/tmw.py +++ b/tmw.py @@ -966,9 +966,10 @@ def get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item): colmeans = allData.mean(axis=0) allData = allData / colmeans elif mode == "zscores": # zscore transformation - colmeans = allData.mean(axis=0) # mean for each topic - allstd = allData.stack().std() #std for entire df - allData = (allData - colmeans) / allstd # = zscore transf. + colmeans = allData.mean(axis=0) # ??? + colstd = allData.std(axis=0) #std for each topic + allData = (allData - colmeans) / colstd # = zscore transf. + elif mode == "absolute": # absolute values allData = allData allData = allData.T @@ -1141,7 +1142,7 @@ def get_heatmap_dataToPlot(average, mode, firstWordsFile, topTopicsShown, allScores = allScores / colmeans elif mode == "zscores": # zscore transformation colmeans = allScores.mean(axis=0) # mean for each topic - allstd = allScores.stack().std() #std for entire df + allstd = allScores.std(axis=0) #std for entire df allScores = (allScores - colmeans) / allstd # = zscore transf. elif mode == "absolute": # absolute values allScores = allScores @@ -1203,7 +1204,7 @@ def create_distinctiveness_heatmap(dataToPlot, outfolder): print("- doing the plotting...") sns.set_context("poster", font_scale=fontscale) - sns.heatmap(dataToPlot, annot=False, cmap="YlOrRd", square=False) + sns.heatmap(dataToPlot, annot=False, cmap="RdBu_r", square=False) # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd plt.title("Verteilung der Topic Scores", fontsize=20) plt.xlabel(targetCategory, fontsize=16) @@ -1445,6 +1446,7 @@ def build_itemScoreMatrix(averageDatasets, targetCategory, itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] itemScoreMatrix = itemScoreMatrix.T + """ itemScoreMatrix = itemScoreMatrix.drop("Allais", axis=0) itemScoreMatrix = itemScoreMatrix.drop("Audoux", axis=0) itemScoreMatrix = itemScoreMatrix.drop("Barbara", axis=0) @@ -1514,6 +1516,7 @@ def build_itemScoreMatrix(averageDatasets, targetCategory, itemScoreMatrix = itemScoreMatrix.drop("LeRougeGuitton", axis=0) itemScoreMatrix = itemScoreMatrix.drop("Toussaint", axis=0) itemScoreMatrix = itemScoreMatrix.drop("Khadra", axis=0) + """ #print(itemScoreMatrix) return itemScoreMatrix