diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 80f234c..229b7df 100644 Binary files a/__pycache__/tmw.cpython-34.pyc and b/__pycache__/tmw.cpython-34.pyc differ diff --git a/extras/fr_pretokenize_subs.csv b/extras/fr_pretokenize_subs.csv deleted file mode 100644 index 273c5a8..0000000 --- a/extras/fr_pretokenize_subs.csv +++ /dev/null @@ -1,157 +0,0 @@ -"string§To§Find","string§To§Replace" -’,' -J,"Je " -qu'elle,que elle -"’","'" -"J'","Je " -"j'","je " -"S'","Se " -"s'","se " -"C'","Ce " -"c'","ce " -"N'","Ne " -"n'","ne " -"D'","De " -"d'","de " -"L'","Le " -"l'","la " -"T'","tu " -"t'","tu " -"-le"," le" -"-moi"," moi" -"m'","me " -"M'","Me " -"-je"," je" -"-il"," il" -"-on"," on" -"-lui"," lui" -"-elle"," elle" -"-nous"," nous" -"-vous"," vous" -"-nous"," nous" -"-ce"," ce" -"-tu"," tu" -"-toi"," toi" -"jusqu'à'","jusque à" -"aujourd'hui","aujourdhui" -"-t","" -"-y"," y" -"-en"," en" -"-ci"," ci" -"-là"," là" -"Qu'","Que " -"qu'","que " -"-même"," même" -" Il "," il " -" Ils "," ils " -" Elles "," elles " -" Elle "," elle " -" Je "," je " -" Tu "," tu " -" Toi "," toi " -" Nous "," nous " -" Vous "," vous " -" Mais "," mais " -" Ne "," ne " -" Et "," et " -" Pourquoi "," pourquoi " -" Alors "," alors " -" Aussi "," aussi " -" Car "," car " -" Au "," au " -" Ses "," ses " -" Se "," se " -" Moi "," moi " -" Toute "," toute " -" Tout "," tout " -" Hier "," hier " -" Non "," non " -" Comme "," comme " -" Dans "," dans " -" Pour "," pour " -" Voilà "," voilà " -" Son "," son " -" Une "," une " -" Un "," un " -" Où "," où " -" De "," de " -" Qui "," qui " -" Depuis "," depuis " -" Ça "," ça " -" Sur "," sur " -" Ensuite "," ensuite " -" Puis "," puis " -" On "," on " -" Si "," si " -" Même "," même " -" Toutefois "," toutefois " -" Ainsi "," ainsi " -" Aucun "," aucun " -" Ce "," ce " -" Ces "," ces " -" Toutes "," toutes " -" En "," en " -" Après "," après " -" Quel "," quel " -" Quelle "," quelle " -" Quand "," quand " -" Celle 
"," celle " -" Puisque "," puisque " -" Tous "," tous " -" Dès "," dès " -" Cet "," cet " -" Lorsque "," lorsque " -" Lui "," lui " -" Sauf "," sauf " -" Moins "," moins " -" Encore "," encore " -" Cependant "," cependant " -" Comment "," comment " -" Assez "," assez " -" Ma "," ma " -" Quelques "," quelques " -" Leurs "," leurs " -" Ceux "," ceux " -" Par "," par " -" Devant "," devant " -" Bien "," bien " -" Personne "," personne " -" Près "," près " -" Avant "," avant " -" Rien "," rien " -" Partout "," partout " -" Pourtant "," pourtant " -" Déjà "," déjà " -" Enfin "," enfin " -" Maintenant "," maintenant " -" Quoi "," quoi " -" Eh "," eh " -" Ah "," ah " -" Oh "," oh " -" Jamais "," jamais " -" Mon "," mon " -" Cela "," cela " -" Du "," du " -" Oui "," oui " -" Ou "," ou " -" Sa "," sa " -" Celui "," celui " -" Cette "," cette " -" Des "," des " -" Naturellement "," naturellement " -" Sans "," sans " -" Vos "," vos " -" Votre "," votre " -" Notre "," notre " -" Peut-être "," peut-être " -" Mes "," mes " -" Celle "," celle " -" Tant "," tant " -" Demain "," demain " -" Qu "," que " -" qu "," que " -" quelqu "," quelque " -" jusqu "," jusque " -" Jusqu "," jusque " -" aujourd hui "," aujourd'hui " -" "," " diff --git a/extras/fr_stopwords_errors.txt b/extras/fr_stopwords_errors.txt deleted file mode 100644 index 331dfad..0000000 --- a/extras/fr_stopwords_errors.txt +++ /dev/null @@ -1,738 +0,0 @@ -a -à -abord -aboutissant -achille -adieu -afin -aglaé -aglante -ah -ahi -ai -aidant -aie -ai-je -ailler -ailleurs -ainsi -ais -aise -aise -al -alexandre -aller -alors -angélique -annibal -après -arlequin -arrivant -assez -assurément -as-tu -a-t-elle -a-t-il -a-t-on -attends -atys -au -aucun -aucune -aucuns -aujour -aujourd -aujourd'hui -auprès -aussi -aussitôt -autant -autre -autrement -autres -aux -avant -avec -avecque -avez-vous -avoir -avoir -baccarat -bailli -bajazet -barbier -bas -bazile -beaucoup -bégayait -bel -ben -bérénice -bian -biau -bien -bientôt -bizarre 
-blaise -bon -bonne -bous -bout -brousse -brute -c -ça -çà -cab -calo -canadien -capucin -car -cassandre -caton -ce -cé -ceci -cela -celle -celles -celui -cent -cent -cents -cents -cependant -certain -ces -ces -césar -cesse -cesse -cet -cette -ceux -chacun -chaque -chatouilleuse -che -chère -cheux -chez -chourineur -ci -cinq -cinquante -claudine -clémence -colette -colin -combien -comme -comment -contre -courant -crois-moi -croyez-moi -croyez-vous -cru -crus -crût -cynthia -d -da -d'abord -d'ailleurs -damis -dan -dans -daphné -davantage -de -dé -debout -début -dedans -dehors -déjà -demain -depuis -dernier -dernière -des -dès -descendant -désormais -dessus -deux -devant -di -dire -dis-je -dis-moi -dis-tu -dites-moi -dites-vous -dit-il -dix -dix-huit -dix-neuf -dix-sept -do -dois-je -dom -donc -dont -dorante -dos -douze -drès -droite -du -dur -écoutez-moi -effraya -effrayait -effrayant -effrayé -effrayée -effrayer -effrayés -effrayons -eh -élise -elle -elle-même -elles -elles-mêmes -embrun -émeri -en -encor -encore -enfin -ensemble -entendant -entier -entrait -entre -essai -essaya -essayai -essayais -essayait -essayant -essayé -essayer -essayez -est -est-ce -est-elle -est-il -es-tu -et -étai -état -été -êtes-vous -être -êtres -eu -eun -eune -eus -eûs -eut -eût -eux -eux-mêmes -fa -fade -faible -faire -faites-vous -falloir -fatmé -faut-il -fi -figaro -fil -fin -fis -fit -fois -folles -force -fort -fossinde -frontin -fur -fût -gauche -gerfaut -gille -gilles -glisse -goualeuse -grave -gris -guère -guise -ha -haut -hé -hélas -heureux -hi -hier -hippolyte -ho -holà -homme-là -hors -hui -huit -hylas -i -ici -ii -il -ils -in -indifférent -indispensable -insu -irai -isabelle -isolé -itou -j -jamais -jason -jaune -je -jé -jean -jeté -jj -joyeux -juan -jugé -jusq -jusqu -jusque -jusques -juste -justement -l -la -là -là-bas -là-dedans -là-dessus -laisse-moi -laisser -laissez-moi -large -le -lé -léandre -les -leur -leurs -levant -levé -levé -li -lire -lis -lisai -lisaient 
-lisais -lisait -lisant -lisette -lisez -lisons -loin -long -longtemps -lors -lorsq -lorsque -lucas -lucile -lui -lui-même -lut -ly -m -ma -mac -magnier -maintenant -mais -mal -malgré -manière -manqué -margot -marie -marié -marmouset -marton -mathurin -mauvais -me -mé -méchant -même -mêmes -mêmes -ménandre -mes -mettre -mi -mien -mienne -miens -miens -mieux -mille -mille -mine -mis -mise -moderne -moi -moi-même -moins -mon -monsieu -monsir -morgué -mot -moue -moujik -muet -muet -muette -n -ne -né -nérine -ni -no -noir -nommés -non -nos -notre -nôtre -nous -noute -nouveaux -nu -nue -nul -oh -on -ons -ont -onze -oronte -ose -ou -où -oublie -oui -ous -ouvrait -ouvrant -ouvrons -palmure -palsangué -paquier -par -parbleu -parce -pareil -parfait -pargué -parler -parmi -parole -parsonne -part -parton -partout -paru -pas -paya -paya -payai -payais -payait -payât -payé -payées -payer -payés -payez -payons -peignait -pendant -pensé -per -perdre -personne -personnes -peu -peut-être -peut-il -peut-on -pierre -pis -pleurer -plu -plupart -plus -plusieurs -plut -plutôt -point -pompée -porter -possible -pou -pour -pourquoi -pourtant -pourvu -poussa -pouvez-vous -pouvoir -premier -première -prendre -près -presque -priant -prie -pris -prise -promptement -puis -puis-je -puisq -puisque -pût -pyrrhus -qu -quaker -quakeresse -quand -quant -quarante -quatorze -quatre -quatre-vingt -quatre-vingt-dix -que -qué -quel -quelle -quelles -quelq -quelque -quelque -quelquefois -quelques -quelqu'un -quels -queu -queuque -qui -quinze -quinze -quoi -quoiq -quoique -raide -ram -ramené -ramener -ramener -reçois -refait -rendre -rendre -rénine -rentré -reste -rester -rian -riche -rien -rocambole -rosine -rouge -s -sa -sachem -sais-tu -sanche -sans -saurer -savez-vous -savoir -scipion -se -second -seconde -secouée -seize -selon -semble -sembler -sen -serra -ses -seul -seule -seulement -si -sien -sienne -signifiant -sis -sitot -six -soi-même -soixante -soixante-dix -sommes -son -songez -sont -sophie 
-sortant -sortir -soudain -sous -soutenant -souvent -ste -sti -suis -suis -suivre -sujet -sur -sûr -surtout -sylla -sylvanire -t -ta -taisez-vous -tandis -tant -tantôt -tartarin -tatigué -te -té -tel -téléga -telle -tellement -tels -tenez -tenir -tenons -tente -terrier -tes -thésée -tien -tienne -tiennent -tiens -tient -timar -tirant -tirer -tirinte -tirsis -toi -toi-même -tom -tombée -ton -tôt -toujours -tous -tout -toute -toutefois -toutes -travers -treize -tremblant -tremblante -tremble -trente -très -trois -trop -trouvé -trouver -tu -tullie -turc -un -une -unknown - -ur -ursuline -utile -v -vanda -vela -velà -venir -venu -vers -veux-tu -vi -viant -vingt -vint -vis -vit -vite -vivant -vla -vlà -voici -voilà -voir -vois-je -vois-je -vois-tu -voit -vont -vos -votre -vôtre -vouloir -vous -vous-même -voute -voyant -vraiment -vue -waterproof -y -zoé diff --git a/extras/fr_stopwords_project.txt b/extras/fr_stopwords_project.txt deleted file mode 100644 index 1a3f48d..0000000 --- a/extras/fr_stopwords_project.txt +++ /dev/null @@ -1,55 +0,0 @@ -air -ais -an -année -bras -brousse -chose -chott -côté -coup -doute -effet -état -été -façon -fait -femme -fois -fond -genre -gens -heure -homme -instant -jour -lieu -main -mal -mètre -milieu -moment -monde -nom -nu -nue -oeil -œil -parole -pas -peine -personne -personnes -petit -pied -place -sens -sorte -suite -temps -tête -tour -travers -un -vieux -voix diff --git a/tmw.py b/tmw.py index f84e357..d71a17d 100644 --- a/tmw.py +++ b/tmw.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- # Filename: tmw.py # Author: #cf @@ -79,6 +80,13 @@ def tei5reader_fulldocs(inpath, outfolder): print("Done.") +# Utility function for writing segments +def writesegment(segment, outfolder, filename, counter, mode="w"): + from os.path import join + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + with open(segname, mode) as output: + output.write(' '.join(segment)) + output.close() 
################################# # segmenter # @@ -100,25 +108,30 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve from os.path import join global currentsegmentsize global counter + # ignore empty segments if segment == ["\n"] or len(segment) < 1: return + # workaround for easy inter line-spacing in case of paragraph removal for lines combined into one segment if not preserveparagraphs and segment[-1] == "\n": segment = segment[0:len(segment) - 1] segment[-1] += " " segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") relname = filename + "§{:04d}".format(counter) + ".txt" + # case: last segment is too small => fill with (slice of) new segment if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split #split segment wordsliceindex = target - currentsegmentsize + # if it's too big: slice! if currentsegmentsize + len(segment) > target * tolerancefactor: - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(wordsliceindex) + "\t for a total of " + str((currentsegmentsize + wordsliceindex))) + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(wordsliceindex) + "\t for a total of " + str((currentsegmentsize + wordsliceindex))) write(segment[0:wordsliceindex], segname, "a") currentsegmentsize += wordsliceindex segment = segment[wordsliceindex:len(segment)] + # segment is filled. 
continue with next one counter += 1 currentsegmentsize = 0 @@ -128,18 +141,20 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve os.remove(segname) # else just add text to current segment else: - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) # segment fits so append write(segment, segname, "a") currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account! # done return + # case: new segment is too big # if segment > target: slice segment while len(segment) > target * tolerancefactor: - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(target) + "\t for a total of " + str((currentsegmentsize + target))) + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(target) + "\t for a total of " + str((currentsegmentsize + target))) write(segment[0:target], segname) segment = segment[target:len(segment)] + # segment is filled. continue with next one counter += 1 currentsegmentsize = 0 @@ -147,12 +162,15 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve relname = filename + "§{:04d}".format(counter) + ".txt" if os.path.isfile(segname): os.remove(segname) - #print(relname + "\t New segment with size \t0") + print(relname + "\t New segment with size \t0") + # now size of segment is < target if (len(segment) == 0): #segment was perfectly sliced so we are done return + # there's some part of segment left, write this into file + # if the remaining part is exceeding current segment's capacity start new segment if currentsegmentsize + len(segment) > target * tolerancefactor: # segment is filled. 
continue with next one @@ -162,17 +180,23 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve relname = filename + "§{:04d}".format(counter) + ".txt" if os.path.isfile(segname): os.remove(segname) - #print(relname + "\t New segment with size \t0") - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) + + print(relname + "\t New segment with size \t0") + + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account! write(segment, segname, "a") -def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs = False): - """Script for turning plain text files into equal-sized segments, with limited respect for paragraph boundaries.""" - print("\nLaunched segmenter.") +def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs): + """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" + print("\nLaunched segmenter.") + import os + import re + from os import listdir from os.path import join from nltk.tokenize import word_tokenize + import glob if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -192,13 +216,15 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs os.remove(segname) # segment contains words assigned to the current segment segment = [] - # go thru paragraphs one by one + + # go through paragraphs one by one for line in infile: text = line - # remove special characters and space-chains - text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text) - text = re.sub("-", " ", text) + # (optional) remove punctuation, special characters and space-chains + #text = 
re.sub("[,;\.:!?¿\(\)—-]", " ", text) + text = re.sub("[\t\r\n\v\f]", " ", text) text = re.sub("[ ]{1,9}", " ", text) + # tokenize text words = word_tokenize(text) words.append("\n") @@ -211,7 +237,102 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs # Binning # ################################# -# TODO: Rewrite entirely to make compatible with mastermatrix. +def segments_to_bins(inpath, outfolder, binsnb): + """Script for sorting text segments into bins.""" + print("\nLaunched segments_to_bins.") + + import math, sys + import os + import glob + from collections import Counter + import pandas as pd + + ### Define various objects for later use. + txtids = [] + segids = [] + + filenames = [] + binids = [] + + offset = sys.maxsize # used to track wrong segmenting (i.e. with segment numbering not starting with 0) + + ### Get filenames, text identifiers, segment identifiers. + for file in glob.glob(inpath): + filename = os.path.basename(file)[:-4] + txtid = filename[:6] + txtids.append(txtid) + segid = filename[-4:] + #print(filename, txtid, segid) + segids.append(segid) + offset = min(offset, int(segid)) + #txtids_sr = pd.Series(txtids) + #segids_sr = pd.Series(segids) + + if offset > 0: + print("Warning! Segment numbering should start at 0. Using offset: " + str(offset)) + + ### For each text identifier, get number of segments. + txtids_ct = Counter(txtids) + sum_segnbs = 0 + for txtid in txtids_ct: + segnb = txtids_ct[txtid] + #print(segnb) + sum_segnbs = sum_segnbs + segnb + #print(txtid, segnb) + print("Total number of segments: ", sum_segnbs) + + for txtid in txtids_ct: + countsegs = txtids_ct[txtid] + if binsnb > int(countsegs): + print("Warning! You are expecting more bins than segments available! Bins will not be filled continuously!") + + ### Match each filename to the number of segments of the text. 
+ + bcount = dict() + for i in range(0, binsnb): + bcount[i] = 0 + + for file in glob.glob(inpath): + filename = os.path.basename(file)[:-4] + for txtid in txtids_ct: + if txtid in filename: + filename = filename + "$" + str(txtids_ct[txtid]) + #print(filename) + + ### For each filename, compute and append bin number + txtid = filename[0:6] + segid = filename[7:11] + segnb = filename[12:] + #print(txtid,segid,segnb) + binid = "" + + segprop = (int(segid) - offset) / int(segnb) + #print(txtid, segid, segnb, segprop) + + + binid = math.floor(segprop * binsnb) + + if binid == binsnb: # avoid 1.0 beeing in seperate bin (should never happen due to offset!) + print("Error: Segment numbering is wrong! Continuing anyway...") + binid -= 1 + + bcount[binid] += 1 + + #print(segprop, binid) + + filenames.append(filename[:11]) + binids.append(binid) + filenames_sr = pd.Series(filenames, name="segmentID") + binids_sr = pd.Series(binids, name="binID") + files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1) + print("chunks per bin: ", bcount) + + if not os.path.exists(outfolder): + os.makedirs(outfolder) + outfile = outfolder+"segs-and-bins.csv" + with open(outfile, "w") as outfile: + files_and_bins.to_csv(outfile, index=False) + @@ -227,13 +348,17 @@ def perform_multipleSubs(substitutionsFile, text): ## Load table and turn into dict with open(substitutionsFile, "r") as subsFile: subs = csv.reader(subsFile) - for rows in subs: - subsDict = {rows[0]:rows[1] for rows in subs} - #print(subsDict) + subsDict = {rows[0]:rows[1] for rows in subs} + for key, value in subsDict.items(): + text = re.sub(key, value, text) + #print(text) + return text + ## Create a regular expression from the dictionary keys - regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) + #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) ## For each match, look-up corresponding value in dictionary - return regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], 
text) + #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text) + #print(result) def pretokenize(inpath, substitutionsFile, outfolder): """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional.""" @@ -243,6 +368,8 @@ def pretokenize(inpath, substitutionsFile, outfolder): text = text.read() text = perform_multipleSubs(substitutionsFile, text) basename = os.path.basename(file) + if "truc" in text or "type" in text or "flic" in text: + print("Found bad word in", basename) cleanfilename = basename if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -331,10 +458,16 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors): elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) elif mode == "esN": - if "|" in lemma: + if "|" in lemma and "NC" in pos: lemmata.append(token.lower()) elif "NC" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) + elif mode == "enNV": + if "NN" in pos or "VB" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) + elif mode == "enN": + if "NN" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) ## Continue with list of lemmata, but remove undesired leftover words lemmata = ' '.join([word for word in lemmata if word not in stoplist]) lemmata = re.sub("[ ]{1,4}"," ", lemmata) @@ -347,19 +480,68 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors): + + + +################################# +# substitute # +################################# + +import csv + +def multipleSubs(substitutionsFile, text): + """Search and replace from a table of string pairs.""" + ## With code from http://stackoverflow.com/users/735204/emmett-j-butler + ## Load table and turn into dict + with open(substitutionsFile, "r") as subsFile: + subs = csv.reader(subsFile) + subsDict = {rows[0]:rows[1] for rows in subs} + for 
key, value in subsDict.items(): + text = re.sub(key, value, text) + #print(text) + return text + + ## Create a regular expression from the dictionary keys + #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) + ## For each match, look-up corresponding value in dictionary + #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text) + #print(result) + +def substitute(inpath, substitutionsFile, outfolder): + """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional.""" + print("\nLaunched substitute.") + for file in glob.glob(inpath): + with open(file,"r") as text: + text = text.read() + text = multipleSubs(substitutionsFile, text) + basename = os.path.basename(file) + counter = 0 + if " truc " in text or " type " in text or " flic " in text: + counter +=1 + print(counter) + cleanfilename = basename + if not os.path.exists(outfolder): + os.makedirs(outfolder) + with open(os.path.join(outfolder, cleanfilename),"w") as output: + output.write(text) + print("Done.") + + + + + + ################################################################## ### TOPIC MODELLING WITH MALLET ### ################################################################## -# TODO: Concatenate two stoplists first, one for errors, one for deliberate ommissions. 
- ################################# # call_mallet_import # ################################# -def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_project): +def call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project): """Function to import text data into Mallet.""" print("\nLaunched call_mallet_import.") import subprocess @@ -380,14 +562,12 @@ def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_projec # call_mallet_modeling # ################################# -def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_interval,num_iterations,num_top_words,doc_topics_max): +def call_mallet_modeling(mallet_path, inputfile,outfolder,numOfTopics,optimize_interval,num_iterations,num_top_words,doc_topics_max): """Function to perform topic modeling with Mallet.""" print("\nLaunched call_mallet_modeling.") - ### Getting ready. import os import subprocess - if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -399,7 +579,7 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in output_topic_state = outfolder + "topic_state.gz" ### Constructing Mallet command from parameters. 
- command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ num_topics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state + command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ numOfTopics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state #print(command) subprocess.call(command, shell=True) print("Done.\n") @@ -423,15 +603,15 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in import glob def get_metadata(metadatafile): - print(" Getting metadata...") + print("- getting metadata...") """Read metadata file and create DataFrame.""" metadata = pd.DataFrame.from_csv(metadatafile, header=0, sep=",") #print("metadata\n", metadata) return metadata -def get_topicscores(topics_in_texts, number_of_topics): +def get_topicscores(topics_in_texts, numOfTopics): """Create a matrix of segments x topics, with topic score values, from Mallet output.""" - print(" Getting topicscores...") + print("- getting topicscores...") ## Load Mallet output (strange format) topicsintexts = pd.read_csv(topics_in_texts, header=None, skiprows=[0], sep="\t", index_col=0) #topicsintexts = topicsintexts.iloc[0:100,] ### For testing only!! 
@@ -449,7 +629,7 @@ def get_topicscores(topics_in_texts, number_of_topics): scores = [] ## For each segment, get the topic number and its score i +=1 - for j in range(1,number_of_topics,2): + for j in range(1,numOfTopics,2): k = j+1 topic = topicsintexts.iloc[i,j] score = topicsintexts.iloc[i,k] @@ -471,7 +651,7 @@ def get_topicscores(topics_in_texts, number_of_topics): def get_docmatrix(corpuspath): """Create a matrix containing segments with their idnos.""" - print(" Getting docmatrix...") + print("- getting docmatrix...") ## Create dataframe with filenames of segments and corresponding idnos. segs = [] idnos = [] @@ -487,18 +667,18 @@ def get_docmatrix(corpuspath): return docmatrix def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, - number_of_topics): + numOfTopics): """Merges the three dataframes into one mastermatrix.""" - print(" Getting data...") + print("- getting data...") ## Get all necessary data. metadata = get_metadata(metadatafile) docmatrix = get_docmatrix(corpuspath) - topicscores = get_topicscores(topics_in_texts, number_of_topics) + topicscores = get_topicscores(topics_in_texts, numOfTopics) ## For inspection only. #print("Metadata\n", metadata.head()) #print("Docmatrix\n", docmatrix.head()) #print("topicscores\n", topicscores.head()) - print(" Merging data...") + print("- merging data...") ## Merge metadata and docmatrix, matching each segment to its metadata. mastermatrix = pd.merge(docmatrix, metadata, how="inner", on="idno") #print("mastermatrix: metadata and docmatrix\n", mastermatrix) @@ -510,16 +690,29 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, #print("mastermatrix: all three\n", mastermatrix.head()) return mastermatrix +def add_binData(mastermatrix, binDataFile): + print("- adding bin data...") + ## Read the information about bins + binData = pd.read_csv(binDataFile, sep=",") + #print(binData) + ## Merge existing mastermatrix and binData. 
+ mastermatrix = pd.merge(mastermatrix, binData, how="inner", on="segmentID") + #print(mastermatrix) + return mastermatrix + def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, - topics_in_texts, number_of_topics): + topics_in_texts, numOfTopics, useBins, binDataFile): """Builds the mastermatrix uniting all information about texts and topic scores.""" - print("\nLaunched create_mastermatrix. (This could take a while.)") + print("\nLaunched create_mastermatrix.") + print("(Warning: This is very memory-intensive and may take a while.)") if not os.path.exists(outfolder): os.makedirs(outfolder) mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, - mastermatrixfile, number_of_topics) + mastermatrixfile, numOfTopics) + if useBins == True: + mastermatrix = add_binData(mastermatrix, binDataFile) mastermatrix.to_csv(outfolder+mastermatrixfile, sep=",", encoding="utf-8") - print(" Saved mastermatrix. Segments and columns:", mastermatrix.shape) + print("Done. Saved mastermatrix. Segments and columns:", mastermatrix.shape) @@ -538,7 +731,10 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder): for target in targets: grouped = mastermatrix.groupby(target, axis=0) avg_topicscores = grouped.agg(np.mean) - avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if target != "year": + avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if target != "binID": + avg_topicscores = avg_topicscores.drop(["binID"], axis=1) #avg_topicscores = avg_topicscores.drop(["tei"], axis=1) ## Save grouped averages to CSV file for visualization. 
resultfilename = "avgtopicscores_by-"+target+".csv" @@ -549,6 +745,33 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder): print("Done.") +################################ +# complexAverageTopicScores # +################################ + +def calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder): + """Function to calculate average topic scores based on the mastermatrix.""" + print("\nLaunched calculate_complexAverageTopicScores.") + if not os.path.exists(outfolder): + os.makedirs(outfolder) + with open(mastermatrixfile, "r") as infile: + mastermatrix = pd.DataFrame.from_csv(infile, header=0, sep=",") + ## Calculate average topic scores for each target category + grouped = mastermatrix.groupby(targets, axis=0) + avg_topicscores = grouped.agg(np.mean) + if "year" not in targets: + avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if "binID" not in targets: + avg_topicscores = avg_topicscores.drop(["binID"], axis=1) + #print(avg_topicscores) + ## Save grouped averages to CSV file for visualization. + identifierstring = '+'.join(map(str, targets)) + resultfilename = "complex-avgtopicscores_by-"+identifierstring+".csv" + resultfilepath = outfolder+resultfilename + avg_topicscores.to_csv(resultfilepath, sep=",", encoding="utf-8") + print("Done. Saved average topic scores for: "+identifierstring) + + ################################# # save_firstWords # @@ -575,7 +798,7 @@ def save_firstWords(topicWordFile, outfolder, filename): #firstWordsSeries.index.name = "topic" #firstWordsSeries = firstWordsSeries.rename(columns = {'two':'new_name'}) firstWordsSeries.reindex_axis(["firstwords"]) - print(firstWordsSeries) + #print(firstWordsSeries) ## Saving the file. 
if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -585,6 +808,30 @@ def save_firstWords(topicWordFile, outfolder, filename): print("Done.") +################################# +# save_topicRanks # +################################# + +def save_topicRanks(topicWordFile, outfolder, filename): + """Save a list of topics with their rank by topic score.""" + print("Launched save_topicRanks.") + with open(topicWordFile, "r") as infile: + topicRanks = pd.read_csv(infile, sep="\t", header=None) + topicRanks = topicRanks.drop(2, axis=1) + topicRanks.rename(columns={0:"Number"}, inplace=True) + topicRanks.rename(columns={1:"Score"}, inplace=True) + #topicRanks.sort(columns=["Score"], ascending=False, inplace=True) + topicRanks["Rank"] = topicRanks["Score"].rank(ascending=False) + #print(topicRanks.head()) + ## Saving the file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + outfile = outfolder + filename + with open(outfile, "w") as outfile: + topicRanks.to_csv(outfile) + print("Done.") + + ################################################################## ### VISUALIZATION ### @@ -593,73 +840,77 @@ def save_firstWords(topicWordFile, outfolder, filename): import matplotlib.pyplot as plt - ################################# # make_wordle_from_mallet # ################################# -def make_wordle_from_mallet(word_weights_file,topics,words,outfolder, +from wordcloud import WordCloud +import random + +def read_mallet_output(word_weights_file): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + word_scores = pd.read_table(word_weights_file, header=None, sep="\t") + word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False]) + word_scores_grouped = word_scores.groupby(0) + #print(word_scores.head()) + return word_scores_grouped + +def get_wordlewords(words, word_weights_file, topic): + """Transform Mallet output for wordle generation.""" + topic_word_scores = 
read_mallet_output(word_weights_file).get_group(topic) + top_topic_word_scores = topic_word_scores.iloc[0:words] + topic_words = top_topic_word_scores.loc[:,1].tolist() + word_scores = top_topic_word_scores.loc[:,2].tolist() + wordlewords = "" + j = 0 + for word in topic_words: + word = word + score = word_scores[j] + j += 1 + wordlewords = wordlewords + ((word + " ") * score) + return wordlewords + +def get_color_scale(word, font_size, position, orientation, font_path, random_state=None): + """ Create color scheme for wordle.""" + return "hsl(245, 58%, 25%)" # Default. Uniform dark blue. + #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. + #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background + +def get_topicRank(topic, topicRanksFile): + #print("getting topic rank.") + with open(topicRanksFile, "r") as infile: + topicRanks = pd.read_csv(infile, sep=",", index_col=0) + rank = int(topicRanks.iloc[topic]["Rank"]) + return rank + + +def make_wordle_from_mallet(word_weights_file, + numOfTopics,words,outfolder, + topicRanksFile, font_path, dpi): """Generate wordles from Mallet output, using the wordcloud module.""" print("\nLaunched make_wordle_from_mallet.") - - from wordcloud import WordCloud - import random - - if not os.path.exists(outfolder): - os.makedirs(outfolder) - - def read_mallet_output(word_weights_file): - """Reads Mallet output (topics with words and word weights) into dataframe.""" - word_scores = pd.read_table(word_weights_file, header=None, sep="\t") - word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False]) - word_scores_grouped = word_scores.groupby(0) - #print(word_scores.head()) - return word_scores_grouped - - def get_wordlewords(words,topic): - """Transform Mallet output for wordle generation.""" - topic_word_scores = read_mallet_output(word_weights_file).get_group(topic) - top_topic_word_scores = topic_word_scores.iloc[0:words] - topic_words = 
top_topic_word_scores.loc[:,1].tolist() - word_scores = top_topic_word_scores.loc[:,2].tolist() - wordlewords = "" - j = 0 - for word in topic_words: - word = word - score = word_scores[j] - j += 1 - wordlewords = wordlewords + ((word + " ") * score) - return wordlewords - - def get_color_scale(word, font_size, position, orientation, random_state=None): - """ Create color scheme for wordle.""" - #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. - return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background - -# TODO: pack this into a proper separate function. - - ## Creates the wordle visualisation, using results from the above functions. - for topic in range(0,topics): - ## Defines filename and title for the wordle image. - figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png" - figure_title = "topic "+ str(topic) + for topic in range(0,numOfTopics): ## Gets the text for one topic. - text = get_wordlewords(words,topic) - #print(text) - ## Generates, recolors and saves the wordcloud. - #original# wordcloud = WordCloud(background_color="white", margin=5).generate(text) - #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf" - wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) + text = get_wordlewords(words, word_weights_file, topic) + wordcloud = WordCloud(font_path=font_path, width=600, height=400, background_color="white", margin=4).generate(text) default_colors = wordcloud.to_array() + rank = get_topicRank(topic, topicRanksFile) + figure_title = "topic "+ str(topic) + " ("+str(rank)+"/"+str(numOfTopics)+")" plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) - plt.title(figure_title, fontsize=24) + plt.title(figure_title, fontsize=30) plt.axis("off") + + ## Saving the image file. 
+ if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png" plt.savefig(outfolder + figure_filename, dpi=dpi) plt.close() print("Done.") + def crop_images(inpath, outfolder, left, upper, right, lower): """ Function to crop wordle files.""" @@ -705,12 +956,22 @@ def get_targetItems(average, targetCategory): #print(targetItems) return(targetItems) -def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): +def get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item): """From average topic score data, select data to be plotted.""" #print(" Getting dataToPlot.") with open(average, "r") as infile: ## Read the average topic score data allData = pd.DataFrame.from_csv(infile, sep=",") + if mode == "normalized": # mean normalization + colmeans = allData.mean(axis=0) + allData = allData / colmeans + elif mode == "zscores": # zscore transformation + colmeans = allData.mean(axis=0) # ??? + colstd = allData.std(axis=0) #std for each topic + allData = (allData - colmeans) / colstd # = zscore transf. + + elif mode == "absolute": # absolute values + allData = allData allData = allData.T ## Add top topic words to table for display later firstWords = get_firstWords(firstWordsFile) @@ -723,30 +984,34 @@ def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): #print(dataToPlot) return dataToPlot -def create_barchart_topTopics(dataToPlot, targetCategory, item, +def create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder): """Function to make a topTopics barchart.""" - print(" Creating plot for: "+item) + print(" Creating plot for: "+str(item)) ## Doing the plotting. 
dataToPlot.plot(kind="bar", legend=None) plt.setp(plt.xticks()[1], rotation=90, fontsize = 11) - plt.title("Top-Topics für: "+item, fontsize=15) - plt.ylabel("Scores", fontsize=13) + if mode == "normalized": + plt.title("Top-distinctive Topics für: "+str(item), fontsize=15) + plt.ylabel("normalized scores", fontsize=13) + elif mode == "absolute": + plt.title("Top-wichtigste Topics für: "+str(item), fontsize=15) + plt.ylabel("absolute scores", fontsize=13) plt.xlabel("Topics", fontsize=13) + plt.tight_layout() if height != 0: plt.ylim((0.000,height)) - plt.tight_layout() - + ## Saving the plot to disk. outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"topTopics_"+item+".png" + figure_filename = outfolder+"tT_"+mode+"-"+str(item)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() -def plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, - targetCategories, topTopicsShown, fontscale, +def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, + targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder): """For each item in a category, plot the top n topics as a barchart.""" print("Launched plot_topTopics.") @@ -755,8 +1020,8 @@ def plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, if targetCategory in average: targetItems = get_targetItems(average, targetCategory) for item in targetItems: - dataToPlot = get_dataToPlot(average, firstWordsFile, topTopicsShown, item) - create_barchart_topTopics(dataToPlot, targetCategory, item, fontscale, height, dpi, outfolder) + dataToPlot = get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item) + create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder) print("Done.") @@ -796,7 +1061,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, print(" Creating plot for topic: "+str(topic)) ## Doing the plotting. 
dataToPlot.plot(kind="bar", legend=None) - plt.title("Top "+targetCategory+" für topic "+str(topic)+" ("+str(firstWords)+")", fontsize=15) + plt.title("Top "+targetCategory+" für topic: "+str(firstWords), fontsize=15) plt.ylabel("Scores", fontsize=13) plt.xlabel(targetCategory, fontsize=13) plt.setp(plt.xticks()[1], rotation=90, fontsize = 11) @@ -808,7 +1073,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"topItems_"+str(topic)+".png" + figure_filename = outfolder+"tI_by-"+targetCategory+"-{:03d}".format(topic)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() @@ -816,7 +1081,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, def plot_topItems(averageDatasets, outfolder, firstWordsFile, - numberOfTopics, + numOfTopics, targetCategories, topItemsShown, fontscale, @@ -828,7 +1093,7 @@ def plot_topItems(averageDatasets, for targetCategory in targetCategories: if targetCategory in average: print(" Plotting for: "+targetCategory) - topics = list(range(0,numberOfTopics)) + topics = list(range(0,numOfTopics)) for topic in topics: firstWords = get_topItems_firstWords(firstWordsFile, topic) @@ -857,7 +1122,7 @@ def plot_topItems(averageDatasets, # TODO: This next function could be merged with above. 
def get_heatmap_firstWords(firstWordsFile): """Function to load list of top topic words into dataframe.""" - #print(" Getting firstWords.") + print("- getting firstWords...") with open(firstWordsFile, "r") as infile: firstWords = pd.read_csv(infile, header=None) firstWords.drop(0, axis=1, inplace=True) @@ -865,30 +1130,66 @@ def get_heatmap_firstWords(firstWordsFile): #print(firstWords) return(firstWords) -def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, - numberOfTopics): +def get_heatmap_dataToPlot(average, mode, firstWordsFile, topTopicsShown, + numOfTopics): """From average topic score data, select data to be plotted.""" - #print(" Getting dataToPlot.") + print("- getting dataToPlot...") with open(average, "r") as infile: ## Read the average topic score data allScores = pd.DataFrame.from_csv(infile, sep=",") + if mode == "normalized": # mean normalization + colmeans = allScores.mean(axis=0) + allScores = allScores / colmeans + elif mode == "zscores": # zscore transformation + colmeans = allScores.mean(axis=0) # mean for each topic + allstd = allScores.std(axis=0) #std for entire df + allScores = (allScores - colmeans) / allstd # = zscore transf. + elif mode == "absolute": # absolute values + allScores = allScores allScores = allScores.T - ## Create subset of data based on target. 
- stdevs = allScores.std(axis=1) - allScores = pd.concat([allScores, stdevs], axis=1) - allScores = allScores.sort(columns=0, axis=0, ascending=False) + ## Add top topic words to table for display later + firstWords = get_heatmap_firstWords(firstWordsFile) + allScores.index = allScores.index.astype(np.int64) + allScores = pd.concat([allScores, firstWords], axis=1, join="inner") + #print(allScores) + ## Remove undesired columns: subsubgenre + #allScores = allScores.drop("adventure", axis=1) + #allScores = allScores.drop("autobiographical", axis=1) + #allScores = allScores.drop("blanche", axis=1) + #allScores = allScores.drop("education", axis=1) + #allScores = allScores.drop("fantastic", axis=1) + #allScores = allScores.drop("fantastique", axis=1) + #allScores = allScores.drop("historical", axis=1) + #allScores = allScores.drop("n.av.", axis=1) + #allScores = allScores.drop("nouveau-roman", axis=1) + #allScores = allScores.drop("sciencefiction", axis=1) + #allScores = allScores.drop("social", axis=1) + #allScores = allScores.drop("other", axis=1) + #allScores = allScores.drop("espionnage", axis=1) + #allScores = allScores.drop("thriller", axis=1) + #allScores = allScores.drop("neopolar", axis=1) + ## Remove undesired columns: protagonist-policier + #allScores = allScores.drop("crminal", axis=1) + #allScores = allScores.drop("mixed", axis=1) + #allScores = allScores.drop("witness", axis=1) + #allScores = allScores.drop("criminel", axis=1) + #allScores = allScores.drop("detection", axis=1) + #allScores = allScores.drop("victime", axis=1) + #allScores = allScores.drop("n.av.", axis=1) + ## Sort by standard deviation + standardDeviations = allScores.std(axis=1) + standardDeviations.name = "std" + allScores.index = allScores.index.astype(np.int64) + allScores = pd.concat([allScores, standardDeviations], axis=1) + allScores = allScores.sort(columns="std", axis=0, ascending=False) + allScores = allScores.drop("std", axis=1) someScores = allScores[0:topTopicsShown] - 
someScores = someScores.drop(0, axis=1) ## Necessary step to align dtypes of indexes for concat. someScores.index = someScores.index.astype(np.int64) #print("dtype firstWords: ", type(firstWords.index)) #print("dtype someScores: ", type(someScores.index)) #print("\n==intersection==\n",someScores.index.intersection(firstWords.index)) - ## Add top topic words to table for display later - firstWords = get_heatmap_firstWords(firstWordsFile) - dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner") - dataToPlot = dataToPlot.set_index("topicwords") - #print(dataToPlot) + dataToPlot = someScores.set_index("topicwords") ## Optionally, limit display to part of the columns #dataToPlot = dataToPlot.iloc[:,0:40] #print(dataToPlot) @@ -897,12 +1198,13 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, def create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, + mode, fontscale, dpi, outfolder): - + print("- doing the plotting...") sns.set_context("poster", font_scale=fontscale) - sns.heatmap(dataToPlot, annot=False, cmap="YlOrRd", square=False) + sns.heatmap(dataToPlot, annot=False, cmap="RdBu_r", square=False) # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd plt.title("Verteilung der Topic Scores", fontsize=20) plt.xlabel(targetCategory, fontsize=16) @@ -913,17 +1215,16 @@ def create_distinctiveness_heatmap(dataToPlot, ## Saving the plot to disk. 
if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".png" + figure_filename = outfolder+"dist-heatmap_"+mode+"-by-"+str(targetCategory)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() - - def plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, + mode, outfolder, targetCategories, - numberOfTopics, + numOfTopics, topTopicsShown, fontscale, dpi): @@ -932,18 +1233,20 @@ def plot_distinctiveness_heatmap(averageDatasets, for average in glob.glob(averageDatasets): for targetCategory in targetCategories: if targetCategory in average and targetCategory != "segmentID": - print(" Plotting for: "+targetCategory) - dataToPlot = get_heatmap_dataToPlot(average, + print("- working on: "+targetCategory) + dataToPlot = get_heatmap_dataToPlot(average, + mode, firstWordsFile, topTopicsShown, - numberOfTopics) + numOfTopics) create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, + mode, fontscale, dpi, outfolder) - + print("Done.") ################################# @@ -1023,14 +1326,14 @@ def create_overTime_areaplot(dataToPlot, outfolder, fontscale, topics, dpi): plt.close() def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, - numberOfTopics, fontscale, dpi, height, + numOfTopics, fontscale, dpi, height, mode, topics): """Function to plot development of topics over time using lineplots or areaplots.""" print("Launched plot_topicsOverTime.") if mode == "line": for average in glob.glob(averageDatasets): if "decade" in average: - entriesShown = numberOfTopics + entriesShown = numOfTopics dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics) create_overTime_lineplot(dataToPlot, outfolder, fontscale, @@ -1038,7 +1341,7 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, elif mode == "area": for average in glob.glob(averageDatasets): if "decade" in average: - entriesShown = numberOfTopics + entriesShown = 
numOfTopics dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics) create_overTime_areaplot(dataToPlot, outfolder, fontscale, @@ -1048,11 +1351,529 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, +########################### +## topicClustering ### +########################### + +# TOOD: Add figsize and orientation parameters. +# TODO: Add "firstwords" as leaf labels instead of topic numbers. + +import scipy.cluster as sc + +def get_topWordScores(wordWeightsFile, WordsPerTopic): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + print("- getting topWordScores...") + wordScores = pd.read_table(wordWeightsFile, header=None, sep="\t") + wordScores = wordScores.sort(columns=[0,2], axis=0, ascending=[True, False]) + topWordScores = wordScores.groupby(0).head(WordsPerTopic) + #print(topWordScores) + return topWordScores + +def build_scoreMatrix(topWordScores, topicsToUse): + """Transform Mallet output for wordle generation.""" + print("- building score matrix...") + topWordScores = topWordScores.groupby(0) + listOfWordScores = [] + for topic,data in topWordScores: + if topic in list(range(0,topicsToUse)): + words = data.loc[:,1].tolist() + scores = data.loc[:,2].tolist() + wordScores = dict(zip(words, scores)) + wordScores = pd.Series(wordScores, name=topic) + listOfWordScores.append(wordScores) + scoreMatrix = pd.concat(listOfWordScores, axis=1) + scoreMatrix = scoreMatrix.fillna(10) + #print(scoreMatrix.head) + scoreMatrix = scoreMatrix.T + return scoreMatrix + +def perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): + print("- performing clustering...") + distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric) + #print(distanceMatrix) + plt.figure(figsize=(25,10)) + sc.hierarchy.dendrogram(distanceMatrix) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 6) + plt.title("Topic-Clustering Dendrogramm", fontsize=20) + 
plt.ylabel("Distanz", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(wordsPerTopic)+" words", fontsize=16) + plt.tight_layout() + + ## Saving the image file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "topic-clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png" + plt.savefig(outfolder + figure_filename, dpi=600) + plt.close() + + +def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, + methods, metrics, topicsToUse): + """Display dendrogram of topic similarity using clustering.""" + print("\nLaunched topicClustering.") + ## Gets the necessary data: the word scores for each topic + topWordScores = get_topWordScores(wordWeightsFile, wordsPerTopic) + ## Turn the data into a dataframe for further processing + scoreMatrix = build_scoreMatrix(topWordScores, topicsToUse) + ## Do clustering on the dataframe + for method in methods: + for metric in metrics: + perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) + print("Done.") + + + +########################### +## itemClustering ### +########################### + +# TOOD: Add orientation to parameters. 
+ +import scipy.cluster as sc + +def build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + print("- getting topWordScores...") + for averageFile in glob.glob(averageDatasets): + if targetCategory in averageFile: + itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",") + itemScores = itemScores.T + if sortingCriterium == "std": + itemScores["sorting"] = itemScores.std(axis=1) + elif sortingCriterium == "mean": + itemScores["sorting"] = itemScores.mean(axis=1) + itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) + itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] + itemScoreMatrix = itemScoreMatrix.T + """ + itemScoreMatrix = itemScoreMatrix.drop("Allais", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Audoux", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Barbara", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Barjavel", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Beckett", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bernanos", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bosco", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bourget", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Butor", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Camus", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Carco", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Celine", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Colette", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Darien", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Daudet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Delly", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Dombre", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Duras", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("ErckChat", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("FevalPP", axis=0) + itemScoreMatrix = 
itemScoreMatrix.drop("MduGard", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Mirbeau", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Ohnet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Perec", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Proust", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Queneau", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Rodenbach", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Rolland", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Roussel", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("SaintExupery", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Sand", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Aimard", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("AimardAuriac", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Balzac", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bon", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Echenoz", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Flaubert", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Fleuriot", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("France", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Galopin", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gary", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GaryAjar", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GaryBogat", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GarySinibaldi", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gautier", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Giono", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gouraud", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Huysmans", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Hugo", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeClezio", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Loti", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Malot", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Mary", axis=0) + itemScoreMatrix = 
itemScoreMatrix.drop("Maupassant", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Modiano", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("RobbeGrillet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Stolz", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Sue", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Tournier", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Verne", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Vian", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("VianSullivan", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Zola", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Malraux", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Simon", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeRouge", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeRougeGuitton", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Toussaint", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Khadra", axis=0) + """ + #print(itemScoreMatrix) + return itemScoreMatrix + +def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, + topicsPerItem, sortingCriterium, figsize, outfolder): + print("- performing clustering...") + + ## Perform the actual clustering + itemDistanceMatrix = sc.hierarchy.linkage(itemScoreMatrix, method=method, metric=metric) + + ## Plot the distance matrix as a dendrogram + plt.figure(figsize=figsize) # TODO: this could be a a parameter. + itemLabels = itemScoreMatrix.index.values + sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="top") + + ## Format items labels to x-axis tick labels + plt.setp(plt.xticks()[1], rotation=90, fontsize = 14) + plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20) + plt.ylabel("Distance", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16) + plt.tight_layout() + + ## Save the image file. 
+ print("- saving image file.") + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+sortingCriterium+"-"+str(topicsPerItem)+"topics"+".jpg" + plt.savefig(outfolder + figure_filename, dpi=600) + plt.close() + +def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, + targetCategories, methods, metrics, sortingCriterium): + """Display dendrogram of topic-based item similarity using clustering.""" + print("\nLaunched itemClustering.") + for targetCategory in targetCategories: + ## Load topic scores per itema and turn into score matrix + itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium) + ## Do clustering on the dataframe + for method in methods: + for metric in metrics: + perform_itemClustering(itemScoreMatrix, targetCategory, + method, metric, topicsPerItem, + sortingCriterium, figsize, outfolder) + print("Done.") + + + + +########################### +## simple progression ### +########################### + + +def get_progression_firstWords(firstWordsFile): + """Function to load list of top topic words into dataframe.""" + #print(" Getting firstWords.") + with open(firstWordsFile, "r") as infile: + firstWords = pd.read_csv(infile, header=None) + firstWords.drop(0, axis=1, inplace=True) + firstWords.rename(columns={1:"topicwords"}, inplace=True) + firstWords.index = firstWords.index.astype(np.int64) + #print(firstWords) + return(firstWords) + + +def get_selSimpleProgression_dataToPlot(averageDataset, firstWordsFile, + entriesShown, topics): + """Function to build a dataframe with all data necessary for plotting.""" + print("- getting data to plot...") + with open(averageDataset, "r") as infile: + allScores = pd.DataFrame.from_csv(infile, sep=",") + allScores = allScores.T + #print(allScores.head()) + ## Select the data for selected topics + someScores = allScores.loc[topics,:] + someScores.index = 
someScores.index.astype(np.int64) + ## Add information about the firstWords of topics + firstWords = get_progression_firstWords(firstWordsFile) + dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner") + dataToPlot = dataToPlot.set_index("topicwords") + dataToPlot = dataToPlot.T + #print(dataToPlot) + return dataToPlot + + +def create_selSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, + topics, dpi, height): + """This function does the actual plotting and saving to disk.""" + print("- creating the plot...") + ## Plot the selected data + dataToPlot.plot(kind="line", lw=3, marker="o") + plt.title("Entwicklung ausgewählter Topics über den Textverlauf", fontsize=20) + plt.ylabel("Topic scores (absolut)", fontsize=16) + plt.xlabel("Textabschnitte", fontsize=16) + plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) + if height != 0: + plt.ylim((0.000,height)) + + ## Saving the plot to disk. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + ## Format the topic information for display + topicsLabel = "-".join(str(topic) for topic in topics) + figure_filename = outfolder+"sel_"+topicsLabel+".png" + plt.savefig(figure_filename, dpi=dpi) + plt.close() + +def get_allSimpleProgression_dataToPlot(averageDataset, firstWordsFile, + entriesShown, topic): + """Function to build a dataframe with all data necessary for plotting.""" + print("- getting data to plot...") + with open(averageDataset, "r") as infile: + allScores = pd.DataFrame.from_csv(infile, sep=",") + allScores = allScores.T + #print(allScores) + ## Select the data for current topics + someScores = allScores.loc[topic,:] + someScores.index = someScores.index.astype(np.int64) + dataToPlot = someScores + #print(dataToPlot) + return dataToPlot + +# TODO: Make sure this is only read once and then select when plotting. 
+ + +def create_allSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, + firstWordsFile, topic, dpi, height): + """This function does the actual plotting and saving to disk.""" + print("- creating the plot for topic " + topic) + ## Get the first words info for the topic + firstWords = get_progression_firstWords(firstWordsFile) + topicFirstWords = firstWords.iloc[int(topic),0] + #print(topicFirstWords) + ## Plot the selected data + dataToPlot.plot(kind="line", lw=3, marker="o") + plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20) + plt.ylabel("Topic scores (absolut)", fontsize=16) + plt.xlabel("Textabschnitte", fontsize=16) + plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) + if height != 0: + plt.ylim((0.000,height)) + + ## Saving the plot to disk. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + ## Format the topic information for display + topicsLabel = str(topic) + figure_filename = outfolder+"all_"+topicsLabel+".png" + plt.savefig(figure_filename, dpi=dpi) + plt.close() + + +def simpleProgression(averageDataset, firstWordsFile, outfolder, + numOfTopics, + fontscale, dpi, height, mode, topics): + """Function to plot topic development over textual progression.""" + print("Launched textualProgression.") + if mode == "selected" or mode == "sel": + entriesShown = numOfTopics + dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topics) + create_selSimpleProgression_lineplot(dataToPlot, outfolder, + fontscale, topics, + dpi, height) + elif mode == "all": + entriesShown = numOfTopics + topics = list(range(0, numOfTopics)) + for topic in topics: + topic = str(topic) + dataToPlot = get_allSimpleProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topic) + create_allSimpleProgression_lineplot(dataToPlot, outfolder, + fontscale, firstWordsFile, + topic, dpi, height) + else: + print("Please select a valid value for 'mode'.") + print("Done.") + + + 
##################################################################
### OTHER / OBSOLETE / DEV ###
##################################################################


###########################
## complex progression ### IN DEVELOPMENT
###########################


def get_selComplexProgression_dataToPlot(averageDataset, firstWordsFile,
                                         entriesShown, topics):
    """Build the dataframe needed for plotting the selected topics.

    Reads the average-scores table, keeps only the requested topics and
    labels them with their first words. Returns a DataFrame with one column
    per topic (indexed by text section).
    """
    print("- getting data to plot...")
    ## pd.DataFrame.from_csv was deprecated (removed in pandas 1.0);
    ## read_csv with an explicit index column is the replacement.
    with open(averageDataset, "r") as infile:
        allScores = pd.read_csv(infile, sep=",", index_col=0)
    allScores = allScores.T
    ## Select the data for the requested topics.
    someScores = allScores.loc[topics, :]
    someScores.index = someScores.index.astype(np.int64)
    ## Add the firstWords labels and use them as the index.
    firstWords = get_progression_firstWords(firstWordsFile)
    dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner")
    dataToPlot = dataToPlot.set_index("topicwords")
    dataToPlot = dataToPlot.T
    return dataToPlot


def create_selComplexProgression_lineplot(dataToPlot, outfolder, fontscale,
                                          topics, dpi, height):
    """Plot the selected topics' progression and save the figure to disk."""
    print("- creating the plot...")
    ## Plot the selected data.
    dataToPlot.plot(kind="line", lw=3, marker="o")
    plt.title("Entwicklung ausgewählter Topics über den Textverlauf", fontsize=20)
    plt.ylabel("Topic scores (absolut)", fontsize=16)
    plt.xlabel("Textabschnitte", fontsize=16)
    plt.setp(plt.xticks()[1], rotation=0, fontsize=14)
    if height != 0:
        plt.ylim((0.000, height))
    ## Save the plot to disk. exist_ok avoids the check-then-create race.
    os.makedirs(outfolder, exist_ok=True)
    ## Format the topic information for the filename.
    topicsLabel = "-".join(str(topic) for topic in topics)
    figure_filename = outfolder+"sel_"+topicsLabel+".png"
    plt.savefig(figure_filename, dpi=dpi)
    plt.close()


def get_allComplexProgression_dataToPlot(averageDataset, firstWordsFile,
                                         entriesShown, topic, targetCategories):
    """Build the dataframe needed for plotting one topic.

    Returns the two target-category columns followed by the score column of
    the requested topic. firstWordsFile and entriesShown are unused here but
    kept for interface symmetry with the simple-progression variant.
    """
    print("- getting data to plot...")
    ## pd.DataFrame.from_csv was deprecated (removed in pandas 1.0);
    ## read_csv without index_col matches the old index_col=None call.
    with open(averageDataset, "r") as infile:
        allScores = pd.read_csv(infile, sep=",")
    ## Select the two category columns and the current topic's scores.
    target1data = allScores.loc[:, targetCategories[0]]
    target2data = allScores.loc[:, targetCategories[1]]
    topicScores = allScores.loc[:, topic]
    dataToPlot = pd.concat([target1data, target2data, topicScores], axis=1)
    return dataToPlot

# TODO: Make sure this is only read once and then select when plotting.
+ + +def create_allComplexProgression_lineplot(dataToPlot, targetCategories, + outfolder, fontscale, + firstWordsFile, topic, dpi, height): + """This function does the actual plotting and saving to disk.""" + print("- creating the plot for topic " + topic) + ## Get the first words info for the topic + firstWords = get_progression_firstWords(firstWordsFile) + topicFirstWords = firstWords.iloc[int(topic),0] + #print(topicFirstWords) + ## Split plotting data into parts (for target1) + target1data = dataToPlot.iloc[:,0] + #print(target1data) + numPartialData = len(set(target1data)) + ## Initialize plot for several lines + completeData = [] + #print(dataToPlot) + for target in set(target1data): + #print(" - plotting "+target) + partialData = dataToPlot.groupby(targetCategories[0]) + partialData = partialData.get_group(target) + partialData.rename(columns={topic:target}, inplace=True) + partialData = partialData.iloc[:,2:3] + completeData.append(partialData) + #print(completeData) + ## Plot the selected data, one after the other + plt.figure() + plt.figure(figsize=(15,10)) + for i in range(0, numPartialData): + #print(completeData[i]) + label = completeData[i].columns.values.tolist() + label = str(label[0]) + plt.plot(completeData[i], lw=4, marker="o", label=label) + plt.legend() + plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20) + plt.ylabel("Topic scores (absolut)", fontsize=16) + plt.xlabel("Textabschnitte", fontsize=16) + plt.legend() + plt.locator_params(axis = 'x', nbins = 10) + plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) + if height != 0: + plt.ylim((0.000,height)) + + ## Saving the plot to disk. 
+ if not os.path.exists(outfolder): + os.makedirs(outfolder) + ## Format the topic information for display + topicsLabel = str(topic) + figure_filename = outfolder+"all_"+str(targetCategories[0])+"-"+topicsLabel+".png" + plt.savefig(figure_filename, dpi=dpi) + plt.close() + + +def complexProgression(averageDataset, + firstWordsFile, + outfolder, + numOfTopics, + targetCategories, + fontscale, + dpi, height, + mode, topics): + """Function to plot topic development over textual progression.""" + print("Launched complexProgression.") + if mode == "sel": + entriesShown = numOfTopics + dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topics) + create_selSimpleProgression_lineplot(dataToPlot, + outfolder, + fontscale, + topics, + dpi, height) + elif mode == "all": + entriesShown = numOfTopics + topics = list(range(0, numOfTopics)) + for topic in topics: + topic = str(topic) + dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topic, + targetCategories) + create_allComplexProgression_lineplot(dataToPlot, targetCategories, + outfolder, + fontscale, firstWordsFile, + topic, dpi, height) + else: + print("Please select a valid value for 'mode'.") + print("Done.") + + + ########################### ## show_segment ### @@ -1063,4 +1884,62 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, def show_segment(wdir,segmentID, outfolder): if not os.path.exists(outfolder): os.makedirs(outfolder) - shutil.copyfile(wdir+"2_segs/"+segmentID+".txt",outfolder+segmentID+".txt") \ No newline at end of file + shutil.copyfile(wdir+"2_segs/"+segmentID+".txt",outfolder+segmentID+".txt") + + + + +########################### +## itemPCA ### IN DEVELOPMENT +########################### + +from sklearn.decomposition import PCA + +#def build_itemScoreMatrix(averageDatasets, targetCategory, +# topicsPerItem, sortingCriterium): +# """Reads Mallet output (topics with words and word 
weights) into dataframe.""" +# print("- building item score matrix...") +# for averageFile in glob.glob(averageDatasets): +# if targetCategory in averageFile: +# itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",") +# itemScores = itemScores.T +# if sortingCriterium == "std": +# itemScores["sorting"] = itemScores.std(axis=1) +# elif sortingCriterium == "mean": +# itemScores["sorting"] = itemScores.mean(axis=1) +# itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) +# itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] +# itemScoreMatrix = itemScoreMatrix.T +# #print(itemScoreMatrix) +# return itemScoreMatrix + +def perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, + sortingCriterium, figsize, outfolder): + print("- doing the PCA...") + itemScoreMatrix = itemScoreMatrix.T + targetDimensions = 2 + pca = PCA(n_components=targetDimensions) + pca = pca.fit(itemScoreMatrix) + pca = pca.transform(itemScoreMatrix) +# plt.scatter(pca[0,0:20], pca[1,0:20]) + for i in list(range(0,len(pca)-1)): + plt.scatter(pca[i,:], pca[i+1,:]) + + +def itemPCA(averageDatasets, targetCategories, + topicsPerItem, sortingCriterium, figsize, outfolder): + """Function to perform PCA on per-item topic scores and plot the result.""" + print("Launched itemPCA.") + for targetCategory in targetCategories: + ## Load topic scores per item and turn into score matrix + ## (Using the function from itemClustering above!) + itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium) + ## Do clustering on the dataframe + perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, sortingCriterium, figsize, outfolder) + print("Done.") + + + + + diff --git a/tmw_config.py b/tmw_config.py index 60ec739..d581ab8 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -12,17 +12,37 @@ # For information on requirements and usage, see the README file. # This config file is structured as follows: +# 0. 
General Settings # 1. Preprocessing Texts # 2. Topic Modeling # 3. Posprocessing Data -# 4. Visualization -# 5. Other / Obsolete +# 4. Basic Visualizations +# 5. Advanced Visualizations +# 6. Other / Obsolete / in development + +# You may find a tutorial explaining the purpose of each function +# as well as its input, output and other parameters at: +# https://www.penflip.com/c.schoech/tmw-tutorial + + +################################ +### GENERAL SETTINGS ### +################################ + +### The following settings depend on the system used. +### Path to the working directory. +wdir = "/home/" # end with slash. +### Path to the TreeTagger file (language-dependent!) +tagger = "/home/[USER]/Programs/TreeTagger/cmd/tree-tagger-french" +### Path to Mallet installation directory +mallet_path = "/home/[USER]/Programs/Mallet/bin/mallet" +### Path to the font for wordle generation +font_path = "/home/[USER]/.fonts/AlegreyaSans-Regular.otf" import tmw #print(help(topmod)) -### Set the general working directory. -wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash. + ################################ ### PREPROCESSING TEXTS ### @@ -39,25 +59,29 @@ inpath = wdir + "1_txt/*.txt" outfolder = wdir + "2_segs/" target = 600 -sizetolerancefactor = 1.1 # 1 = exact target; >1 = with some tolerance (1.1 = +/- 10%). -preserveparagraphs = True # True|False +sizetolerancefactor = 1.1 +preserveparagraphs = True #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) -### segments_to_bins: inpath, outfile -### Currently not implemented any more / yet. +### segments_to_bins +### Assign each segment to one bin over textual progression. +inpath = wdir + "2_segs/*.txt" +outfolder = wdir + "7_aggregates/" +binsnb = 3 # number of bins +#tmw.segments_to_bins(inpath,outfolder, binsnb) ### pretokenize ### Perform some preliminary tokenization. 
-inpath = wdir + "2_test/*.txt" -substitutionsFile = "./extras/fr_pretokenize_subs.csv" -outfolder = wdir + "3_test/" -tmw.pretokenize(inpath, substitutionsFile, outfolder) +inpath = wdir + "2_segs/*.txt" +outfolder = wdir + "3_tokens/" +substitutionsFile = wdir+"extras/fr_pretokenize_subs.csv" +#tmw.pretokenize(inpath, substitutionsFile, outfolder) ### call_treetagger ### Perform lemmatization and POS tagging. -infolder = wdir + "3_tokens/" +infolder = wdir + "2_segs/" outfolder = wdir + "4_tagged/" -tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french" +tagger = tagger #tmw.call_treetagger(infolder, outfolder, tagger) ### make_lemmatext @@ -65,9 +89,15 @@ inpath = wdir + "4_tagged/*.trt" outfolder = wdir + "5_lemmata/" mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs -stoplist_errors = "./extras/fr_stopwords_errors.txt" # in tmw folder +stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # wdir #tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors) +### substitute +### Perform some preliminary tokenization. +inpath = wdir + "5_lemmata/*.txt" +outfolder = wdir + "5_substituted/" +substitutionsFile = wdir+"extras/fr_argot-substitutions.csv" +#tmw.substitute(inpath, substitutionsFile, outfolder) ################################ @@ -76,25 +106,25 @@ ### call_mallet_import ### Imports text data into the Mallet corpus format. -mallet_path = "/home/christof/Programs/Mallet/bin/mallet" -infolder = wdir + "5_lemmata/" +mallet_path = mallet_path +infolder = wdir + "5_substituted/" outfolder = wdir + "6_mallet/" outfile = outfolder + "corpus.mallet" -stoplist_project = "./extras/fr_stopwords_project.txt" # in tmw folder +stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder #tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project) ### call_mallet_model ### Performs the actual topic modeling. 
-mallet_path = "/home/christof/Programs/Mallet/bin/mallet" +mallet_path = mallet_path inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" -num_topics = "250" -optimize_interval = "100" -num_iterations = "5000" -num_top_words = "200" -doc_topics_max = num_topics -num_threads = "4" -#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) +numOfTopics = "250" # string +optimize_interval = "100" # string +num_iterations = "5000" # string +num_top_words = "100" # string +doc_topics_max = numOfTopics +num_threads = "4" # string +#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, numOfTopics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -103,24 +133,35 @@ ################################ ### create_mastermatrix -### Creates the mastermatrix with all information in one place. -corpuspath = wdir+"/2_segs/*.txt" +### Creates a matrix with all information (metadata and topic scores for +### each segment) in one place. +corpuspath = wdir+"2_segs/*.txt" outfolder = wdir+"7_aggregates/" mastermatrixfile = "mastermatrix.csv" -metadatafile = wdir+"/metadata.csv" -topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" -number_of_topics = 250 -#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) +metadatafile = wdir+"metadata.csv" +topics_in_texts = wdir+"6_mallet/topics-in-texts.csv" +numOfTopics = int(numOfTopics) +useBins = True # True|False +binDataFile = wdir+"7_aggregates/segs-and-bins.csv" +###tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. 
mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" -# targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration -targets = ["author-name", "author-gender", "title", "decade", "subgenre", - "idno", "segmentID", "narration", "protagonist-policier"] +targets = ["segmentID"] +#targets = ["subgenre", "author-name", "subsubgenre","decade", "narration", "setting", "author-gender", "title", "protagonist-policier"] +#targets = ["author", "author-gender", "title", "decade", "subgenre", +# "idno", "segmentID", "narration", "protagonist-policier", "binID"] #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) +### calculate_complexAverageTopicScores +### Based on the mastermatrix, calculates average topic scores for two target categories at once. +mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" +outfolder = wdir+"7_aggregates/" +targets = ["decade", "binID"] # 2 targets to combine +#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) + ### save_firstWords ### Saves the first words of each topic to a separate file. topicWordFile = wdir+"6_mallet/topics-with-words.csv" @@ -128,24 +169,32 @@ filename = "firstWords.csv" #tmw.save_firstWords(topicWordFile, outfolder, filename) +### save_topicRanks +### Saves the rank (in the overall scores) of each topic to a separate file. +topicWordFile = wdir+"6_mallet/topics-with-words.csv" +outfolder = wdir+"7_aggregates/" +filename = "topicRanks.csv" +tmw.save_topicRanks(topicWordFile, outfolder, filename) + ################################ -### VISUALIZATION ### +### BASIC VISUALIZATION ### ################################ ### make_wordle_from_mallet ### Creates a wordle for each topic. 
-word_weights_file = wdir + "6_mallet/" + "word-weights.txt" -topics = 250 +word_weights_file = wdir+"6_mallet/" + "word-weights.txt" +topicRanksFile = wdir + "7_aggregates/" + "topicRanks.csv" +numOfTopics = numOfTopics words = 40 -outfolder = wdir + "8_visuals/wordles/" -font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" +outfolder = wdir+"8_visuals/wordles/" +font_path = font_path dpi = 300 -#tmw.make_wordle_from_mallet(word_weights_file,topics,words,outfolder,font_path,dpi) +#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics, words,outfolder, topicRanksFile, font_path,dpi) ### crop_images -### Crops the wordle image files, use if needed. +### Optional. Crops the wordle image files. inpath = wdir + "8_visuals/wordles/*.png" outfolder = wdir + "8_visuals/wordles/" left = 225 # image start at the left @@ -156,75 +205,131 @@ ### plot_topTopics ### For each item from a category, creates a barchart of the top topics. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 250 # must be actual number of topics modeled. 
-targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] -# one or several: "author-name", "author-gender", "decade", "subgenre", "title" -topTopicsShown = 30 +averageDatasets = wdir+"7_aggregates/avg*.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +targetCategories = ["title"] +topTopicsShown = 16 +numOfTopics = numOfTopics fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 +mode = "normalized" #normalized|zscores|absolute outfolder = wdir+"/8_visuals/topTopics/" -#tmw.plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder) +tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder) -### plot_topItems +### plot_topItems ### ### For each topic, creates a barchart with top items from a category. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -outfolder = wdir+"/8_visuals/topItems/" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 250 # must be actual number of topics modeled. -targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender"] -# choose one or several from: author-name, decade, subgenre, gender, idno, title, segmentID -topItemsShown = 30 +averageDatasets = wdir+"7_aggregates/avg*.csv" +outfolder = wdir+"8_visuals/topItems/" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +numOfTopics = numOfTopics # must be actual number of topics modeled. 
+targetCategories = ["segmentID"] +topItemsShown = 20 fontscale = 0.8 height = 0 # 0=automatic and flexible dpi = 300 -#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numberOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) +#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) + + + +################################ +### ADVANCED VISUALIZATION ### +################################ -### plot_distinctiveness_heatmap +### plot_distinctiveness_heatmap ### ### For each category, make a heatmap of most distinctive topics. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/distinctiveness/" -targetCategories = ["author-name", "decade", "subgenre", "gender"] -# one or several: "author-name", "decade", "subgenre", "gender", "idno", "title" -numberOfTopics = 250 # must be actual number of topics modeled. +averageDatasets = wdir+"7_aggregates/avg*.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/distinctiveness/" +targetCategories = ["protagonist-policier"] +mode = "zscores" #normalized|zscores|absolute +numOfTopics = numOfTopics # actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 -#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) - -### plot_topicsOverTime -### Creates lineplots or areaplots for topic development over time. -averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/overTime/" -numberOfTopics = 250 # must be actual number of topics modeled. 
+#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, mode, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi) + +### plot_topicsOverTime ### +### +averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/overTime/" +numOfTopics = numOfTopics # actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic mode = "line" # area|line for areaplot or lineplot -topics = ["48","67","199"] # list of one or several topics -#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +topics = ["190", "6"] # list of one or several topics +#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) + +### topicClustering ### +# This function will create a dendrogram grouping topics based on their word weight similarity. +wordWeightsFile = wdir+"6_mallet/"+"word-weights.txt" +outfolder = wdir+"8_visuals/clustering/" +topicsToUse = numOfTopics # should be all topics. +wordsPerTopic = 50 +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse) + +### itemClustering ### +# This function creates a dendrogram of items in a category (authors, titles). +averageDatasets = wdir+"7_aggregates/avg*.csv" +figsize = (15,10) # width,height +outfolder = wdir+"8_visuals/clustering/" +topicsPerItem = 50 # can be set +sortingCriterium = "mean" # std|mean +targetCategories = ["author-name"] # list +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) + +### simpleProgression ### +### Creates a lineplot of topic development over textual progression. 
+averageDataset = wdir+"7_aggregates/avgtopicscores_by-binID.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/progression/simple/" +numOfTopics = numOfTopics # must be actual number of topics modeled. +fontscale = 1.0 +dpi = 300 +height = 0 # 0=automatic +mode = "sel" # all|sel +topics = ["25", "44", "12"] # if mode="sel": list of topics +#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) + +### complexProgression ### +### Creates a lineplot of topic development over textual progression, +### but does so separatedly for different target categories. +averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/progression/complex/" +numOfTopics = numOfTopics # must be actual number of topics modeled. +targetCategories = ["subgenre","binID"] # two values, corresponding to averageDataset +fontscale = 1.0 +dpi = 300 +height = 0 # for lineplot; 0=automatic +mode = "all" # all|sel ### only "all" is implemented ## +#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics) ################################ -### OTHER/OBSOLETE ### +### OTHER / OBSOLETE / DEV ### ################################ ### 5c show segment ## To read a specific segment, better than looking in the folder. -segmentID = "rf0546§000083" +segmentID = "rf1246§0048" # indicate here, manually outfolder = wdir+"/9_sel-segs/" #tmw.show_segment(wdir,segmentID, outfolder) -### 6b - create_topicscores_lineplot -inpath = wdir + "7_aggregates/*-lp.csv" # narrow down as needed -outfolder = wdir + "8_visuals/lineplots/" -topicwordfile = wdir + "6_mallet/topics-with-words.csv" -dpi = 300 -height = 0.050 -genres = ["detection","noir"] # User: set depending on metadata. 
Available: noir, detection, criminel, experim., archq., blanche, neopl., susp. -#tmw.create_topicscores_lineplot(inpath,outfolder,topicwordfile,dpi,height,genres) +### itemPCA ### CURRENTLY NOT WORKING +averageDatasets = wdir+"7_aggregates/avg*.csv" +figsize = (10,10) # width,height +outfolder = wdir+"8_visuals/clustering/" +topicsPerItem = 50 +sortingCriterium = "std" # std|mean +targetCategories = ["subgenre"] # list +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)