diff --git a/__pycache__/tmw.cpython-34.pyc b/__pycache__/tmw.cpython-34.pyc index 80f234c..229b7df 100644 Binary files a/__pycache__/tmw.cpython-34.pyc and b/__pycache__/tmw.cpython-34.pyc differ diff --git a/extras/fr_pretokenize_subs.csv b/extras/fr_pretokenize_subs.csv deleted file mode 100644 index 273c5a8..0000000 --- a/extras/fr_pretokenize_subs.csv +++ /dev/null @@ -1,157 +0,0 @@ -"string§To§Find","string§To§Replace" -’,' -J,"Je " -qu'elle,que elle -"’","'" -"J'","Je " -"j'","je " -"S'","Se " -"s'","se " -"C'","Ce " -"c'","ce " -"N'","Ne " -"n'","ne " -"D'","De " -"d'","de " -"L'","Le " -"l'","la " -"T'","tu " -"t'","tu " -"-le"," le" -"-moi"," moi" -"m'","me " -"M'","Me " -"-je"," je" -"-il"," il" -"-on"," on" -"-lui"," lui" -"-elle"," elle" -"-nous"," nous" -"-vous"," vous" -"-nous"," nous" -"-ce"," ce" -"-tu"," tu" -"-toi"," toi" -"jusqu'à'","jusque à" -"aujourd'hui","aujourdhui" -"-t","" -"-y"," y" -"-en"," en" -"-ci"," ci" -"-là"," là" -"Qu'","Que " -"qu'","que " -"-même"," même" -" Il "," il " -" Ils "," ils " -" Elles "," elles " -" Elle "," elle " -" Je "," je " -" Tu "," tu " -" Toi "," toi " -" Nous "," nous " -" Vous "," vous " -" Mais "," mais " -" Ne "," ne " -" Et "," et " -" Pourquoi "," pourquoi " -" Alors "," alors " -" Aussi "," aussi " -" Car "," car " -" Au "," au " -" Ses "," ses " -" Se "," se " -" Moi "," moi " -" Toute "," toute " -" Tout "," tout " -" Hier "," hier " -" Non "," non " -" Comme "," comme " -" Dans "," dans " -" Pour "," pour " -" Voilà "," voilà " -" Son "," son " -" Une "," une " -" Un "," un " -" Où "," où " -" De "," de " -" Qui "," qui " -" Depuis "," depuis " -" Ça "," ça " -" Sur "," sur " -" Ensuite "," ensuite " -" Puis "," puis " -" On "," on " -" Si "," si " -" Même "," même " -" Toutefois "," toutefois " -" Ainsi "," ainsi " -" Aucun "," aucun " -" Ce "," ce " -" Ces "," ces " -" Toutes "," toutes " -" En "," en " -" Après "," après " -" Quel "," quel " -" Quelle "," quelle " -" Quand "," quand " -" Celle 
"," celle " -" Puisque "," puisque " -" Tous "," tous " -" Dès "," dès " -" Cet "," cet " -" Lorsque "," lorsque " -" Lui "," lui " -" Sauf "," sauf " -" Moins "," moins " -" Encore "," encore " -" Cependant "," cependant " -" Comment "," comment " -" Assez "," assez " -" Ma "," ma " -" Quelques "," quelques " -" Leurs "," leurs " -" Ceux "," ceux " -" Par "," par " -" Devant "," devant " -" Bien "," bien " -" Personne "," personne " -" Près "," près " -" Avant "," avant " -" Rien "," rien " -" Partout "," partout " -" Pourtant "," pourtant " -" Déjà "," déjà " -" Enfin "," enfin " -" Maintenant "," maintenant " -" Quoi "," quoi " -" Eh "," eh " -" Ah "," ah " -" Oh "," oh " -" Jamais "," jamais " -" Mon "," mon " -" Cela "," cela " -" Du "," du " -" Oui "," oui " -" Ou "," ou " -" Sa "," sa " -" Celui "," celui " -" Cette "," cette " -" Des "," des " -" Naturellement "," naturellement " -" Sans "," sans " -" Vos "," vos " -" Votre "," votre " -" Notre "," notre " -" Peut-être "," peut-être " -" Mes "," mes " -" Celle "," celle " -" Tant "," tant " -" Demain "," demain " -" Qu "," que " -" qu "," que " -" quelqu "," quelque " -" jusqu "," jusque " -" Jusqu "," jusque " -" aujourd hui "," aujourd'hui " -" "," " diff --git a/extras/fr_stopwords_errors.txt b/extras/fr_stopwords_errors.txt deleted file mode 100644 index 331dfad..0000000 --- a/extras/fr_stopwords_errors.txt +++ /dev/null @@ -1,738 +0,0 @@ -a -à -abord -aboutissant -achille -adieu -afin -aglaé -aglante -ah -ahi -ai -aidant -aie -ai-je -ailler -ailleurs -ainsi -ais -aise -aise -al -alexandre -aller -alors -angélique -annibal -après -arlequin -arrivant -assez -assurément -as-tu -a-t-elle -a-t-il -a-t-on -attends -atys -au -aucun -aucune -aucuns -aujour -aujourd -aujourd'hui -auprès -aussi -aussitôt -autant -autre -autrement -autres -aux -avant -avec -avecque -avez-vous -avoir -avoir -baccarat -bailli -bajazet -barbier -bas -bazile -beaucoup -bégayait -bel -ben -bérénice -bian -biau -bien -bientôt -bizarre 
-blaise -bon -bonne -bous -bout -brousse -brute -c -ça -çà -cab -calo -canadien -capucin -car -cassandre -caton -ce -cé -ceci -cela -celle -celles -celui -cent -cent -cents -cents -cependant -certain -ces -ces -césar -cesse -cesse -cet -cette -ceux -chacun -chaque -chatouilleuse -che -chère -cheux -chez -chourineur -ci -cinq -cinquante -claudine -clémence -colette -colin -combien -comme -comment -contre -courant -crois-moi -croyez-moi -croyez-vous -cru -crus -crût -cynthia -d -da -d'abord -d'ailleurs -damis -dan -dans -daphné -davantage -de -dé -debout -début -dedans -dehors -déjà -demain -depuis -dernier -dernière -des -dès -descendant -désormais -dessus -deux -devant -di -dire -dis-je -dis-moi -dis-tu -dites-moi -dites-vous -dit-il -dix -dix-huit -dix-neuf -dix-sept -do -dois-je -dom -donc -dont -dorante -dos -douze -drès -droite -du -dur -écoutez-moi -effraya -effrayait -effrayant -effrayé -effrayée -effrayer -effrayés -effrayons -eh -élise -elle -elle-même -elles -elles-mêmes -embrun -émeri -en -encor -encore -enfin -ensemble -entendant -entier -entrait -entre -essai -essaya -essayai -essayais -essayait -essayant -essayé -essayer -essayez -est -est-ce -est-elle -est-il -es-tu -et -étai -état -été -êtes-vous -être -êtres -eu -eun -eune -eus -eûs -eut -eût -eux -eux-mêmes -fa -fade -faible -faire -faites-vous -falloir -fatmé -faut-il -fi -figaro -fil -fin -fis -fit -fois -folles -force -fort -fossinde -frontin -fur -fût -gauche -gerfaut -gille -gilles -glisse -goualeuse -grave -gris -guère -guise -ha -haut -hé -hélas -heureux -hi -hier -hippolyte -ho -holà -homme-là -hors -hui -huit -hylas -i -ici -ii -il -ils -in -indifférent -indispensable -insu -irai -isabelle -isolé -itou -j -jamais -jason -jaune -je -jé -jean -jeté -jj -joyeux -juan -jugé -jusq -jusqu -jusque -jusques -juste -justement -l -la -là -là-bas -là-dedans -là-dessus -laisse-moi -laisser -laissez-moi -large -le -lé -léandre -les -leur -leurs -levant -levé -levé -li -lire -lis -lisai -lisaient 
-lisais -lisait -lisant -lisette -lisez -lisons -loin -long -longtemps -lors -lorsq -lorsque -lucas -lucile -lui -lui-même -lut -ly -m -ma -mac -magnier -maintenant -mais -mal -malgré -manière -manqué -margot -marie -marié -marmouset -marton -mathurin -mauvais -me -mé -méchant -même -mêmes -mêmes -ménandre -mes -mettre -mi -mien -mienne -miens -miens -mieux -mille -mille -mine -mis -mise -moderne -moi -moi-même -moins -mon -monsieu -monsir -morgué -mot -moue -moujik -muet -muet -muette -n -ne -né -nérine -ni -no -noir -nommés -non -nos -notre -nôtre -nous -noute -nouveaux -nu -nue -nul -oh -on -ons -ont -onze -oronte -ose -ou -où -oublie -oui -ous -ouvrait -ouvrant -ouvrons -palmure -palsangué -paquier -par -parbleu -parce -pareil -parfait -pargué -parler -parmi -parole -parsonne -part -parton -partout -paru -pas -paya -paya -payai -payais -payait -payât -payé -payées -payer -payés -payez -payons -peignait -pendant -pensé -per -perdre -personne -personnes -peu -peut-être -peut-il -peut-on -pierre -pis -pleurer -plu -plupart -plus -plusieurs -plut -plutôt -point -pompée -porter -possible -pou -pour -pourquoi -pourtant -pourvu -poussa -pouvez-vous -pouvoir -premier -première -prendre -près -presque -priant -prie -pris -prise -promptement -puis -puis-je -puisq -puisque -pût -pyrrhus -qu -quaker -quakeresse -quand -quant -quarante -quatorze -quatre -quatre-vingt -quatre-vingt-dix -que -qué -quel -quelle -quelles -quelq -quelque -quelque -quelquefois -quelques -quelqu'un -quels -queu -queuque -qui -quinze -quinze -quoi -quoiq -quoique -raide -ram -ramené -ramener -ramener -reçois -refait -rendre -rendre -rénine -rentré -reste -rester -rian -riche -rien -rocambole -rosine -rouge -s -sa -sachem -sais-tu -sanche -sans -saurer -savez-vous -savoir -scipion -se -second -seconde -secouée -seize -selon -semble -sembler -sen -serra -ses -seul -seule -seulement -si -sien -sienne -signifiant -sis -sitot -six -soi-même -soixante -soixante-dix -sommes -son -songez -sont -sophie 
-sortant -sortir -soudain -sous -soutenant -souvent -ste -sti -suis -suis -suivre -sujet -sur -sûr -surtout -sylla -sylvanire -t -ta -taisez-vous -tandis -tant -tantôt -tartarin -tatigué -te -té -tel -téléga -telle -tellement -tels -tenez -tenir -tenons -tente -terrier -tes -thésée -tien -tienne -tiennent -tiens -tient -timar -tirant -tirer -tirinte -tirsis -toi -toi-même -tom -tombée -ton -tôt -toujours -tous -tout -toute -toutefois -toutes -travers -treize -tremblant -tremblante -tremble -trente -très -trois -trop -trouvé -trouver -tu -tullie -turc -un -une -unknown - -ur -ursuline -utile -v -vanda -vela -velà -venir -venu -vers -veux-tu -vi -viant -vingt -vint -vis -vit -vite -vivant -vla -vlà -voici -voilà -voir -vois-je -vois-je -vois-tu -voit -vont -vos -votre -vôtre -vouloir -vous -vous-même -voute -voyant -vraiment -vue -waterproof -y -zoé diff --git a/extras/fr_stopwords_project.txt b/extras/fr_stopwords_project.txt deleted file mode 100644 index 1a3f48d..0000000 --- a/extras/fr_stopwords_project.txt +++ /dev/null @@ -1,55 +0,0 @@ -air -ais -an -année -bras -brousse -chose -chott -côté -coup -doute -effet -état -été -façon -fait -femme -fois -fond -genre -gens -heure -homme -instant -jour -lieu -main -mal -mètre -milieu -moment -monde -nom -nu -nue -oeil -œil -parole -pas -peine -personne -personnes -petit -pied -place -sens -sorte -suite -temps -tête -tour -travers -un -vieux -voix diff --git a/tmw.py b/tmw.py index f84e357..d71a17d 100644 --- a/tmw.py +++ b/tmw.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# -*- coding: utf-8 -*- # Filename: tmw.py # Author: #cf @@ -79,6 +80,13 @@ def tei5reader_fulldocs(inpath, outfolder): print("Done.") +# Utility function for writing segments +def writesegment(segment, outfolder, filename, counter, mode="w"): + from os.path import join + segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") + with open(segname, mode) as output: + output.write(' '.join(segment)) + output.close() 
################################# # segmenter # @@ -100,25 +108,30 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve from os.path import join global currentsegmentsize global counter + # ignore empty segments if segment == ["\n"] or len(segment) < 1: return + # workaround for easy inter line-spacing in case of paragraph removal for lines combined into one segment if not preserveparagraphs and segment[-1] == "\n": segment = segment[0:len(segment) - 1] segment[-1] += " " segname = join(outfolder, filename + "§{:04d}".format(counter) + ".txt") relname = filename + "§{:04d}".format(counter) + ".txt" + # case: last segment is too small => fill with (slice of) new segment if currentsegmentsize * tolerancefactor < target: # min size limit not reached => split #split segment wordsliceindex = target - currentsegmentsize + # if it's too big: slice! if currentsegmentsize + len(segment) > target * tolerancefactor: - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(wordsliceindex) + "\t for a total of " + str((currentsegmentsize + wordsliceindex))) + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(wordsliceindex) + "\t for a total of " + str((currentsegmentsize + wordsliceindex))) write(segment[0:wordsliceindex], segname, "a") currentsegmentsize += wordsliceindex segment = segment[wordsliceindex:len(segment)] + # segment is filled. 
continue with next one counter += 1 currentsegmentsize = 0 @@ -128,18 +141,20 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve os.remove(segname) # else just add text to current segment else: - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) # segment fits so append write(segment, segname, "a") currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account! # done return + # case: new segment is too big # if segment > target: slice segment while len(segment) > target * tolerancefactor: - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(target) + "\t for a total of " + str((currentsegmentsize + target))) + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(target) + "\t for a total of " + str((currentsegmentsize + target))) write(segment[0:target], segname) segment = segment[target:len(segment)] + # segment is filled. continue with next one counter += 1 currentsegmentsize = 0 @@ -147,12 +162,15 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve relname = filename + "§{:04d}".format(counter) + ".txt" if os.path.isfile(segname): os.remove(segname) - #print(relname + "\t New segment with size \t0") + print(relname + "\t New segment with size \t0") + # now size of segment is < target if (len(segment) == 0): #segment was perfectly sliced so we are done return + # there's some part of segment left, write this into file + # if the remaining part is exceeding current segment's capacity start new segment if currentsegmentsize + len(segment) > target * tolerancefactor: # segment is filled. 
continue with next one @@ -162,17 +180,23 @@ def writesegment(segment, outfolder, filename, target, tolerancefactor, preserve relname = filename + "§{:04d}".format(counter) + ".txt" if os.path.isfile(segname): os.remove(segname) - #print(relname + "\t New segment with size \t0") - #print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) + + print(relname + "\t New segment with size \t0") + + print(relname + "\t Last segment size: " + str(currentsegmentsize) + "\t appending " + str(len(segment)) + "\t for a total of " + str((currentsegmentsize + len(segment)))) currentsegmentsize += len(segment) - segment.count("\n") # take possible segment end into account! write(segment, segname, "a") -def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs = False): - """Script for turning plain text files into equal-sized segments, with limited respect for paragraph boundaries.""" - print("\nLaunched segmenter.") +def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs): + """Script for turning plain text files into equal-sized segments, without respecting paragraph boundaries.""" + print("\nLaunched segmenter.") + import os + import re + from os import listdir from os.path import join from nltk.tokenize import word_tokenize + import glob if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -192,13 +216,15 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs os.remove(segname) # segment contains words assigned to the current segment segment = [] - # go thru paragraphs one by one + + # go through paragraphs one by one for line in infile: text = line - # remove special characters and space-chains - text = re.sub("[,;\.!?—\t\r\n\v\f]", " ", text) - text = re.sub("-", " ", text) + # (optional) remove punctuation, special characters and space-chains + #text = 
re.sub("[,;\.:!?¿\(\)—-]", " ", text) + text = re.sub("[\t\r\n\v\f]", " ", text) text = re.sub("[ ]{1,9}", " ", text) + # tokenize text words = word_tokenize(text) words.append("\n") @@ -211,7 +237,102 @@ def segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs # Binning # ################################# -# TODO: Rewrite entirely to make compatible with mastermatrix. +def segments_to_bins(inpath, outfolder, binsnb): + """Script for sorting text segments into bins.""" + print("\nLaunched segments_to_bins.") + + import math, sys + import os + import glob + from collections import Counter + import pandas as pd + + ### Define various objects for later use. + txtids = [] + segids = [] + + filenames = [] + binids = [] + + offset = sys.maxsize # used to track wrong segmenting (i.e. with segment numbering not starting with 0) + + ### Get filenames, text identifiers, segment identifiers. + for file in glob.glob(inpath): + filename = os.path.basename(file)[:-4] + txtid = filename[:6] + txtids.append(txtid) + segid = filename[-4:] + #print(filename, txtid, segid) + segids.append(segid) + offset = min(offset, int(segid)) + #txtids_sr = pd.Series(txtids) + #segids_sr = pd.Series(segids) + + if offset > 0: + print("Warning! Segment numbering should start at 0. Using offset: " + str(offset)) + + ### For each text identifier, get number of segments. + txtids_ct = Counter(txtids) + sum_segnbs = 0 + for txtid in txtids_ct: + segnb = txtids_ct[txtid] + #print(segnb) + sum_segnbs = sum_segnbs + segnb + #print(txtid, segnb) + print("Total number of segments: ", sum_segnbs) + + for txtid in txtids_ct: + countsegs = txtids_ct[txtid] + if binsnb > int(countsegs): + print("Warning! You are expecting more bins than segments available! Bins will not be filled continuously!") + + ### Match each filename to the number of segments of the text. 
+ + bcount = dict() + for i in range(0, binsnb): + bcount[i] = 0 + + for file in glob.glob(inpath): + filename = os.path.basename(file)[:-4] + for txtid in txtids_ct: + if txtid in filename: + filename = filename + "$" + str(txtids_ct[txtid]) + #print(filename) + + ### For each filename, compute and append bin number + txtid = filename[0:6] + segid = filename[7:11] + segnb = filename[12:] + #print(txtid,segid,segnb) + binid = "" + + segprop = (int(segid) - offset) / int(segnb) + #print(txtid, segid, segnb, segprop) + + + binid = math.floor(segprop * binsnb) + + if binid == binsnb: # avoid 1.0 beeing in seperate bin (should never happen due to offset!) + print("Error: Segment numbering is wrong! Continuing anyway...") + binid -= 1 + + bcount[binid] += 1 + + #print(segprop, binid) + + filenames.append(filename[:11]) + binids.append(binid) + filenames_sr = pd.Series(filenames, name="segmentID") + binids_sr = pd.Series(binids, name="binID") + files_and_bins = pd.concat([filenames_sr,binids_sr], axis=1) + print("chunks per bin: ", bcount) + + if not os.path.exists(outfolder): + os.makedirs(outfolder) + outfile = outfolder+"segs-and-bins.csv" + with open(outfile, "w") as outfile: + files_and_bins.to_csv(outfile, index=False) + @@ -227,13 +348,17 @@ def perform_multipleSubs(substitutionsFile, text): ## Load table and turn into dict with open(substitutionsFile, "r") as subsFile: subs = csv.reader(subsFile) - for rows in subs: - subsDict = {rows[0]:rows[1] for rows in subs} - #print(subsDict) + subsDict = {rows[0]:rows[1] for rows in subs} + for key, value in subsDict.items(): + text = re.sub(key, value, text) + #print(text) + return text + ## Create a regular expression from the dictionary keys - regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) + #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) ## For each match, look-up corresponding value in dictionary - return regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], 
text) + #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text) + #print(result) def pretokenize(inpath, substitutionsFile, outfolder): """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional.""" @@ -243,6 +368,8 @@ def pretokenize(inpath, substitutionsFile, outfolder): text = text.read() text = perform_multipleSubs(substitutionsFile, text) basename = os.path.basename(file) + if "truc" in text or "type" in text or "flic" in text: + print("Found bad word in", basename) cleanfilename = basename if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -331,10 +458,16 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors): elif "NOM" in pos or "VER" in pos or "ADJ" in pos or "ADV" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) elif mode == "esN": - if "|" in lemma: + if "|" in lemma and "NC" in pos: lemmata.append(token.lower()) elif "NC" in pos and "|" not in lemma and "" not in lemma: lemmata.append(lemma.lower()) + elif mode == "enNV": + if "NN" in pos or "VB" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) + elif mode == "enN": + if "NN" in pos and "|" not in lemma and "" not in lemma: + lemmata.append(lemma.lower()) ## Continue with list of lemmata, but remove undesired leftover words lemmata = ' '.join([word for word in lemmata if word not in stoplist]) lemmata = re.sub("[ ]{1,4}"," ", lemmata) @@ -347,19 +480,68 @@ def make_lemmatext(inpath, outfolder, mode, stoplist_errors): + + + +################################# +# substitute # +################################# + +import csv + +def multipleSubs(substitutionsFile, text): + """Search and replace from a table of string pairs.""" + ## With code from http://stackoverflow.com/users/735204/emmett-j-butler + ## Load table and turn into dict + with open(substitutionsFile, "r") as subsFile: + subs = csv.reader(subsFile) + subsDict = {rows[0]:rows[1] for rows in subs} + for 
key, value in subsDict.items(): + text = re.sub(key, value, text) + #print(text) + return text + + ## Create a regular expression from the dictionary keys + #regex = re.compile("(%s)" % "|".join(map(re.escape, subsDict.keys()))) + ## For each match, look-up corresponding value in dictionary + #result = regex.sub(lambda mo: subsDict[mo.string[mo.start():mo.end()]], text) + #print(result) + +def substitute(inpath, substitutionsFile, outfolder): + """Deletion of unwanted elided and hyphenated words for better tokenization in TreeTagger. Optional.""" + print("\nLaunched substitute.") + for file in glob.glob(inpath): + with open(file,"r") as text: + text = text.read() + text = multipleSubs(substitutionsFile, text) + basename = os.path.basename(file) + counter = 0 + if " truc " in text or " type " in text or " flic " in text: + counter +=1 + print(counter) + cleanfilename = basename + if not os.path.exists(outfolder): + os.makedirs(outfolder) + with open(os.path.join(outfolder, cleanfilename),"w") as output: + output.write(text) + print("Done.") + + + + + + ################################################################## ### TOPIC MODELLING WITH MALLET ### ################################################################## -# TODO: Concatenate two stoplists first, one for errors, one for deliberate ommissions. 
- ################################# # call_mallet_import # ################################# -def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_project): +def call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project): """Function to import text data into Mallet.""" print("\nLaunched call_mallet_import.") import subprocess @@ -380,14 +562,12 @@ def call_mallet_import(mallet_path, infolder,outfolder, outfile, stoplist_projec # call_mallet_modeling # ################################# -def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_interval,num_iterations,num_top_words,doc_topics_max): +def call_mallet_modeling(mallet_path, inputfile,outfolder,numOfTopics,optimize_interval,num_iterations,num_top_words,doc_topics_max): """Function to perform topic modeling with Mallet.""" print("\nLaunched call_mallet_modeling.") - ### Getting ready. import os import subprocess - if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -399,7 +579,7 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in output_topic_state = outfolder + "topic_state.gz" ### Constructing Mallet command from parameters. 
- command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ num_topics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state + command = mallet_path +" train-topics --input "+ inputfile +" --num-topics "+ numOfTopics +" --optimize-interval "+ optimize_interval +" --num-iterations " + num_iterations +" --num-top-words " + num_top_words +" --word-topic-counts-file "+ word_topics_counts_file + " --topic-word-weights-file "+ topic_word_weights_file +" --output-state topic-state.gz"+" --output-topic-keys "+ output_topic_keys +" --output-doc-topics "+ output_doc_topics +" --doc-topics-max "+ doc_topics_max + " --output-state " + output_topic_state #print(command) subprocess.call(command, shell=True) print("Done.\n") @@ -423,15 +603,15 @@ def call_mallet_modeling(mallet_path, inputfile,outfolder,num_topics,optimize_in import glob def get_metadata(metadatafile): - print(" Getting metadata...") + print("- getting metadata...") """Read metadata file and create DataFrame.""" metadata = pd.DataFrame.from_csv(metadatafile, header=0, sep=",") #print("metadata\n", metadata) return metadata -def get_topicscores(topics_in_texts, number_of_topics): +def get_topicscores(topics_in_texts, numOfTopics): """Create a matrix of segments x topics, with topic score values, from Mallet output.""" - print(" Getting topicscores...") + print("- getting topicscores...") ## Load Mallet output (strange format) topicsintexts = pd.read_csv(topics_in_texts, header=None, skiprows=[0], sep="\t", index_col=0) #topicsintexts = topicsintexts.iloc[0:100,] ### For testing only!! 
@@ -449,7 +629,7 @@ def get_topicscores(topics_in_texts, number_of_topics): scores = [] ## For each segment, get the topic number and its score i +=1 - for j in range(1,number_of_topics,2): + for j in range(1,numOfTopics,2): k = j+1 topic = topicsintexts.iloc[i,j] score = topicsintexts.iloc[i,k] @@ -471,7 +651,7 @@ def get_topicscores(topics_in_texts, number_of_topics): def get_docmatrix(corpuspath): """Create a matrix containing segments with their idnos.""" - print(" Getting docmatrix...") + print("- getting docmatrix...") ## Create dataframe with filenames of segments and corresponding idnos. segs = [] idnos = [] @@ -487,18 +667,18 @@ def get_docmatrix(corpuspath): return docmatrix def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, - number_of_topics): + numOfTopics): """Merges the three dataframes into one mastermatrix.""" - print(" Getting data...") + print("- getting data...") ## Get all necessary data. metadata = get_metadata(metadatafile) docmatrix = get_docmatrix(corpuspath) - topicscores = get_topicscores(topics_in_texts, number_of_topics) + topicscores = get_topicscores(topics_in_texts, numOfTopics) ## For inspection only. #print("Metadata\n", metadata.head()) #print("Docmatrix\n", docmatrix.head()) #print("topicscores\n", topicscores.head()) - print(" Merging data...") + print("- merging data...") ## Merge metadata and docmatrix, matching each segment to its metadata. mastermatrix = pd.merge(docmatrix, metadata, how="inner", on="idno") #print("mastermatrix: metadata and docmatrix\n", mastermatrix) @@ -510,16 +690,29 @@ def merge_data(corpuspath, metadatafile, topics_in_texts, mastermatrixfile, #print("mastermatrix: all three\n", mastermatrix.head()) return mastermatrix +def add_binData(mastermatrix, binDataFile): + print("- adding bin data...") + ## Read the information about bins + binData = pd.read_csv(binDataFile, sep=",") + #print(binData) + ## Merge existing mastermatrix and binData. 
+ mastermatrix = pd.merge(mastermatrix, binData, how="inner", on="segmentID") + #print(mastermatrix) + return mastermatrix + def create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, - topics_in_texts, number_of_topics): + topics_in_texts, numOfTopics, useBins, binDataFile): """Builds the mastermatrix uniting all information about texts and topic scores.""" - print("\nLaunched create_mastermatrix. (This could take a while.)") + print("\nLaunched create_mastermatrix.") + print("(Warning: This is very memory-intensive and may take a while.)") if not os.path.exists(outfolder): os.makedirs(outfolder) mastermatrix = merge_data(corpuspath, metadatafile, topics_in_texts, - mastermatrixfile, number_of_topics) + mastermatrixfile, numOfTopics) + if useBins == True: + mastermatrix = add_binData(mastermatrix, binDataFile) mastermatrix.to_csv(outfolder+mastermatrixfile, sep=",", encoding="utf-8") - print(" Saved mastermatrix. Segments and columns:", mastermatrix.shape) + print("Done. Saved mastermatrix. Segments and columns:", mastermatrix.shape) @@ -538,7 +731,10 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder): for target in targets: grouped = mastermatrix.groupby(target, axis=0) avg_topicscores = grouped.agg(np.mean) - avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if target != "year": + avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if target != "binID": + avg_topicscores = avg_topicscores.drop(["binID"], axis=1) #avg_topicscores = avg_topicscores.drop(["tei"], axis=1) ## Save grouped averages to CSV file for visualization. 
resultfilename = "avgtopicscores_by-"+target+".csv" @@ -549,6 +745,33 @@ def calculate_averageTopicScores(mastermatrixfile, targets, outfolder): print("Done.") +################################ +# complexAverageTopicScores # +################################ + +def calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder): + """Function to calculate average topic scores based on the mastermatrix.""" + print("\nLaunched calculate_complexAverageTopicScores.") + if not os.path.exists(outfolder): + os.makedirs(outfolder) + with open(mastermatrixfile, "r") as infile: + mastermatrix = pd.DataFrame.from_csv(infile, header=0, sep=",") + ## Calculate average topic scores for each target category + grouped = mastermatrix.groupby(targets, axis=0) + avg_topicscores = grouped.agg(np.mean) + if "year" not in targets: + avg_topicscores = avg_topicscores.drop(["year"], axis=1) + if "binID" not in targets: + avg_topicscores = avg_topicscores.drop(["binID"], axis=1) + #print(avg_topicscores) + ## Save grouped averages to CSV file for visualization. + identifierstring = '+'.join(map(str, targets)) + resultfilename = "complex-avgtopicscores_by-"+identifierstring+".csv" + resultfilepath = outfolder+resultfilename + avg_topicscores.to_csv(resultfilepath, sep=",", encoding="utf-8") + print("Done. Saved average topic scores for: "+identifierstring) + + ################################# # save_firstWords # @@ -575,7 +798,7 @@ def save_firstWords(topicWordFile, outfolder, filename): #firstWordsSeries.index.name = "topic" #firstWordsSeries = firstWordsSeries.rename(columns = {'two':'new_name'}) firstWordsSeries.reindex_axis(["firstwords"]) - print(firstWordsSeries) + #print(firstWordsSeries) ## Saving the file. 
if not os.path.exists(outfolder): os.makedirs(outfolder) @@ -585,6 +808,30 @@ def save_firstWords(topicWordFile, outfolder, filename): print("Done.") +################################# +# save_topicRanks # +################################# + +def save_topicRanks(topicWordFile, outfolder, filename): + """Save a list of topics with their rank by topic score.""" + print("Launched save_topicRanks.") + with open(topicWordFile, "r") as infile: + topicRanks = pd.read_csv(infile, sep="\t", header=None) + topicRanks = topicRanks.drop(2, axis=1) + topicRanks.rename(columns={0:"Number"}, inplace=True) + topicRanks.rename(columns={1:"Score"}, inplace=True) + #topicRanks.sort(columns=["Score"], ascending=False, inplace=True) + topicRanks["Rank"] = topicRanks["Score"].rank(ascending=False) + #print(topicRanks.head()) + ## Saving the file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + outfile = outfolder + filename + with open(outfile, "w") as outfile: + topicRanks.to_csv(outfile) + print("Done.") + + ################################################################## ### VISUALIZATION ### @@ -593,73 +840,77 @@ def save_firstWords(topicWordFile, outfolder, filename): import matplotlib.pyplot as plt - ################################# # make_wordle_from_mallet # ################################# -def make_wordle_from_mallet(word_weights_file,topics,words,outfolder, +from wordcloud import WordCloud +import random + +def read_mallet_output(word_weights_file): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + word_scores = pd.read_table(word_weights_file, header=None, sep="\t") + word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False]) + word_scores_grouped = word_scores.groupby(0) + #print(word_scores.head()) + return word_scores_grouped + +def get_wordlewords(words, word_weights_file, topic): + """Transform Mallet output for wordle generation.""" + topic_word_scores = 
read_mallet_output(word_weights_file).get_group(topic) + top_topic_word_scores = topic_word_scores.iloc[0:words] + topic_words = top_topic_word_scores.loc[:,1].tolist() + word_scores = top_topic_word_scores.loc[:,2].tolist() + wordlewords = "" + j = 0 + for word in topic_words: + word = word + score = word_scores[j] + j += 1 + wordlewords = wordlewords + ((word + " ") * score) + return wordlewords + +def get_color_scale(word, font_size, position, orientation, font_path, random_state=None): + """ Create color scheme for wordle.""" + return "hsl(245, 58%, 25%)" # Default. Uniform dark blue. + #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. + #return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blues for white background + +def get_topicRank(topic, topicRanksFile): + #print("getting topic rank.") + with open(topicRanksFile, "r") as infile: + topicRanks = pd.read_csv(infile, sep=",", index_col=0) + rank = int(topicRanks.iloc[topic]["Rank"]) + return rank + + +def make_wordle_from_mallet(word_weights_file, + numOfTopics,words,outfolder, + topicRanksFile, font_path, dpi): """Generate wordles from Mallet output, using the wordcloud module.""" print("\nLaunched make_wordle_from_mallet.") - - from wordcloud import WordCloud - import random - - if not os.path.exists(outfolder): - os.makedirs(outfolder) - - def read_mallet_output(word_weights_file): - """Reads Mallet output (topics with words and word weights) into dataframe.""" - word_scores = pd.read_table(word_weights_file, header=None, sep="\t") - word_scores = word_scores.sort(columns=[0,2], axis=0, ascending=[True, False]) - word_scores_grouped = word_scores.groupby(0) - #print(word_scores.head()) - return word_scores_grouped - - def get_wordlewords(words,topic): - """Transform Mallet output for wordle generation.""" - topic_word_scores = read_mallet_output(word_weights_file).get_group(topic) - top_topic_word_scores = topic_word_scores.iloc[0:words] - topic_words = 
top_topic_word_scores.loc[:,1].tolist() - word_scores = top_topic_word_scores.loc[:,2].tolist() - wordlewords = "" - j = 0 - for word in topic_words: - word = word - score = word_scores[j] - j += 1 - wordlewords = wordlewords + ((word + " ") * score) - return wordlewords - - def get_color_scale(word, font_size, position, orientation, random_state=None): - """ Create color scheme for wordle.""" - #return "hsl(0, 00%, %d%%)" % random.randint(80, 100) # Greys for black background. - return "hsl(221, 65%%, %d%%)" % random.randint(30, 35) # Dark blue for white background - -# TODO: pack this into a proper separate function. - - ## Creates the wordle visualisation, using results from the above functions. - for topic in range(0,topics): - ## Defines filename and title for the wordle image. - figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png" - figure_title = "topic "+ str(topic) + for topic in range(0,numOfTopics): ## Gets the text for one topic. - text = get_wordlewords(words,topic) - #print(text) - ## Generates, recolors and saves the wordcloud. - #original# wordcloud = WordCloud(background_color="white", margin=5).generate(text) - #font_path = "/home/christof/.fonts/AveriaSans-Regular.ttf" - wordcloud = WordCloud(font_path=font_path, background_color="white", margin=5).generate(text) + text = get_wordlewords(words, word_weights_file, topic) + wordcloud = WordCloud(font_path=font_path, width=600, height=400, background_color="white", margin=4).generate(text) default_colors = wordcloud.to_array() + rank = get_topicRank(topic, topicRanksFile) + figure_title = "topic "+ str(topic) + " ("+str(rank)+"/"+str(numOfTopics)+")" plt.imshow(wordcloud.recolor(color_func=get_color_scale, random_state=3)) plt.imshow(default_colors) plt.imshow(wordcloud) - plt.title(figure_title, fontsize=24) + plt.title(figure_title, fontsize=30) plt.axis("off") + + ## Saving the image file. 
+ if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "wordle_tp"+"{:03d}".format(topic) + ".png" plt.savefig(outfolder + figure_filename, dpi=dpi) plt.close() print("Done.") + def crop_images(inpath, outfolder, left, upper, right, lower): """ Function to crop wordle files.""" @@ -705,12 +956,22 @@ def get_targetItems(average, targetCategory): #print(targetItems) return(targetItems) -def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): +def get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item): """From average topic score data, select data to be plotted.""" #print(" Getting dataToPlot.") with open(average, "r") as infile: ## Read the average topic score data allData = pd.DataFrame.from_csv(infile, sep=",") + if mode == "normalized": # mean normalization + colmeans = allData.mean(axis=0) + allData = allData / colmeans + elif mode == "zscores": # zscore transformation + colmeans = allData.mean(axis=0) # ??? + colstd = allData.std(axis=0) #std for each topic + allData = (allData - colmeans) / colstd # = zscore transf. + + elif mode == "absolute": # absolute values + allData = allData allData = allData.T ## Add top topic words to table for display later firstWords = get_firstWords(firstWordsFile) @@ -723,30 +984,34 @@ def get_dataToPlot(average, firstWordsFile, topTopicsShown, item): #print(dataToPlot) return dataToPlot -def create_barchart_topTopics(dataToPlot, targetCategory, item, +def create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder): """Function to make a topTopics barchart.""" - print(" Creating plot for: "+item) + print(" Creating plot for: "+str(item)) ## Doing the plotting. 
dataToPlot.plot(kind="bar", legend=None) plt.setp(plt.xticks()[1], rotation=90, fontsize = 11) - plt.title("Top-Topics für: "+item, fontsize=15) - plt.ylabel("Scores", fontsize=13) + if mode == "normalized": + plt.title("Top-distinctive Topics für: "+str(item), fontsize=15) + plt.ylabel("normalized scores", fontsize=13) + elif mode == "absolute": + plt.title("Top-wichtigste Topics für: "+str(item), fontsize=15) + plt.ylabel("absolute scores", fontsize=13) plt.xlabel("Topics", fontsize=13) + plt.tight_layout() if height != 0: plt.ylim((0.000,height)) - plt.tight_layout() - + ## Saving the plot to disk. outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"topTopics_"+item+".png" + figure_filename = outfolder+"tT_"+mode+"-"+str(item)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() -def plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, - targetCategories, topTopicsShown, fontscale, +def plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, + targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder): """For each item in a category, plot the top n topics as a barchart.""" print("Launched plot_topTopics.") @@ -755,8 +1020,8 @@ def plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, if targetCategory in average: targetItems = get_targetItems(average, targetCategory) for item in targetItems: - dataToPlot = get_dataToPlot(average, firstWordsFile, topTopicsShown, item) - create_barchart_topTopics(dataToPlot, targetCategory, item, fontscale, height, dpi, outfolder) + dataToPlot = get_dataToPlot(average, firstWordsFile, mode, topTopicsShown, item) + create_barchart_topTopics(dataToPlot, targetCategory, mode, item, fontscale, height, dpi, outfolder) print("Done.") @@ -796,7 +1061,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, print(" Creating plot for topic: "+str(topic)) ## Doing the plotting. 
dataToPlot.plot(kind="bar", legend=None) - plt.title("Top "+targetCategory+" für topic "+str(topic)+" ("+str(firstWords)+")", fontsize=15) + plt.title("Top "+targetCategory+" für topic: "+str(firstWords), fontsize=15) plt.ylabel("Scores", fontsize=13) plt.xlabel(targetCategory, fontsize=13) plt.setp(plt.xticks()[1], rotation=90, fontsize = 11) @@ -808,7 +1073,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, outfolder = outfolder+targetCategory+"/" if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"topItems_"+str(topic)+".png" + figure_filename = outfolder+"tI_by-"+targetCategory+"-{:03d}".format(topic)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() @@ -816,7 +1081,7 @@ def create_topItems_barchart(dataToPlot, firstWords, targetCategory, topic, def plot_topItems(averageDatasets, outfolder, firstWordsFile, - numberOfTopics, + numOfTopics, targetCategories, topItemsShown, fontscale, @@ -828,7 +1093,7 @@ def plot_topItems(averageDatasets, for targetCategory in targetCategories: if targetCategory in average: print(" Plotting for: "+targetCategory) - topics = list(range(0,numberOfTopics)) + topics = list(range(0,numOfTopics)) for topic in topics: firstWords = get_topItems_firstWords(firstWordsFile, topic) @@ -857,7 +1122,7 @@ def plot_topItems(averageDatasets, # TODO: This next function could be merged with above. 
def get_heatmap_firstWords(firstWordsFile): """Function to load list of top topic words into dataframe.""" - #print(" Getting firstWords.") + print("- getting firstWords...") with open(firstWordsFile, "r") as infile: firstWords = pd.read_csv(infile, header=None) firstWords.drop(0, axis=1, inplace=True) @@ -865,30 +1130,66 @@ def get_heatmap_firstWords(firstWordsFile): #print(firstWords) return(firstWords) -def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, - numberOfTopics): +def get_heatmap_dataToPlot(average, mode, firstWordsFile, topTopicsShown, + numOfTopics): """From average topic score data, select data to be plotted.""" - #print(" Getting dataToPlot.") + print("- getting dataToPlot...") with open(average, "r") as infile: ## Read the average topic score data allScores = pd.DataFrame.from_csv(infile, sep=",") + if mode == "normalized": # mean normalization + colmeans = allScores.mean(axis=0) + allScores = allScores / colmeans + elif mode == "zscores": # zscore transformation + colmeans = allScores.mean(axis=0) # mean for each topic + allstd = allScores.std(axis=0) #std for entire df + allScores = (allScores - colmeans) / allstd # = zscore transf. + elif mode == "absolute": # absolute values + allScores = allScores allScores = allScores.T - ## Create subset of data based on target. 
- stdevs = allScores.std(axis=1) - allScores = pd.concat([allScores, stdevs], axis=1) - allScores = allScores.sort(columns=0, axis=0, ascending=False) + ## Add top topic words to table for display later + firstWords = get_heatmap_firstWords(firstWordsFile) + allScores.index = allScores.index.astype(np.int64) + allScores = pd.concat([allScores, firstWords], axis=1, join="inner") + #print(allScores) + ## Remove undesired columns: subsubgenre + #allScores = allScores.drop("adventure", axis=1) + #allScores = allScores.drop("autobiographical", axis=1) + #allScores = allScores.drop("blanche", axis=1) + #allScores = allScores.drop("education", axis=1) + #allScores = allScores.drop("fantastic", axis=1) + #allScores = allScores.drop("fantastique", axis=1) + #allScores = allScores.drop("historical", axis=1) + #allScores = allScores.drop("n.av.", axis=1) + #allScores = allScores.drop("nouveau-roman", axis=1) + #allScores = allScores.drop("sciencefiction", axis=1) + #allScores = allScores.drop("social", axis=1) + #allScores = allScores.drop("other", axis=1) + #allScores = allScores.drop("espionnage", axis=1) + #allScores = allScores.drop("thriller", axis=1) + #allScores = allScores.drop("neopolar", axis=1) + ## Remove undesired columns: protagonist-policier + #allScores = allScores.drop("crminal", axis=1) + #allScores = allScores.drop("mixed", axis=1) + #allScores = allScores.drop("witness", axis=1) + #allScores = allScores.drop("criminel", axis=1) + #allScores = allScores.drop("detection", axis=1) + #allScores = allScores.drop("victime", axis=1) + #allScores = allScores.drop("n.av.", axis=1) + ## Sort by standard deviation + standardDeviations = allScores.std(axis=1) + standardDeviations.name = "std" + allScores.index = allScores.index.astype(np.int64) + allScores = pd.concat([allScores, standardDeviations], axis=1) + allScores = allScores.sort(columns="std", axis=0, ascending=False) + allScores = allScores.drop("std", axis=1) someScores = allScores[0:topTopicsShown] - 
someScores = someScores.drop(0, axis=1) ## Necessary step to align dtypes of indexes for concat. someScores.index = someScores.index.astype(np.int64) #print("dtype firstWords: ", type(firstWords.index)) #print("dtype someScores: ", type(someScores.index)) #print("\n==intersection==\n",someScores.index.intersection(firstWords.index)) - ## Add top topic words to table for display later - firstWords = get_heatmap_firstWords(firstWordsFile) - dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner") - dataToPlot = dataToPlot.set_index("topicwords") - #print(dataToPlot) + dataToPlot = someScores.set_index("topicwords") ## Optionally, limit display to part of the columns #dataToPlot = dataToPlot.iloc[:,0:40] #print(dataToPlot) @@ -897,12 +1198,13 @@ def get_heatmap_dataToPlot(average, firstWordsFile, topTopicsShown, def create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, + mode, fontscale, dpi, outfolder): - + print("- doing the plotting...") sns.set_context("poster", font_scale=fontscale) - sns.heatmap(dataToPlot, annot=False, cmap="YlOrRd", square=False) + sns.heatmap(dataToPlot, annot=False, cmap="RdBu_r", square=False) # Nice: bone_r, copper_r, PuBu, OrRd, GnBu, BuGn, YlOrRd plt.title("Verteilung der Topic Scores", fontsize=20) plt.xlabel(targetCategory, fontsize=16) @@ -913,17 +1215,16 @@ def create_distinctiveness_heatmap(dataToPlot, ## Saving the plot to disk. 
if not os.path.exists(outfolder): os.makedirs(outfolder) - figure_filename = outfolder+"dist-heatmap_by-"+str(targetCategory)+".png" + figure_filename = outfolder+"dist-heatmap_"+mode+"-by-"+str(targetCategory)+".png" plt.savefig(figure_filename, dpi=dpi) plt.close() - - def plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, + mode, outfolder, targetCategories, - numberOfTopics, + numOfTopics, topTopicsShown, fontscale, dpi): @@ -932,18 +1233,20 @@ def plot_distinctiveness_heatmap(averageDatasets, for average in glob.glob(averageDatasets): for targetCategory in targetCategories: if targetCategory in average and targetCategory != "segmentID": - print(" Plotting for: "+targetCategory) - dataToPlot = get_heatmap_dataToPlot(average, + print("- working on: "+targetCategory) + dataToPlot = get_heatmap_dataToPlot(average, + mode, firstWordsFile, topTopicsShown, - numberOfTopics) + numOfTopics) create_distinctiveness_heatmap(dataToPlot, topTopicsShown, targetCategory, + mode, fontscale, dpi, outfolder) - + print("Done.") ################################# @@ -1023,14 +1326,14 @@ def create_overTime_areaplot(dataToPlot, outfolder, fontscale, topics, dpi): plt.close() def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, - numberOfTopics, fontscale, dpi, height, + numOfTopics, fontscale, dpi, height, mode, topics): """Function to plot development of topics over time using lineplots or areaplots.""" print("Launched plot_topicsOverTime.") if mode == "line": for average in glob.glob(averageDatasets): if "decade" in average: - entriesShown = numberOfTopics + entriesShown = numOfTopics dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics) create_overTime_lineplot(dataToPlot, outfolder, fontscale, @@ -1038,7 +1341,7 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, elif mode == "area": for average in glob.glob(averageDatasets): if "decade" in average: - entriesShown = numberOfTopics + entriesShown = 
numOfTopics dataToPlot = get_overTime_dataToPlot(average, firstWordsFile, entriesShown, topics) create_overTime_areaplot(dataToPlot, outfolder, fontscale, @@ -1048,11 +1351,529 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, +########################### +## topicClustering ### +########################### + +# TOOD: Add figsize and orientation parameters. +# TODO: Add "firstwords" as leaf labels instead of topic numbers. + +import scipy.cluster as sc + +def get_topWordScores(wordWeightsFile, WordsPerTopic): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + print("- getting topWordScores...") + wordScores = pd.read_table(wordWeightsFile, header=None, sep="\t") + wordScores = wordScores.sort(columns=[0,2], axis=0, ascending=[True, False]) + topWordScores = wordScores.groupby(0).head(WordsPerTopic) + #print(topWordScores) + return topWordScores + +def build_scoreMatrix(topWordScores, topicsToUse): + """Transform Mallet output for wordle generation.""" + print("- building score matrix...") + topWordScores = topWordScores.groupby(0) + listOfWordScores = [] + for topic,data in topWordScores: + if topic in list(range(0,topicsToUse)): + words = data.loc[:,1].tolist() + scores = data.loc[:,2].tolist() + wordScores = dict(zip(words, scores)) + wordScores = pd.Series(wordScores, name=topic) + listOfWordScores.append(wordScores) + scoreMatrix = pd.concat(listOfWordScores, axis=1) + scoreMatrix = scoreMatrix.fillna(10) + #print(scoreMatrix.head) + scoreMatrix = scoreMatrix.T + return scoreMatrix + +def perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder): + print("- performing clustering...") + distanceMatrix = sc.hierarchy.linkage(scoreMatrix, method=method, metric=metric) + #print(distanceMatrix) + plt.figure(figsize=(25,10)) + sc.hierarchy.dendrogram(distanceMatrix) + plt.setp(plt.xticks()[1], rotation=90, fontsize = 6) + plt.title("Topic-Clustering Dendrogramm", fontsize=20) + 
plt.ylabel("Distanz", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(wordsPerTopic)+" words", fontsize=16) + plt.tight_layout() + + ## Saving the image file. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "topic-clustering_"+metric+"-"+method+"-"+str(wordsPerTopic)+"words"+".png" + plt.savefig(outfolder + figure_filename, dpi=600) + plt.close() + + +def topicClustering(wordWeightsFile, wordsPerTopic, outfolder, + methods, metrics, topicsToUse): + """Display dendrogram of topic similarity using clustering.""" + print("\nLaunched topicClustering.") + ## Gets the necessary data: the word scores for each topic + topWordScores = get_topWordScores(wordWeightsFile, wordsPerTopic) + ## Turn the data into a dataframe for further processing + scoreMatrix = build_scoreMatrix(topWordScores, topicsToUse) + ## Do clustering on the dataframe + for method in methods: + for metric in metrics: + perform_topicClustering(scoreMatrix, method, metric, wordsPerTopic, outfolder) + print("Done.") + + + +########################### +## itemClustering ### +########################### + +# TOOD: Add orientation to parameters. 
+ +import scipy.cluster as sc + +def build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium): + """Reads Mallet output (topics with words and word weights) into dataframe.""" + print("- getting topWordScores...") + for averageFile in glob.glob(averageDatasets): + if targetCategory in averageFile: + itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",") + itemScores = itemScores.T + if sortingCriterium == "std": + itemScores["sorting"] = itemScores.std(axis=1) + elif sortingCriterium == "mean": + itemScores["sorting"] = itemScores.mean(axis=1) + itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) + itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] + itemScoreMatrix = itemScoreMatrix.T + """ + itemScoreMatrix = itemScoreMatrix.drop("Allais", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Audoux", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Barbara", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Barjavel", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Beckett", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bernanos", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bosco", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bourget", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Butor", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Camus", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Carco", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Celine", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Colette", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Darien", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Daudet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Delly", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Dombre", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Duras", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("ErckChat", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("FevalPP", axis=0) + itemScoreMatrix = 
itemScoreMatrix.drop("MduGard", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Mirbeau", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Ohnet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Perec", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Proust", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Queneau", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Rodenbach", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Rolland", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Roussel", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("SaintExupery", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Sand", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Aimard", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("AimardAuriac", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Balzac", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Bon", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Echenoz", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Flaubert", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Fleuriot", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("France", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Galopin", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gary", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GaryAjar", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GaryBogat", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("GarySinibaldi", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gautier", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Giono", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Gouraud", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Huysmans", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Hugo", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeClezio", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Loti", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Malot", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Mary", axis=0) + itemScoreMatrix = 
itemScoreMatrix.drop("Maupassant", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Modiano", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("RobbeGrillet", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Stolz", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Sue", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Tournier", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Verne", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Vian", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("VianSullivan", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Zola", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Malraux", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Simon", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeRouge", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("LeRougeGuitton", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Toussaint", axis=0) + itemScoreMatrix = itemScoreMatrix.drop("Khadra", axis=0) + """ + #print(itemScoreMatrix) + return itemScoreMatrix + +def perform_itemClustering(itemScoreMatrix, targetCategory, method, metric, + topicsPerItem, sortingCriterium, figsize, outfolder): + print("- performing clustering...") + + ## Perform the actual clustering + itemDistanceMatrix = sc.hierarchy.linkage(itemScoreMatrix, method=method, metric=metric) + + ## Plot the distance matrix as a dendrogram + plt.figure(figsize=figsize) # TODO: this could be a a parameter. + itemLabels = itemScoreMatrix.index.values + sc.hierarchy.dendrogram(itemDistanceMatrix, labels=itemLabels, orientation="top") + + ## Format items labels to x-axis tick labels + plt.setp(plt.xticks()[1], rotation=90, fontsize = 14) + plt.title("Item Clustering Dendrogramm: "+targetCategory, fontsize=20) + plt.ylabel("Distance", fontsize=16) + plt.xlabel("Parameter: "+method+" clustering - "+metric+" distance - "+str(topicsPerItem)+" topics", fontsize=16) + plt.tight_layout() + + ## Save the image file. 
+ print("- saving image file.") + if not os.path.exists(outfolder): + os.makedirs(outfolder) + figure_filename = "item-clustering_"+targetCategory+"_"+metric+"-"+method+"-"+sortingCriterium+"-"+str(topicsPerItem)+"topics"+".jpg" + plt.savefig(outfolder + figure_filename, dpi=600) + plt.close() + +def itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, + targetCategories, methods, metrics, sortingCriterium): + """Display dendrogram of topic-based item similarity using clustering.""" + print("\nLaunched itemClustering.") + for targetCategory in targetCategories: + ## Load topic scores per itema and turn into score matrix + itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium) + ## Do clustering on the dataframe + for method in methods: + for metric in metrics: + perform_itemClustering(itemScoreMatrix, targetCategory, + method, metric, topicsPerItem, + sortingCriterium, figsize, outfolder) + print("Done.") + + + + +########################### +## simple progression ### +########################### + + +def get_progression_firstWords(firstWordsFile): + """Function to load list of top topic words into dataframe.""" + #print(" Getting firstWords.") + with open(firstWordsFile, "r") as infile: + firstWords = pd.read_csv(infile, header=None) + firstWords.drop(0, axis=1, inplace=True) + firstWords.rename(columns={1:"topicwords"}, inplace=True) + firstWords.index = firstWords.index.astype(np.int64) + #print(firstWords) + return(firstWords) + + +def get_selSimpleProgression_dataToPlot(averageDataset, firstWordsFile, + entriesShown, topics): + """Function to build a dataframe with all data necessary for plotting.""" + print("- getting data to plot...") + with open(averageDataset, "r") as infile: + allScores = pd.DataFrame.from_csv(infile, sep=",") + allScores = allScores.T + #print(allScores.head()) + ## Select the data for selected topics + someScores = allScores.loc[topics,:] + someScores.index = 
someScores.index.astype(np.int64) + ## Add information about the firstWords of topics + firstWords = get_progression_firstWords(firstWordsFile) + dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner") + dataToPlot = dataToPlot.set_index("topicwords") + dataToPlot = dataToPlot.T + #print(dataToPlot) + return dataToPlot + + +def create_selSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, + topics, dpi, height): + """This function does the actual plotting and saving to disk.""" + print("- creating the plot...") + ## Plot the selected data + dataToPlot.plot(kind="line", lw=3, marker="o") + plt.title("Entwicklung ausgewählter Topics über den Textverlauf", fontsize=20) + plt.ylabel("Topic scores (absolut)", fontsize=16) + plt.xlabel("Textabschnitte", fontsize=16) + plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) + if height != 0: + plt.ylim((0.000,height)) + + ## Saving the plot to disk. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + ## Format the topic information for display + topicsLabel = "-".join(str(topic) for topic in topics) + figure_filename = outfolder+"sel_"+topicsLabel+".png" + plt.savefig(figure_filename, dpi=dpi) + plt.close() + +def get_allSimpleProgression_dataToPlot(averageDataset, firstWordsFile, + entriesShown, topic): + """Function to build a dataframe with all data necessary for plotting.""" + print("- getting data to plot...") + with open(averageDataset, "r") as infile: + allScores = pd.DataFrame.from_csv(infile, sep=",") + allScores = allScores.T + #print(allScores) + ## Select the data for current topics + someScores = allScores.loc[topic,:] + someScores.index = someScores.index.astype(np.int64) + dataToPlot = someScores + #print(dataToPlot) + return dataToPlot + +# TODO: Make sure this is only read once and then select when plotting. 
+ + +def create_allSimpleProgression_lineplot(dataToPlot, outfolder, fontscale, + firstWordsFile, topic, dpi, height): + """This function does the actual plotting and saving to disk.""" + print("- creating the plot for topic " + topic) + ## Get the first words info for the topic + firstWords = get_progression_firstWords(firstWordsFile) + topicFirstWords = firstWords.iloc[int(topic),0] + #print(topicFirstWords) + ## Plot the selected data + dataToPlot.plot(kind="line", lw=3, marker="o") + plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20) + plt.ylabel("Topic scores (absolut)", fontsize=16) + plt.xlabel("Textabschnitte", fontsize=16) + plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) + if height != 0: + plt.ylim((0.000,height)) + + ## Saving the plot to disk. + if not os.path.exists(outfolder): + os.makedirs(outfolder) + ## Format the topic information for display + topicsLabel = str(topic) + figure_filename = outfolder+"all_"+topicsLabel+".png" + plt.savefig(figure_filename, dpi=dpi) + plt.close() + + +def simpleProgression(averageDataset, firstWordsFile, outfolder, + numOfTopics, + fontscale, dpi, height, mode, topics): + """Function to plot topic development over textual progression.""" + print("Launched textualProgression.") + if mode == "selected" or mode == "sel": + entriesShown = numOfTopics + dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topics) + create_selSimpleProgression_lineplot(dataToPlot, outfolder, + fontscale, topics, + dpi, height) + elif mode == "all": + entriesShown = numOfTopics + topics = list(range(0, numOfTopics)) + for topic in topics: + topic = str(topic) + dataToPlot = get_allSimpleProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topic) + create_allSimpleProgression_lineplot(dataToPlot, outfolder, + fontscale, firstWordsFile, + topic, dpi, height) + else: + print("Please select a valid value for 'mode'.") + print("Done.") + + + 
##################################################################
### OTHER / OBSOLETE / DEV ###
##################################################################


###########################
## complex progression ### IN DEVELOPMENT
###########################


def get_selComplexProgression_dataToPlot(averageDataset, firstWordsFile,
                                         entriesShown, topics):
    """Build the dataframe needed for plotting the selected topics.

    Reads the average-scores table, keeps only the requested topics and
    labels them with their first words. Returns a DataFrame with one column
    per topic (indexed by text section).
    """
    print("- getting data to plot...")
    ## pd.DataFrame.from_csv was deprecated (removed in pandas 1.0);
    ## read_csv with an explicit index column is the replacement.
    with open(averageDataset, "r") as infile:
        allScores = pd.read_csv(infile, sep=",", index_col=0)
    allScores = allScores.T
    ## Select the data for the requested topics.
    someScores = allScores.loc[topics, :]
    someScores.index = someScores.index.astype(np.int64)
    ## Add the firstWords labels and use them as the index.
    firstWords = get_progression_firstWords(firstWordsFile)
    dataToPlot = pd.concat([someScores, firstWords], axis=1, join="inner")
    dataToPlot = dataToPlot.set_index("topicwords")
    dataToPlot = dataToPlot.T
    return dataToPlot


def create_selComplexProgression_lineplot(dataToPlot, outfolder, fontscale,
                                          topics, dpi, height):
    """Plot the selected topics' progression and save the figure to disk."""
    print("- creating the plot...")
    ## Plot the selected data.
    dataToPlot.plot(kind="line", lw=3, marker="o")
    plt.title("Entwicklung ausgewählter Topics über den Textverlauf", fontsize=20)
    plt.ylabel("Topic scores (absolut)", fontsize=16)
    plt.xlabel("Textabschnitte", fontsize=16)
    plt.setp(plt.xticks()[1], rotation=0, fontsize=14)
    if height != 0:
        plt.ylim((0.000, height))
    ## Save the plot to disk. exist_ok avoids the check-then-create race.
    os.makedirs(outfolder, exist_ok=True)
    ## Format the topic information for the filename.
    topicsLabel = "-".join(str(topic) for topic in topics)
    figure_filename = outfolder+"sel_"+topicsLabel+".png"
    plt.savefig(figure_filename, dpi=dpi)
    plt.close()


def get_allComplexProgression_dataToPlot(averageDataset, firstWordsFile,
                                         entriesShown, topic, targetCategories):
    """Build the dataframe needed for plotting one topic.

    Returns the two target-category columns followed by the score column of
    the requested topic. firstWordsFile and entriesShown are unused here but
    kept for interface symmetry with the simple-progression variant.
    """
    print("- getting data to plot...")
    ## pd.DataFrame.from_csv was deprecated (removed in pandas 1.0);
    ## read_csv without index_col matches the old index_col=None call.
    with open(averageDataset, "r") as infile:
        allScores = pd.read_csv(infile, sep=",")
    ## Select the two category columns and the current topic's scores.
    target1data = allScores.loc[:, targetCategories[0]]
    target2data = allScores.loc[:, targetCategories[1]]
    topicScores = allScores.loc[:, topic]
    dataToPlot = pd.concat([target1data, target2data, topicScores], axis=1)
    return dataToPlot

# TODO: Make sure this is only read once and then select when plotting.
+ + +def create_allComplexProgression_lineplot(dataToPlot, targetCategories, + outfolder, fontscale, + firstWordsFile, topic, dpi, height): + """This function does the actual plotting and saving to disk.""" + print("- creating the plot for topic " + topic) + ## Get the first words info for the topic + firstWords = get_progression_firstWords(firstWordsFile) + topicFirstWords = firstWords.iloc[int(topic),0] + #print(topicFirstWords) + ## Split plotting data into parts (for target1) + target1data = dataToPlot.iloc[:,0] + #print(target1data) + numPartialData = len(set(target1data)) + ## Initialize plot for several lines + completeData = [] + #print(dataToPlot) + for target in set(target1data): + #print(" - plotting "+target) + partialData = dataToPlot.groupby(targetCategories[0]) + partialData = partialData.get_group(target) + partialData.rename(columns={topic:target}, inplace=True) + partialData = partialData.iloc[:,2:3] + completeData.append(partialData) + #print(completeData) + ## Plot the selected data, one after the other + plt.figure() + plt.figure(figsize=(15,10)) + for i in range(0, numPartialData): + #print(completeData[i]) + label = completeData[i].columns.values.tolist() + label = str(label[0]) + plt.plot(completeData[i], lw=4, marker="o", label=label) + plt.legend() + plt.title("Entwicklung über den Textverlauf für "+topicFirstWords, fontsize=20) + plt.ylabel("Topic scores (absolut)", fontsize=16) + plt.xlabel("Textabschnitte", fontsize=16) + plt.legend() + plt.locator_params(axis = 'x', nbins = 10) + plt.setp(plt.xticks()[1], rotation=0, fontsize = 14) + if height != 0: + plt.ylim((0.000,height)) + + ## Saving the plot to disk. 
+ if not os.path.exists(outfolder): + os.makedirs(outfolder) + ## Format the topic information for display + topicsLabel = str(topic) + figure_filename = outfolder+"all_"+str(targetCategories[0])+"-"+topicsLabel+".png" + plt.savefig(figure_filename, dpi=dpi) + plt.close() + + +def complexProgression(averageDataset, + firstWordsFile, + outfolder, + numOfTopics, + targetCategories, + fontscale, + dpi, height, + mode, topics): + """Function to plot topic development over textual progression.""" + print("Launched complexProgression.") + if mode == "sel": + entriesShown = numOfTopics + dataToPlot = get_selSimpleProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topics) + create_selSimpleProgression_lineplot(dataToPlot, + outfolder, + fontscale, + topics, + dpi, height) + elif mode == "all": + entriesShown = numOfTopics + topics = list(range(0, numOfTopics)) + for topic in topics: + topic = str(topic) + dataToPlot = get_allComplexProgression_dataToPlot(averageDataset, + firstWordsFile, + entriesShown, + topic, + targetCategories) + create_allComplexProgression_lineplot(dataToPlot, targetCategories, + outfolder, + fontscale, firstWordsFile, + topic, dpi, height) + else: + print("Please select a valid value for 'mode'.") + print("Done.") + + + ########################### ## show_segment ### @@ -1063,4 +1884,62 @@ def plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, def show_segment(wdir,segmentID, outfolder): if not os.path.exists(outfolder): os.makedirs(outfolder) - shutil.copyfile(wdir+"2_segs/"+segmentID+".txt",outfolder+segmentID+".txt") \ No newline at end of file + shutil.copyfile(wdir+"2_segs/"+segmentID+".txt",outfolder+segmentID+".txt") + + + + +########################### +## itemPCA ### IN DEVELOPMENT +########################### + +from sklearn.decomposition import PCA + +#def build_itemScoreMatrix(averageDatasets, targetCategory, +# topicsPerItem, sortingCriterium): +# """Reads Mallet output (topics with words and word 
weights) into dataframe.""" +# print("- building item score matrix...") +# for averageFile in glob.glob(averageDatasets): +# if targetCategory in averageFile: +# itemScores = pd.read_table(averageFile, header=0, index_col=0, sep=",") +# itemScores = itemScores.T +# if sortingCriterium == "std": +# itemScores["sorting"] = itemScores.std(axis=1) +# elif sortingCriterium == "mean": +# itemScores["sorting"] = itemScores.mean(axis=1) +# itemScores = itemScores.sort(columns=["sorting"], axis=0, ascending=False) +# itemScoreMatrix = itemScores.iloc[0:topicsPerItem,0:-1] +# itemScoreMatrix = itemScoreMatrix.T +# #print(itemScoreMatrix) +# return itemScoreMatrix + +def perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, + sortingCriterium, figsize, outfolder): + print("- doing the PCA...") + itemScoreMatrix = itemScoreMatrix.T + targetDimensions = 2 + pca = PCA(n_components=targetDimensions) + pca = pca.fit(itemScoreMatrix) + pca = pca.transform(itemScoreMatrix) +# plt.scatter(pca[0,0:20], pca[1,0:20]) + for i in list(range(0,len(pca)-1)): + plt.scatter(pca[i,:], pca[i+1,:]) + + +def itemPCA(averageDatasets, targetCategories, + topicsPerItem, sortingCriterium, figsize, outfolder): + """Function to perform PCA on per-item topic scores and plot the result.""" + print("Launched itemPCA.") + for targetCategory in targetCategories: + ## Load topic scores per item and turn into score matrix + ## (Using the function from itemClustering above!) + itemScoreMatrix = build_itemScoreMatrix(averageDatasets, targetCategory, + topicsPerItem, sortingCriterium) + ## Do clustering on the dataframe + perform_itemPCA(itemScoreMatrix, targetCategory, topicsPerItem, sortingCriterium, figsize, outfolder) + print("Done.") + + + + + diff --git a/tmw_config.py b/tmw_config.py index 60ec739..d581ab8 100644 --- a/tmw_config.py +++ b/tmw_config.py @@ -12,17 +12,37 @@ # For information on requirements and usage, see the README file. # This config file is structured as follows: +# 0. 
General Settings # 1. Preprocessing Texts # 2. Topic Modeling # 3. Posprocessing Data -# 4. Visualization -# 5. Other / Obsolete +# 4. Basic Visualizations +# 5. Advanced Visualizations +# 6. Other / Obsolete / in development + +# You may find a tutorial explaining the purpose of each function +# as well as its input, output and other parameters at: +# https://www.penflip.com/c.schoech/tmw-tutorial + + +################################ +### GENERAL SETTINGS ### +################################ + +### The following settings depend on the system used. +### Path to the working directory. +wdir = "/home/" # end with slash. +### Path to the TreeTagger file (language-dependent!) +tagger = "/home/[USER]/Programs/TreeTagger/cmd/tree-tagger-french" +### Path to Mallet installation directory +mallet_path = "/home/[USER]/Programs/Mallet/bin/mallet" +### Path to the font for wordle generation +font_path = "/home/[USER]/.fonts/AlegreyaSans-Regular.otf" import tmw #print(help(topmod)) -### Set the general working directory. -wdir = "/home/christof/Dropbox/0-Analysen/2015/hybrid/rf740c/" # end with slash. + ################################ ### PREPROCESSING TEXTS ### @@ -39,25 +59,29 @@ inpath = wdir + "1_txt/*.txt" outfolder = wdir + "2_segs/" target = 600 -sizetolerancefactor = 1.1 # 1 = exact target; >1 = with some tolerance (1.1 = +/- 10%). -preserveparagraphs = True # True|False +sizetolerancefactor = 1.1 +preserveparagraphs = True #tmw.segmenter(inpath, outfolder, target, sizetolerancefactor, preserveparagraphs) -### segments_to_bins: inpath, outfile -### Currently not implemented any more / yet. +### segments_to_bins +### Assign each segment to one bin over textual progression. +inpath = wdir + "2_segs/*.txt" +outfolder = wdir + "7_aggregates/" +binsnb = 3 # number of bins +#tmw.segments_to_bins(inpath,outfolder, binsnb) ### pretokenize ### Perform some preliminary tokenization. 
-inpath = wdir + "2_test/*.txt" -substitutionsFile = "./extras/fr_pretokenize_subs.csv" -outfolder = wdir + "3_test/" -tmw.pretokenize(inpath, substitutionsFile, outfolder) +inpath = wdir + "2_segs/*.txt" +outfolder = wdir + "3_tokens/" +substitutionsFile = wdir+"extras/fr_pretokenize_subs.csv" +#tmw.pretokenize(inpath, substitutionsFile, outfolder) ### call_treetagger ### Perform lemmatization and POS tagging. -infolder = wdir + "3_tokens/" +infolder = wdir + "2_segs/" outfolder = wdir + "4_tagged/" -tagger = "/home/christof/Programs/TreeTagger/cmd/tree-tagger-french" +tagger = tagger #tmw.call_treetagger(infolder, outfolder, tagger) ### make_lemmatext @@ -65,9 +89,15 @@ inpath = wdir + "4_tagged/*.trt" outfolder = wdir + "5_lemmata/" mode = "frN" # frN=nouns, esN=nouns, frNV=nouns+verbs, frNVAA=nouns+verbs+adj+adverbs -stoplist_errors = "./extras/fr_stopwords_errors.txt" # in tmw folder +stoplist_errors = wdir+"extras/fr_stopwords_errors.txt" # wdir #tmw.make_lemmatext(inpath, outfolder, mode, stoplist_errors) +### substitute +### Perform some preliminary tokenization. +inpath = wdir + "5_lemmata/*.txt" +outfolder = wdir + "5_substituted/" +substitutionsFile = wdir+"extras/fr_argot-substitutions.csv" +#tmw.substitute(inpath, substitutionsFile, outfolder) ################################ @@ -76,25 +106,25 @@ ### call_mallet_import ### Imports text data into the Mallet corpus format. -mallet_path = "/home/christof/Programs/Mallet/bin/mallet" -infolder = wdir + "5_lemmata/" +mallet_path = mallet_path +infolder = wdir + "5_substituted/" outfolder = wdir + "6_mallet/" outfile = outfolder + "corpus.mallet" -stoplist_project = "./extras/fr_stopwords_project.txt" # in tmw folder +stoplist_project = wdir+"extras/fr_stopwords_project.txt" # in tmw folder #tmw.call_mallet_import(mallet_path, infolder, outfolder, outfile, stoplist_project) ### call_mallet_model ### Performs the actual topic modeling. 
-mallet_path = "/home/christof/Programs/Mallet/bin/mallet" +mallet_path = mallet_path inputfile = wdir + "6_mallet/corpus.mallet" outfolder = wdir + "6_mallet/" -num_topics = "250" -optimize_interval = "100" -num_iterations = "5000" -num_top_words = "200" -doc_topics_max = num_topics -num_threads = "4" -#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, num_topics, optimize_interval, num_iterations, num_top_words, doc_topics_max) +numOfTopics = "250" # string +optimize_interval = "100" # string +num_iterations = "5000" # string +num_top_words = "100" # string +doc_topics_max = numOfTopics +num_threads = "4" # string +#tmw.call_mallet_modeling(mallet_path, inputfile, outfolder, numOfTopics, optimize_interval, num_iterations, num_top_words, doc_topics_max) @@ -103,24 +133,35 @@ ################################ ### create_mastermatrix -### Creates the mastermatrix with all information in one place. -corpuspath = wdir+"/2_segs/*.txt" +### Creates a matrix with all information (metadata and topic scores for +### each segment) in one place. +corpuspath = wdir+"2_segs/*.txt" outfolder = wdir+"7_aggregates/" mastermatrixfile = "mastermatrix.csv" -metadatafile = wdir+"/metadata.csv" -topics_in_texts = wdir+"/6_mallet/topics-in-texts.csv" -number_of_topics = 250 -#tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, number_of_topics) +metadatafile = wdir+"metadata.csv" +topics_in_texts = wdir+"6_mallet/topics-in-texts.csv" +numOfTopics = int(numOfTopics) +useBins = True # True|False +binDataFile = wdir+"7_aggregates/segs-and-bins.csv" +###tmw.create_mastermatrix(corpuspath, outfolder, mastermatrixfile, metadatafile, topics_in_texts, numOfTopics, useBins, binDataFile) ### calculate_averageTopicScores ### Based on the mastermatrix, calculates various average topic score datasets. 
mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" outfolder = wdir+"7_aggregates/" -# targets: one or several:author|decade|subgenre|author-gender|idno|segmentID|narration -targets = ["author-name", "author-gender", "title", "decade", "subgenre", - "idno", "segmentID", "narration", "protagonist-policier"] +targets = ["segmentID"] +#targets = ["subgenre", "author-name", "subsubgenre","decade", "narration", "setting", "author-gender", "title", "protagonist-policier"] +#targets = ["author", "author-gender", "title", "decade", "subgenre", +# "idno", "segmentID", "narration", "protagonist-policier", "binID"] #tmw.calculate_averageTopicScores(mastermatrixfile, targets, outfolder) +### calculate_complexAverageTopicScores +### Based on the mastermatrix, calculates average topic scores for two target categories at once. +mastermatrixfile = wdir+"/7_aggregates/mastermatrix.csv" +outfolder = wdir+"7_aggregates/" +targets = ["decade", "binID"] # 2 targets to combine +#tmw.calculate_complexAverageTopicScores(mastermatrixfile, targets, outfolder) + ### save_firstWords ### Saves the first words of each topic to a separate file. topicWordFile = wdir+"6_mallet/topics-with-words.csv" @@ -128,24 +169,32 @@ filename = "firstWords.csv" #tmw.save_firstWords(topicWordFile, outfolder, filename) +### save_topicRanks +### Saves the rank (in the overall scores) of each topic to a separate file. +topicWordFile = wdir+"6_mallet/topics-with-words.csv" +outfolder = wdir+"7_aggregates/" +filename = "topicRanks.csv" +tmw.save_topicRanks(topicWordFile, outfolder, filename) + ################################ -### VISUALIZATION ### +### BASIC VISUALIZATION ### ################################ ### make_wordle_from_mallet ### Creates a wordle for each topic. 
-word_weights_file = wdir + "6_mallet/" + "word-weights.txt" -topics = 250 +word_weights_file = wdir+"6_mallet/" + "word-weights.txt" +topicRanksFile = wdir + "7_aggregates/" + "topicRanks.csv" +numOfTopics = numOfTopics words = 40 -outfolder = wdir + "8_visuals/wordles/" -font_path = "/home/christof/.fonts/AlegreyaSans-Regular.otf" +outfolder = wdir+"8_visuals/wordles/" +font_path = font_path dpi = 300 -#tmw.make_wordle_from_mallet(word_weights_file,topics,words,outfolder,font_path,dpi) +#tmw.make_wordle_from_mallet(word_weights_file,numOfTopics, words,outfolder, topicRanksFile, font_path,dpi) ### crop_images -### Crops the wordle image files, use if needed. +### Optional. Crops the wordle image files. inpath = wdir + "8_visuals/wordles/*.png" outfolder = wdir + "8_visuals/wordles/" left = 225 # image start at the left @@ -156,75 +205,131 @@ ### plot_topTopics ### For each item from a category, creates a barchart of the top topics. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 250 # must be actual number of topics modeled. 
-targetCategories = ["author-name", "author-gender", "decade", "subgenre", "title"] -# one or several: "author-name", "author-gender", "decade", "subgenre", "title" -topTopicsShown = 30 +averageDatasets = wdir+"7_aggregates/avg*.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +targetCategories = ["title"] +topTopicsShown = 16 +numOfTopics = numOfTopics fontscale = 1.0 height = 0 # 0=automatic and variable dpi = 300 +mode = "normalized" #normalized|zscores|absolute outfolder = wdir+"/8_visuals/topTopics/" -#tmw.plot_topTopics(averageDatasets, firstWordsFile, numberOfTopics, targetCategories, topTopicsShown, fontscale, height, dpi, outfolder) +tmw.plot_topTopics(averageDatasets, firstWordsFile, numOfTopics, targetCategories, mode, topTopicsShown, fontscale, height, dpi, outfolder) -### plot_topItems +### plot_topItems ### ### For each topic, creates a barchart with top items from a category. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -outfolder = wdir+"/8_visuals/topItems/" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -numberOfTopics = 250 # must be actual number of topics modeled. -targetCategories = ["author-name", "subgenre", "title", "decade", "author-gender"] -# choose one or several from: author-name, decade, subgenre, gender, idno, title, segmentID -topItemsShown = 30 +averageDatasets = wdir+"7_aggregates/avg*.csv" +outfolder = wdir+"8_visuals/topItems/" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +numOfTopics = numOfTopics # must be actual number of topics modeled. 
+targetCategories = ["segmentID"] +topItemsShown = 20 fontscale = 0.8 height = 0 # 0=automatic and flexible dpi = 300 -#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numberOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) +#tmw.plot_topItems(averageDatasets, outfolder, firstWordsFile, numOfTopics, targetCategories, topItemsShown, fontscale, height, dpi) + + + +################################ +### ADVANCED VISUALIZATION ### +################################ -### plot_distinctiveness_heatmap +### plot_distinctiveness_heatmap ### ### For each category, make a heatmap of most distinctive topics. -averageDatasets = wdir+"/7_aggregates/avg*.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/distinctiveness/" -targetCategories = ["author-name", "decade", "subgenre", "gender"] -# one or several: "author-name", "decade", "subgenre", "gender", "idno", "title" -numberOfTopics = 250 # must be actual number of topics modeled. +averageDatasets = wdir+"7_aggregates/avg*.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/distinctiveness/" +targetCategories = ["protagonist-policier"] +mode = "zscores" #normalized|zscores|absolute +numOfTopics = numOfTopics # actual number of topics modeled. topTopicsShown = 20 fontscale = 1.0 dpi = 300 -#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, outfolder, targetCategories, numberOfTopics, topTopicsShown, fontscale, dpi) - -### plot_topicsOverTime -### Creates lineplots or areaplots for topic development over time. -averageDatasets = wdir+"/7_aggregates/avgtopicscores_by-decade.csv" -firstWordsFile = wdir+"/7_aggregates/firstWords.csv" -outfolder = wdir+"/8_visuals/overTime/" -numberOfTopics = 250 # must be actual number of topics modeled. 
+#tmw.plot_distinctiveness_heatmap(averageDatasets, firstWordsFile, mode, outfolder, targetCategories, numOfTopics, topTopicsShown, fontscale, dpi) + +### plot_topicsOverTime ### +### +averageDatasets = wdir+"7_aggregates/avgtopicscores_by-decade.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/overTime/" +numOfTopics = numOfTopics # actual number of topics modeled. fontscale = 1.0 dpi = 300 height = 0 # for lineplot; 0=automatic mode = "line" # area|line for areaplot or lineplot -topics = ["48","67","199"] # list of one or several topics -#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numberOfTopics, fontscale, dpi, height, mode, topics) +topics = ["190", "6"] # list of one or several topics +#tmw.plot_topicsOverTime(averageDatasets, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) + +### topicClustering ### +# This function will create a dendrogram grouping topics based on their word weight similarity. +wordWeightsFile = wdir+"6_mallet/"+"word-weights.txt" +outfolder = wdir+"8_visuals/clustering/" +topicsToUse = numOfTopics # should be all topics. +wordsPerTopic = 50 +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.topicClustering(wordWeightsFile, wordsPerTopic, outfolder, methods, metrics, topicsToUse) + +### itemClustering ### +# This function creates a dendrogram of items in a category (authors, titles). +averageDatasets = wdir+"7_aggregates/avg*.csv" +figsize = (15,10) # width,height +outfolder = wdir+"8_visuals/clustering/" +topicsPerItem = 50 # can be set +sortingCriterium = "mean" # std|mean +targetCategories = ["author-name"] # list +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.itemClustering(averageDatasets, figsize, outfolder, topicsPerItem, targetCategories, methods, metrics, sortingCriterium) + +### simpleProgression ### +### Creates a lineplot of topic development over textual progression. 
+averageDataset = wdir+"7_aggregates/avgtopicscores_by-binID.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/progression/simple/" +numOfTopics = numOfTopics # must be actual number of topics modeled. +fontscale = 1.0 +dpi = 300 +height = 0 # 0=automatic +mode = "sel" # all|sel +topics = ["25", "44", "12"] # if mode="sel": list of topics +#tmw.simpleProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, fontscale, dpi, height, mode, topics) + +### complexProgression ### +### Creates a lineplot of topic development over textual progression, +### but does so separatedly for different target categories. +averageDataset = wdir+"7_aggregates/complex-avgtopicscores_by-subgenre+binID.csv" +firstWordsFile = wdir+"7_aggregates/firstWords.csv" +outfolder = wdir+"8_visuals/progression/complex/" +numOfTopics = numOfTopics # must be actual number of topics modeled. +targetCategories = ["subgenre","binID"] # two values, corresponding to averageDataset +fontscale = 1.0 +dpi = 300 +height = 0 # for lineplot; 0=automatic +mode = "all" # all|sel ### only "all" is implemented ## +#tmw.complexProgression(averageDataset, firstWordsFile, outfolder, numOfTopics, targetCategories, fontscale, dpi, height, mode, topics) ################################ -### OTHER/OBSOLETE ### +### OTHER / OBSOLETE / DEV ### ################################ ### 5c show segment ## To read a specific segment, better than looking in the folder. -segmentID = "rf0546§000083" +segmentID = "rf1246§0048" # indicate here, manually outfolder = wdir+"/9_sel-segs/" #tmw.show_segment(wdir,segmentID, outfolder) -### 6b - create_topicscores_lineplot -inpath = wdir + "7_aggregates/*-lp.csv" # narrow down as needed -outfolder = wdir + "8_visuals/lineplots/" -topicwordfile = wdir + "6_mallet/topics-with-words.csv" -dpi = 300 -height = 0.050 -genres = ["detection","noir"] # User: set depending on metadata. 
Available: noir, detection, criminel, experim., archq., blanche, neopl., susp. -#tmw.create_topicscores_lineplot(inpath,outfolder,topicwordfile,dpi,height,genres) +### itemPCA ### CURRENTLY NOT WORKING +averageDatasets = wdir+"7_aggregates/avg*.csv" +figsize = (10,10) # width,height +outfolder = wdir+"8_visuals/clustering/" +topicsPerItem = 50 +sortingCriterium = "std" # std|mean +targetCategories = ["subgenre"] # list +methods=["weighted"] # list +metrics=["cosine"] # list +#tmw.itemPCA(averageDatasets, targetCategories, topicsPerItem, sortingCriterium, figsize, outfolder)