From fd0a8edea98dbe322df3e9f0f64eb935ee51b416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Aug 2025 17:50:21 +0200 Subject: [PATCH 01/13] Automatically convert to python3 (using 2to3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 4 +- .../author_postprocessing.py | 2 +- bot_processing/bot_processing.py | 8 ++-- codeface_extraction/codeface_extraction.py | 4 +- codeface_extraction/extractions.py | 8 ++-- csv_writer/csv_writer.py | 2 +- issue_processing/issue_processing.py | 18 ++++---- issue_processing/jira_issue_processing.py | 45 ++++++++++--------- mbox_parsing/mbox_parsing.py | 6 +-- 9 files changed, 49 insertions(+), 48 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index a2e49f0..b11ef52 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -343,7 +343,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F gender_data_new = [] for author in gender_data: - if author[0] in author_to_anonymized_author_gender.keys(): + if author[0] in list(author_to_anonymized_author_gender.keys()): new_person = author_to_anonymized_author_gender[author[0]] author[0] = new_person[0] gender_data_new.append(author) @@ -395,7 +395,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) # load configuration __conf = Configuration.load(__codeface_conf, __project_conf) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 13b1e38..c712ac6 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -469,7 +469,7 @@ 
def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) __backup_data = args.backup # load configuration diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..43ff492 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -19,10 +19,10 @@ """ import argparse -import httplib +import http.client import os import sys -import urllib +import urllib.request, urllib.parse, urllib.error import operator from codeface.cli import log @@ -39,7 +39,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -192,7 +192,7 @@ def add_user_data(bot_data, user_data, known_bots_file): continue # get user information if available - if user[0] in user_buffer.keys(): + if user[0] in list(user_buffer.keys()): bot_reduced["user"] = user_buffer[user[0]] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 7cf24ea..3478b1f 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -28,7 +28,7 @@ from codeface.configuration import Configuration from codeface.dbmanager import DBManager -import extractions +from . 
import extractions from csv_writer import csv_writer @@ -119,7 +119,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) __extract_commit_messages = args.commit_messages __extract_impl = args.implementation __extract_on_range_level = args.range diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index 081a353..081d1be 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -723,7 +723,7 @@ def _reduce_result(self, result): def fix_characters_in_string(text): """ - Removes control characters such as \r\n \x1b \ufffd from string impl and returns a unicode + Removes control characters such as \r\n \x1b \\ufffd from string impl and returns a unicode string where all control characters have been replaced by a space. 
:param text: expects a unicode string :return: unicode string @@ -742,7 +742,7 @@ def fix_characters_in_string(text): # remove all kinds of control characters and emojis # see: https://www.fileformat.info/info/unicode/category/index.htm - new_text = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape")) + new_text = "".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape")) return new_text @@ -765,10 +765,10 @@ def fix_name_encoding(name): try: # Apply correct encoding and return unicode string - return unicode(make_header(info)) + return str(make_header(info)) except UnicodeDecodeError: # Undo utf-8 encoding and return unicode string - return unicode(name.decode('utf-8')) + return str(name.decode('utf-8')) except LookupError: # Encoding not found, return string as is return name diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index 2804081..b41463a 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -28,7 +28,7 @@ def __encode(line): lineres = () # re-encode column if it is unicode for column in line: - if type(column) is unicode: + if type(column) is str: lineres += (column.encode("utf-8"),) else: lineres += (column,) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..ff80d53 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -24,11 +24,11 @@ """ import argparse -import httplib +import http.client import json import os import sys -import urllib +import urllib.request, urllib.parse, urllib.error from datetime import datetime, timedelta import operator @@ -61,7 +61,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf 
= Configuration.load(__codeface_conf, __project_conf) @@ -210,7 +210,7 @@ def update_user_dict(user_dict, user): if user is None: user = create_deleted_user() - if not user["username"] in user_dict.keys(): + if not user["username"] in list(user_dict.keys()): if not user["username"] is None and not user["username"] == "": user_dict[user["username"]] = user else: @@ -340,7 +340,7 @@ def merge_issue_events(issue_data): # as we cannot update the referenced issue during iterating over all issues, we need to save the # referenced_by event for the referenced issue temporarily - if rel_issue["number"] in issue_data_to_update.keys(): + if rel_issue["number"] in list(issue_data_to_update.keys()): issue_data_to_update[rel_issue["number"]]["eventsList"].append(referenced_issue_event) else: ref = dict() @@ -500,7 +500,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # updates all the issues by the temporarily stored referenced_by events - for key, value in issue_data_to_update.iteritems(): + for key, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] @@ -683,14 +683,14 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - username = unicode(user["username"]).encode("utf-8") + username = str(user["username"]).encode("utf-8") # fix encoding for name and e-mail address if user["name"] is not None: - name = unicode(user["name"]).encode("utf-8") + name = str(user["name"]).encode("utf-8") else: name = username - mail = unicode(user["email"]).encode("utf-8") + mail = str(user["email"]).encode("utf-8") # construct string for ID service and send query user_string = get_user_string(name, mail) diff --git a/issue_processing/jira_issue_processing.py 
b/issue_processing/jira_issue_processing.py index d9748ae..fa3e826 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -44,8 +44,9 @@ from jira import JIRA from jira.exceptions import JIRAError from time import sleep +import importlib -reload(sys) +importlib.reload(sys) sys.setdefaultencoding("utf-8") # global counter for JIRA requests to make sure to not exceed the request limit @@ -65,7 +66,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -114,9 +115,9 @@ def run(): processed_issues.extend(issues) # 4) insert referenced_by events into issue histories - for issue_id in referenced_bys.keys(): + for issue_id in list(referenced_bys.keys()): # obtain list of issues which have the current issue id - referenced_issue = list(filter(lambda issue: issue["externalId"] == issue_id, processed_issues)) + referenced_issue = list([issue for issue in processed_issues if issue["externalId"] == issue_id]) if len(referenced_issue) > 0: if len(referenced_issue) > 1: log.warning("Ambiguous issue id " + issue_id + " found in the issue list.") @@ -235,21 +236,21 @@ def merge_user_with_user_from_csv(user, persons): """ new_user = dict() - name_utf8 = unicode(user["name"]).encode("utf-8") - username_utf8 = unicode(user["username"].lower()).encode("utf-8") + name_utf8 = str(user["name"]).encode("utf-8") + username_utf8 = str(user["username"].lower()).encode("utf-8") - if username_utf8 in persons["by_username"].keys(): + if username_utf8 in list(persons["by_username"].keys()): new_user["username"] = username_utf8 - new_user["name"] = unicode(persons["by_username"].get(username_utf8)[0]).encode("utf-8") - new_user["email"] = 
unicode(persons["by_username"].get(username_utf8)[1]).encode("utf-8") - elif name_utf8 in persons["by_name"].keys(): + new_user["name"] = str(persons["by_username"].get(username_utf8)[0]).encode("utf-8") + new_user["email"] = str(persons["by_username"].get(username_utf8)[1]).encode("utf-8") + elif name_utf8 in list(persons["by_name"].keys()): new_user["username"] = username_utf8 - new_user["name"] = unicode(persons["by_name"].get(name_utf8)[0]).encode("utf-8") - new_user["email"] = unicode(persons["by_name"].get(name_utf8)[1]).encode("utf-8") + new_user["name"] = str(persons["by_name"].get(name_utf8)[0]).encode("utf-8") + new_user["email"] = str(persons["by_name"].get(name_utf8)[1]).encode("utf-8") else: new_user["username"] = username_utf8 new_user["name"] = name_utf8 - new_user["email"] = unicode(user["email"]).encode("utf-8") + new_user["email"] = str(user["email"]).encode("utf-8") log.warning("User not in csv-file: " + str(user)) log.info("current User: " + str(user) + ", new user: " + str(new_user)) @@ -605,10 +606,10 @@ def get_user_string(name, email): def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): # fix encoding for name and e-mail address if user["name"] is not None and user["name"] != "": - name = unicode(user["name"]).encode("utf-8") + name = str(user["name"]).encode("utf-8") else: - name = unicode(user["username"]).encode("utf-8") - mail = unicode(user["email"]).encode("utf-8") # empty + name = str(user["username"]).encode("utf-8") + mail = str(user["email"]).encode("utf-8") # empty # construct string for ID service and send query user_string = get_user_string(name, mail) @@ -1000,8 +1001,8 @@ def find_first_existing(source_folder, filenames): :return: the first existing file name, None otherwise """ - filenames = map(lambda fi: os.path.join(source_folder, fi), filenames) - existing = map(lambda fi: os.path.exists(fi), filenames) + filenames = [os.path.join(source_folder, fi) for fi in filenames] + existing = [os.path.exists(fi) for fi 
in filenames] first = next((i for (i, x) in enumerate(existing) if x), None) if first is not None: @@ -1026,11 +1027,11 @@ def find_first_existing(source_folder, filenames): persons_by_username = {} persons_by_name = {} for row in person_data: - if not row["AuthorID"] in persons_by_username.keys(): - author_id_utf8 = unicode(row["AuthorID"]).encode("utf-8") + if not row["AuthorID"] in list(persons_by_username.keys()): + author_id_utf8 = str(row["AuthorID"]).encode("utf-8") persons_by_username[author_id_utf8] = (row["AuthorName"], row["userEmail"]) - if not row["AuthorName"] in persons_by_name.keys(): - author_name_utf8 = unicode(row["AuthorName"]).encode("utf-8") + if not row["AuthorName"] in list(persons_by_name.keys()): + author_name_utf8 = str(row["AuthorName"]).encode("utf-8") persons_by_name[author_name_utf8] = (row["AuthorName"], row["userEmail"]) persons = dict() diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index fd9fd59..92d8cb9 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -69,7 +69,7 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): writer = ix.writer() # add all messages to index for message in mbox: - writer.add_document(messageID=unicode(message['message-id']), content=__mbox_getbody(message)) + writer.add_document(messageID=str(message['message-id']), content=__mbox_getbody(message)) writer.commit() log.devinfo("Index created, parsing will begin now.") else: @@ -136,7 +136,7 @@ def __mbox_getbody(message): "An image or some other content has been found that cannot be indexed. 
Message is given an empty body.") body = ' ' - return unicode(body, errors="replace") + return str(body, errors="replace") def __parse_execute(artifact, schema, my_index, include_filepath): @@ -247,7 +247,7 @@ def run(): args = parser.parse_args(sys.argv[1:]) __resdir = abspath(args.resdir) __maildir = abspath(args.maildir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) # initialize configuration __conf = Configuration.load(__codeface_conf, __project_conf) From d64832e943ead1480d39469deac1c06f1538bd1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Aug 2025 18:45:15 +0200 Subject: [PATCH 02/13] Import codeface files required for extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- codeface/__init__.py | 1 + codeface/configuration.py | 198 ++++++++++++++ codeface/dbmanager.py | 431 +++++++++++++++++++++++++++++++ codeface/linktype.py | 41 +++ codeface/util.py | 527 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 1198 insertions(+) create mode 100644 codeface/__init__.py create mode 100644 codeface/configuration.py create mode 100644 codeface/dbmanager.py create mode 100644 codeface/linktype.py create mode 100644 codeface/util.py diff --git a/codeface/__init__.py b/codeface/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/codeface/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/codeface/configuration.py b/codeface/configuration.py new file mode 100644 index 0000000..e1fa874 --- /dev/null +++ b/codeface/configuration.py @@ -0,0 +1,198 @@ +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. 
+# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Johannes Ebke +# All Rights Reserved. +''' +Configuration module for codeface + +Encapsulates a configuration as an immutable dict +''' + +import yaml +from collections.abc import Mapping +from logging import getLogger + +from tempfile import NamedTemporaryFile +from codeface.linktype import LinkType + +# create logger +log = getLogger(__name__) + +class ConfigurationError(Exception): + '''Raised if any part of the configuration is malformed''' + pass + +class Configuration(Mapping): + ''' + Encapsulates the codeface configuration + ''' + + GLOBAL_KEYS = ('dbname', 'dbhost', 'dbuser', 'dbpwd', + 'idServiceHostname', 'idServicePort') + GLOBAL_OPTIONAL_KEYS = ('dbport',) + PROJECT_KEYS = ('project', 'repo', 'tagging', 'revisions', 'rcs') + OPTIONAL_KEYS = ('description', 'ml', 'mailinglists', 'sleepTime', + 'proxyHost', 'proxyPort', 'bugsProjectName', + 'productAsProject', 'issueTrackerType', + 'issueTrackerURL', 'issueTrackerProject', + 'issueTrackerUser', 'issueTrackerPassword', + 'understand', 'sloccount', 'windowSize', 'numWindows', + 'qualityType', 'communicationType', 'artifactType', 'dependencyType') + ALL_KEYS = set(GLOBAL_KEYS + GLOBAL_OPTIONAL_KEYS + PROJECT_KEYS + + OPTIONAL_KEYS) + + def __init__(self): + ''' + Initialize an empty configuration object with the default values + ''' + self._conf = { + 'idServiceHostname' : '127.0.0.1', + 'idServicePort' : 8080 + } + + self._conf_file_loc = None + + @classmethod + def load(cls, global_conffile, local_conffile=None): + ''' + 
Load configuration from global/local files + ''' + c = Configuration() + log.info("Loading global configuration file '{}'". + format(global_conffile)) + cls._global_conf = c._load(global_conffile) + c._conf.update(c._global_conf) + if local_conffile: + log.info("Loading project configuration file '{}'". + format(local_conffile)) + cls._project_conf = c._load(local_conffile) + c._conf.update(c._project_conf) + else: + log.info("Not loading project configuration file!") + c._initialize() + c._check_sanity() + return c + + def _load(self, filename): + '''Helper function that checks loading errors and logs them''' + try: + return yaml.load(open(filename), Loader=yaml.SafeLoader) + except IOError: + log.exception("Could not open configuration file '{}'". + format(filename)) + raise + except yaml.YAMLError: + log.exception("Could not parse configuration file '{}'". + format(filename)) + raise + + def _initialize(self): + '''Infer missing values in the configuration''' + if "rcs" not in self: + self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] + + if "mailinglists" not in self: + self._conf["mailinglists"] = [] + if "ml" in self: + self._conf["mailinglists"].append({"name": self["ml"]}) + for ml in self._conf["mailinglists"]: + ml.setdefault("type", "dev") + ml.setdefault("source", "gmane") + + if "dbport" not in self: + self._conf["dbport"] = 3306 + else: + self._conf["dbport"] = int(self._conf["dbport"]) + + def _check_sanity(self): + ''' + Check that the configuration makes sense. + :raise ConfigurationError + ''' + + # Some elementary sanity checks + for key in self.GLOBAL_KEYS: + if self._project_conf and key in self._project_conf: + log.critical("The key '{}' may not be overridden in the " + "project configuration file".format(key)) + raise ConfigurationError('Invalid configuration key.') + + for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: + if key not in self: + log.critical("Required key '{}' missing in configuration!" 
+ ''.format(key)) + raise ConfigurationError('Missing configuration key.') + + if self['tagging'] not in LinkType.get_all_link_types(): + log.critical('Unsupported tagging mechanism specified!') + raise ConfigurationError('Unsupported tagging mechanism.') + + if len(self["revisions"]) < 2: + log.info("No revision range specified in configuration, using auto-generated windows") + + if len(self["revisions"]) != len(self["rcs"]): + log.critical("Malformed configuration: revision and rcs list " + "lengths differ! Found {0} revisions and {1} release " + "candidates.".format(len(self["revisions"]), len(self["rcs"]))) + raise ConfigurationError('Malformed configuration.') + + unknown_keys = [k for k in self if k not in self.ALL_KEYS] + for key in unknown_keys: + log.warning("Unknown key '{}' in configuration.".format(key)) + + def write(self): + conf_file = NamedTemporaryFile(mode='w', prefix=self._conf['project'], + delete=False) + yaml.dump(self._conf, conf_file) + self._conf_file_loc = conf_file.name + conf_file.close() + + def get_conf_file_loc(self): + return self._conf_file_loc + + # Function for the Configuration object to function as a dict + def __getitem__(self, key): + return self._conf[key] + + def __setitem__(self, key, value): + self._conf[key] = value + + def __len__(self): + return len(self._conf) + + def __iter__(self): + return iter(self._conf) + + def __keys__(self): + return list(self._conf.keys()) + + def __str__(self): + ''' + Return a pretty string for display and logging + ''' + r = [] + r.append("--- # global codeface configuration") + for key in self.GLOBAL_KEYS: + if key in self: + r.append("{}: {}".format(key, repr(self[key]))) + r.append("# codeface project configuration") + for key in self.PROJECT_KEYS + self.OPTIONAL_KEYS: + if key in self: + r.append("{}: {}".format(key, repr(self[key]))) + unknown = [k for k in self if k not in self.ALL_KEYS] + if unknown: + r.append("# Unknown keys") + for key in unknown: + r.append("{}: {}".format(key, 
repr(self[key]))) + return "\n".join(r) diff --git a/codeface/dbmanager.py b/codeface/dbmanager.py new file mode 100644 index 0000000..df917ca --- /dev/null +++ b/codeface/dbmanager.py @@ -0,0 +1,431 @@ +#! /usr/bin/env python +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# All Rights Reserved. + +# Thin sql database wrapper + +import MySQLdb as mdb +from datetime import datetime, timezone +from logging import getLogger +from contextlib import contextmanager + +# create logger +log = getLogger(__name__) + +@contextmanager +def _log_db_error(action, args=None): + try: + yield + except mdb.Error as e: + if args: + try: + action = action % args + except: + pass + log.critical('MySQL error {e[0]} during "{action}": {e[1]}' + ''.format(e=e.args, action=action)) + raise + + +class DBManager: + """This class provides an interface to the codeface sql database.""" + + def __init__(self, conf): + try: + self.con = None + self.con = mdb.Connection(host=conf["dbhost"], + port=conf["dbport"], + user=conf["dbuser"], + passwd=conf["dbpwd"], + db=conf["dbname"]) + log.debug( + "Establishing MySQL connection to " + "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, DB '{c[dbname]}'" + .format(c=conf)) + except mdb.Error as e: + log.critical( + "Failed to establish MySQL connection to " + "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, 
DB '{c[dbname]}'" + ": {e[1]} ({e[0]})" + "".format(c=conf, e=e.args)) + raise + self.cur = self.con.cursor() + + max_packet_size = 1024 * 1024 * 256 + self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) + + def __del__(self): + if self.con is not None: + self.con.close() + + def doExec(self, stmt, args=None): + with _log_db_error(stmt, args): + retryCount = 0 + while retryCount < 10: + try: + if isinstance(args, list): + res = self.cur.executemany(stmt, args) + else: + res = self.cur.execute(stmt, args) + return res + except mdb.OperationalError as dbe: + retryCount += 1 + log.info("DBE args: " + str(dbe.args)) + if dbe.args[0] == 1213: # Deadlock! retry... + log.warning("Recoverable deadlock in MySQL - retrying " \ + "(attempt {}).".format(retryCount)) + elif dbe.args[0] == 2006: # Server gone away... + log.warning("MySQL Server gone away, trying to reconnect " \ + "(attempt {}).".format(retryCount)) + self.con.ping(True) + elif dbe.args[0] == 2013: # Lost connection to MySQL server during query... + log.warning("Lost connection to MySQL server during query, " \ + "trying to reconnect (attempt {}).".format(retryCount)) + self.con.ping(True) + else: + raise + + # Give up after ten retry attempts and propagate the + # problem to the caller. Callers can either fix the problem with + # a different query, or the analysis fails + log.error("DB access failed after ten attempts, giving up") + raise + + def doFetchAll(self): + with _log_db_error("fetchall"): + return self.cur.fetchall() + + def doCommit(self): + with _log_db_error("commit"): + return self.con.commit() + + def doExecCommit(self, stmt, args=None): + self.doExec(stmt, args) + self.doCommit() + + # NOTE: We don't provide any synchronisation since by assumption, + # a single project is never analysed from two threads. + def getProjectID(self, name, analysisMethod): + """ + Return the project ID of the given name/analysisMethod combination. 
+ If the project does not exist yet in the database, it is created. + """ + self.doExec("SELECT id FROM project WHERE name=%s " + "AND analysisMethod=%s", (name, analysisMethod)) + if self.cur.rowcount == 0: + # Project is not contained in the database + log.info("Creating new project {}/{}". + format(name, analysisMethod)) + self.doExecCommit("INSERT INTO project (name, analysisMethod) " + + "VALUES (%s, %s);", (name, analysisMethod)) + self.doExec("SELECT id FROM project WHERE name=%s;", (name,)) + elif self.cur.rowcount > 1: + raise Exception("Duplicate projects {}/{} in database!". + format(name, analysisMethod)) + pid = self.doFetchAll()[0][0] + log.info("Using project {}/{} with ID {}". + format(name, analysisMethod, pid)) + return pid + + def get_project(self, pid): + self.doExec("SELECT name, analysisMethod FROM project" + " WHERE id=%s", pid) + if self.cur.rowcount == 0: + raise Exception("Project id {} not found!".format(pid)) + return self.doFetchAll()[0] + + def get_edgelist(self, cid): + self.doExec("SELECT fromId, toId, weight FROM edgelist \ + WHERE clusterId={}".format(cid)) + if self.cur.rowcount == 0: + raise Exception("Cluster id {} not found!".format(cid)) + return self.doFetchAll() + + def get_file_dev(self, project_id, range_id): + self.doExec("SELECT * FROM (SELECT id, commitHash, commitDate, author, description " \ + "FROM commit WHERE projectId={} AND releaseRangeId={}) AS Commits " \ + "INNER JOIN (SELECT file, commitId, SUM(size) AS fileSize " \ + "FROM commit_dependency GROUP BY commitId, file) AS commitFileLOC " \ + "ON Commits.id=commitFileLOC.commitId ORDER BY " \ + "commitFileLOC.file, commitFileLOC.commitId".format(project_id, range_id)) + + if self.cur.rowcount == 0: + raise Exception("Could not obtain file-dev information for project {} "\ + "(release range {}!".format(project_id, range_id)) + return self.doFetchAll() + + def get_release_ranges(self, project_id): + self.doExec("SELECT id FROM release_range \ + WHERE 
projectId={}".format(project_id)) + if self.cur.rowcount == 0: + raise Exception("No release ranges found for project {}!" + .format(project_id)) + return [range_entry[0] for range_entry in self.doFetchAll()] + + def get_cluster_id(self, pid, release_range_id=None): + if release_range_id: + self.doExec("SELECT id FROM cluster WHERE clusterNumber=-1 \ + AND projectId={} AND releaseRangeId={}" + .format(pid, release_range_id)) + else: + self.doExec("SELECT id FROM cluster WHERE clusterNumber=-1 \ + AND projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("Cluster from project {} not found!".format(pid)) + return self.doFetchAll()[0][0] + + def get_project_persons(self, pid): + self.doExec("SELECT id, name FROM person \ + WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("Persons from project {} not found!".format(pid)) + return (self.doFetchAll()) + + def getTagID(self, projectID, tag, type): + """Determine the ID of a tag, given its textual form and the type""" + self.doExec("SELECT id FROM release_timeline WHERE projectId=%s " + + "AND tag=%s AND type=%s", (projectID, tag, type)) + if self.cur.rowcount != 1: + raise Exception("Tag '{}' of type {} is {} times in the DB!". + format(tag, type, self.cur.rowcount)) + return self.doFetchAll()[0][0] + + def getCommitId(self, projectId, commitHash): + self.doExec("SELECT id FROM commit" + + " WHERE commitHash=%s AND projectId=%s" + , (commitHash, projectId)) + if self.cur.rowcount == 0: + raise Exception("Commit from project {} not found!". 
+ format(projectId)) + return self.doFetchAll()[0][0] + + def getRevisionID(self, projectID, tag): + return self.getTagID(projectID, tag, "release") + + def getRCID(self, projectID, tag): + return self.getTagID(projectID, tag, "rc") + + def getReleaseRangeID(self, projectID, revisionIDs): + """Given a pair of release IDs, determine the release range ID""" + self.doExec("SELECT id FROM release_range WHERE projectId=%s " + + "AND releaseStartId=%s AND releaseEndId=%s", + (projectID, revisionIDs[0], revisionIDs[1])) + if self.cur.rowcount != 1: + raise Exception("Release range from '{r[0]}' to '{r[1]}' is {c} " + "times in the DB!". + format(r=revisionIDs, c=self.cur.rowcount)) + return self.doFetchAll()[0][0] + + def getProjectTimeRange(self, pid): + """Given a project ID, determine the start and end date of available VCS data. + Returns a tuple with start end end date in the form YYYY-MM-DD""" + self.doExec("SELECT MIN(date_start) FROM revisions_view " + "WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("No start date for pid {} found!".format(pid)) + date_start = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + self.doExec("SELECT MAX(date_end) FROM revisions_view " + "WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("No end date for pid {} found!".format(pid)) + date_end = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + return (date_start, date_end) + + def get_commit_cdate(self, pid, hash): + """Given a project ID and a commit hash, obtain the commit date + in format YYYY-MM-DD""" + self.doExec("SELECT commitDate FROM commit " + "WHERE projectId={} and commitHash='{}'".format(pid, hash)) + if self.cur.rowcount == 0: + raise Exception("No date found for commit {} (pid {}) found!".format(hash, pid)) + date = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + return (date) + + def get_release_range(self, project_id, range_id): + self.doExec( + "SELECT st.tag, nd.tag, rc.tag FROM release_range " + "LEFT 
JOIN release_timeline AS st ON st.id=releaseStartId " + "LEFT JOIN release_timeline AS nd ON nd.id=releaseEndId " + "LEFT JOIN release_timeline AS rc ON rc.id=releaseRCStartId " + "WHERE release_range.projectId=%s AND release_range.id=%s", + (project_id, range_id)) + ranges = self.doFetchAll() + if self.cur.rowcount == 0: + raise Exception("Range id {} not found!".format(project_id)) + return ranges[0] + + def get_num_commits_in_range(self, range_id): + self.doExec("SELECT COUNT(*) FROM commit WHERE releaseRangeId={}".format(range_id)) + if self.cur.rowcount == 0: + raise Exception("Range id {} not found in get_num_commits_in_range!".format(range_id)) + return self.doFetchAll()[0][0] + + def update_release_timeline(self, project, tagging, revs, rcs, + recreate_project=False): + ''' + For a project, update the release timeline table with the given + revisions. If existing releases/rcs from the timeline are not in + order, the conservative approach is taken and the whole project is + recreated to avoid inconsistencies. + + Returns true if the project had to be recreated. + ''' + assert len(revs) >= 2 + assert len(revs) == len(rcs) + rcs = [rc if rc else rev for rc, rev in zip(rcs, revs)] + pid = self.getProjectID(project, tagging) + + if not recreate_project: + # First check if the release timeline is sane and in order + self.doExec("SELECT tag FROM release_timeline WHERE projectId=%s " + "AND type='release' ORDER BY id", (pid,)) + tags = [tag for (tag,) in self.doFetchAll()] + if len(set(tags)) != len(tags): + log.error("Database corrupted: Duplicate release entries in " + "release_timeline! Recreating project.") + recreate_project = True + if len(tags) == 0: + recreate_project = True + + # Check that the tags are in the same order + if not recreate_project: + for i, tag in enumerate(tags): + if i >= len(revs): + log.warning("List of revisions to analyse was shortened.") + break + if revs[i] != tag: + log.error("Release number {} changed tag from {} to " + "{}. 
Recreating project.". + format(i, tag, revs[i])) + recreate_project = True + break + + # Check that the RC tags are in order + if not recreate_project: + self.doExec("SELECT tag FROM release_timeline WHERE " + "projectId=%s AND type='rc' ORDER BY id", (pid,)) + rctags = [tag for (tag,) in self.doFetchAll()] + if len(set(rctags)) != len(rctags): + log.error("Database corrupted: Duplicate RC entries in " + "release_timeline! Recreating project.") + recreate_project = True + + # Check for changes in release candidates + # Note that the first RC is unused, since it refers to the end + # of a previous period + if not recreate_project: + for i, tag in enumerate(rctags): + if i + 1 >= len(rcs): + log.warning("List of release candidates to analyse " + "was shortened.") + break + if rcs[i + 1] != tag: + log.error("Release candidate number {} changed tag " + "from {} to {}. Recreating project.". + format(i, tag, rcs[i + 1])) + recreate_project = True + break + + # Go through the release ranges and check if they have changed + if not recreate_project: + self.doExec( + "SELECT st.tag, nd.tag, rc.tag FROM release_range " + "LEFT JOIN release_timeline AS st ON st.id=releaseStartId " + "LEFT JOIN release_timeline AS nd ON nd.id=releaseEndId " + "LEFT JOIN release_timeline AS rc ON rc.id=releaseRCStartId " + "WHERE release_range.projectId=%s ORDER BY release_range.id", + (pid,)) + ranges = self.doFetchAll() + if len(set(ranges)) != len(tags) - 1: + log.error("Database corrupted: Number of release ranges" + " does not match number of release tags!") + recreate_project = True + + for i, (start, end, rc) in enumerate(self.doFetchAll()): + if i + 1 >= len(revs) or recreate_project: + # List of revisions to analyse was shortened + break + if (start, end) != (revs[i], revs[i + 1]): + log.error("Release range {} changed from {} to {}." + " Recreating project.". 
+ format(i, (start, end), (revs[i], revs[i + 1]))) + recreate_project = True + break + if rc != rcs[i + 1]: + log.error("Release candidate {} changed from {} to {}." + " Recreating project.". + format(i, rc, rcs[i + 1])) + recreate_project = True + break + + # Recreate project if necessary + if recreate_project: + # This should ripple through the database and delete + # all referencing entries for project + log.warning("Deleting and re-creating project {}/{}.". + format(project, tagging)) + self.doExecCommit("DELETE FROM `project` WHERE id=%s", (pid,)) + pid = self.getProjectID(project, tagging) + tags = [] + rctags = [] + + # at this point we have verified that the first len(tags) + # entries are identical + new_ranges_to_process = [] + if len(revs) > len(tags): + n_new = len(revs) - len(tags) + log.info("Adding {} new releases...".format(n_new)) + previous_rev = None + if len(tags) > 0: + previous_rev = tags[-1] + for rev, rc in list(zip(revs, rcs))[len(tags):]: + self.doExecCommit("INSERT INTO release_timeline " + "(type, tag, projectId) " + "VALUES (%s, %s, %s)", + ("release", rev, pid)) + + if previous_rev is not None and rc: + self.doExecCommit("INSERT INTO release_timeline " + "(type, tag, projectId) " + "VALUES (%s, %s, %s)", + ("rc", rc, pid)) + + if previous_rev is not None: + startID = self.getRevisionID(pid, previous_rev) + endID = self.getRevisionID(pid, rev) + if rc: + rcID = self.getRCID(pid, rc) + else: + rcID = "NULL" + self.doExecCommit("INSERT INTO release_range " + "(releaseStartId, releaseEndId, " + "projectId, releaseRCStartId) " + "VALUES (%s, %s, %s, %s)", + (startID, endID, pid, rcID)) + new_ranges_to_process.append(self.getReleaseRangeID(pid, + (startID, endID))) + previous_rev = rev + # now we are in a well-defined state. 
def tstamp_to_sql(tstamp):
    """Convert a Unix timestamp into an SQL compatible DateTime string"""
    # Interpret the timestamp in UTC so the result is timezone-independent.
    utc_time = datetime.fromtimestamp(tstamp, tz=timezone.utc)
    return utc_time.strftime("%Y-%m-%d %H:%M:%S")
# Enum-like class that names the different methods used to
# link individual developers to one another.
class LinkType:
    tag = "tag"
    proximity = "proximity"
    committer2author = "committer2author"
    file = "file"
    feature = "feature"
    feature_file = "feature_file"

    _all_link_types = (tag, proximity, committer2author,
                       file, feature, feature_file)

    @staticmethod
    def get_all_link_types():
        """Return every known link type as a tuple."""
        return LinkType._all_link_types

    @staticmethod
    def get_tag_types():
        """Return the tag names recognised in commit message footers."""
        return ["Signed-off-by", "Acked-by", "CC", "Reviewed-by",
                "Reported-by", "Tested-by", "Patch"]
# Represents a job submitted to the batch pool.
BatchJobTuple = namedtuple('BatchJobTuple', ['id', 'func', 'args', 'kwargs',
                                             'deps', 'startmsg', 'endmsg'])


class BatchJob(BatchJobTuple):
    # A BatchJobTuple with mutable scheduling state attached; the
    # namedtuple fields themselves stay immutable.
    def __init__(self, *args, **kwargs):
        super(BatchJob, self).__init__(*args, **kwargs)
        self.done = False       # set once a worker reports completion
        self.submitted = False  # set once the job is on the work queue


class BatchJobPool(object):
    '''
    Implementation of a dependency-respecting batch pool

    This system uses a pool of N worker processes to run jobs. Since the
    multiprocessing module is used, all functions, args and kwargs must be
    pickleable. Specifically, this means that only functions defined at
    top-level in a module can be used here.

    Jobs can be created using pool.add(function, args, kwargs, deps=deps))
    where deps can be a list of job handles previously returned by
    pool.add. If multiprocessing is disabled, the functions are run
    immediately and None is returned.

    Call pool.join() to start execution and wait until all jobs are complete.
    If a work item raises an exception, the join() will terminate with
    that exception, if pickleable, or a generic Exception if otherwise.
    '''

    def __init__(self, n_cores):
        self.n_cores = n_cores
        self.next_id = 1
        self.jobs = OrderedDict()  # Dictionary of jobs (ordered for repeatability)

        # Initialize workers and their work and done queues
        self.work_queue, self.done_queues, self.workers = Queue(), [], []
        if n_cores > 1:
            # When n_cores is 1 we don't use the worker processes anyway.
            # However, the pycharm debugger goes crazy when we start the
            # process, so as a workaround don't start anything when
            # n_cores is 1.
            for i in range(n_cores):
                dq = Queue()
                w = Process(target=batchjob_worker_function,
                            args=(self.work_queue, dq))
                self.done_queues.append(dq)
                self.workers.append(w)
                w.start()

    def _is_ready(self, job):
        '''Returns true if the job is ready for submission'''
        if job.done or job.submitted:
            return False
        return all(self.jobs[j].done for j in job.deps if j is not None)

    def _submit(self, job):
        '''Submit the job if it is ready'''
        if self._is_ready(job):
            self.work_queue.put(job)
            job.submitted = True

    def add(self, func, args, kwargs=None, deps=(), startmsg=None, endmsg=None):
        '''
        Add a job that executes func(*args, **kwargs) and depends on the
        jobs with the ids listed in deps.
        This function returns a job ID which can be used as a dependency
        in other calls to add.
        If n_cores is 1; this call immediately executes the given function
        and returns None
        '''
        # A mutable default argument ({}) would be shared between calls;
        # use the None sentinel instead.
        if kwargs is None:
            kwargs = {}
        if self.n_cores == 1:
            log.info(startmsg)
            func(*args, **kwargs)
            log.info(endmsg)
            return None
        job_id = self.next_id
        self.next_id += 1
        j = BatchJob(job_id, func, args, kwargs, deps, startmsg, endmsg)
        self.jobs[job_id] = j
        return job_id

    def join(self):
        '''
        Submit jobs and wait for all jobs to finish.
        '''
        try:
            while not all(j.done for j in self.jobs.values()):
                # Put jobs that are ready onto the work queue
                for j in self.jobs.values():
                    self._submit(j)
                # Wait for a result from the done_queues
                for dq in self.done_queues:
                    try:
                        res = dq.get(block=False)
                    except Empty:
                        continue
                    if res is None:
                        log.fatal("Uncaught exception in worker thread!")
                        raise Exception("Failure in Batch Pool")
                    if isinstance(res, Exception):
                        log.fatal("Uncaught exception in worker thread:")
                        raise res
                    log.debug("Job {} has finished!".format(res))
                    self.jobs[res].done = True
                # Check if workers died
                for w in self.workers:
                    if not w.is_alive():
                        w.join()
                        raise Exception("A Worker died unexpectedly!")
                sleep(0.01)
        finally:
            # Terminate and join the workers
            # Wait 100ms to allow backtraces to be logged
            sleep(0.1)
            log.devinfo("Terminating workers...")
            for w in self.workers:
                w.terminate()
            log.devinfo("Workers terminated.")
# Collect the stack of every live thread as a list of text lines,
# suitable for logging from a signal handler.
def get_stack_dump():
    names_by_ident = {t.ident: t.name for t in threading_enumerate()}
    dump = ["Stack dump:"]
    for ident, frame in sys._current_frames().items():
        dump.append("")
        dump.append("# Thread: %s(%d)" % (names_by_ident.get(ident, ""), ident))
        for fname, lineno, func, text in traceback.extract_stack(frame):
            dump.append('File: "%s", line %d, in %s' % (fname, lineno, func))
            if text:
                dump.append(" %s" % (text.strip()))
    return dump
# Signal handler that dumps all stacks and terminates silently
# Also uses the Lock l to dis-interleave the stack traces
def handle_sigint_silent(signal, frame):
    with l:
        for c in get_stack_dump():
            log.devinfo(c)
    logging.shutdown()
    # Since we want to terminate worker threads with prejudice,
    # we use os._exit, which directly terminates the process.
    # otherwise the worker try/catch will also catch the SystemExit
    # Fixed: the previous call was os.exit_(-1), which does not exist
    # and raised AttributeError instead of terminating (cf. handle_sigterm,
    # which already uses os._exit correctly).
    os._exit(-1)
+ ''' + jcmd = " ".join(cmd) + log.debug("Running command: {}".format(jcmd)) + try: + if direct_io: + pipe = Popen(cmd, cwd=cwd) + else: + pipe = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd) + stdout, stderr = pipe.communicate() + except OSError: + log.error("Error executing command {}!".format(jcmd)) + raise + + if pipe.returncode != 0: + if ignore_errors: + if not(silent_errors): + log.warning("Command '{}' failed with exit code {}. Ignored.". + format(jcmd, pipe.returncode)) + else: + if not(direct_io) and not(silent_errors): + log.info("Command '{}' stdout:".format(jcmd)) + for line in stdout.splitlines(): + log.info(line) + log.info("Command '{}' stderr:".format(jcmd)) + for line in stderr.splitlines(): + log.info(line) + msg = "Command '{}' failed with exit code {}. \n" \ + "(stdout: {}\nstderr: {})"\ + .format(jcmd, pipe.returncode, stdout, stderr) + if not(silent_errors): + log.error(msg) + raise Exception(msg) + return stdout + +def _convert_dot_file(dotfile): + ''' + Convert duplicate edges in the given dot file into edges with + a larger pen width. + ''' + res = [] + edges = {} + edge_spec = re.compile("\s+(\d+) -> (\d+);") + + file = open(dotfile, "r") + lines = [line.strip("\n") for line in file] + # Modify the header (copyright line + digraph) + lines[0] = "digraph {" + lines[1] = "node[fontsize=30, shape=\"box\"];" + + lines[len(lines)-1] = "" # Skip closing brace + + for line in lines: + m = re.match(edge_spec, line) + if m: + a, b = m.group(1), m.group(2) + edges[(a,b)] = edges.get((a,b), 0) + 1 + else: + res.append(line + "\n") + + # sort the edges for reproducibility + for ((a, b), count) in sorted(edges.items()): + res.append("{0} -> {1} [weight={2} penwidth={3}];\n". 
+ format(a,b,count, sqrt(float(count)))) + + res.append("overlap=prism;\n") + res.append("splines=true;\n") + res.append("}\n") + return res + +def layout_graph(filename): + out = NamedTemporaryFile(mode="w", delete=False) + out.writelines(_convert_dot_file(filename)) + out.close() # flushes the cache + cmd = [] + cmd.append("dot") + cmd.append("-Kfdp") + cmd.append("-Tpdf") + cmd.append("-Gcharset=utf-8") + cmd.append("-o{0}.pdf".format(os.path.splitext(filename)[0])) + cmd.append(out.name) + execute_command(cmd, ignore_errors=True) + # Manually remove the temporary file + os.unlink(out.name) + +def generate_report(start_rev, end_rev, resdir): + log.devinfo(" -> Generating report") + report_base = "report-{0}_{1}".format(start_rev, end_rev) + + # Run perl script to generate report LaTeX file + cmd = [] + cmd.append(resource_filename(__name__, "perl/create_report.pl")) + cmd.append(resdir) + cmd.append("{0}--{1}".format(start_rev, end_rev)) + with open(os.path.join(resdir, report_base + ".tex"), 'w') as f: + f.write(execute_command(cmd)) + + # Compile report with lualatex + cmd = [] + cmd.append("lualatex") + cmd.append("-interaction=nonstopmode") + cmd.append(os.path.join(resdir, report_base + ".tex")) + + # We run latex in a temporary directory so that it's easy to + # get rid of the log files etc. 
def check4ctags():
    """Verify that the required ctags-exuberant binary is installed.

    Raises an Exception when the program is missing or reports an
    incompatible version.
    """
    # check if the appropriate ctags is installed on the system
    prog_name = 'Exuberant Ctags'
    prog_version = 'Exuberant Ctags 5.9~svn20110310'
    cmd = "ctags-exuberant --version".split()

    # Previously a positional None was passed for ignore_errors; the
    # default (False) has the same effect and is less confusing.
    res = execute_command(cmd)
    # Popen pipes return bytes on Python 3; decode before comparing with
    # the expected version strings (str.startswith(bytes) raised TypeError).
    if isinstance(res, bytes):
        res = res.decode("utf-8", errors="replace")

    if not(res.startswith(prog_name)):
        log.error("program '{0}' does not exist".format(prog_name))
        raise Exception("ctags-exuberant not found")

    if not(res.startswith(prog_version)):
        # TODO: change this to use standard mechanism for error logging
        log.error("Ctags version '{0}' not found".format(prog_version))
        raise Exception("Incompatible ctags-exuberant version")
def parse_iso_git_date(date_string):
    """Parse a git ISO-8601 date string ("YYYY-MM-DD HH:MM:SS +HHMM")
    into a naive datetime normalised to UTC.

    The trailing UTC offset is applied manually; %z was not reliably
    available when this code was written.

    Raises ValueError when the trailing offset cannot be parsed.
    """
    # from http://stackoverflow.com/questions/526406/python-time-to-age-part-2-timezones
    try:
        offset = int(date_string[-5:])
    except ValueError:
        log.error("could not extract timezone info from \"{0}\""
                  .format(date_string))
        raise
    # Python 3 fix: "offset / 100" is true division, so an offset such as
    # +0530 produced fractional hours (5.3) and a wrong delta. Split the
    # absolute offset into integer hour/minute parts and apply the sign
    # to the whole delta (also fixes negative half-hour offsets).
    hours, minutes = divmod(abs(offset), 100)
    delta = timedelta(hours=hours, minutes=minutes)
    if offset < 0:
        delta = -delta
    # In future python versions we can use "%Y-%m-%d %H:%M:%S %z"
    # directly; currently %z isn't working as documented.
    fmt = "%Y-%m-%d %H:%M:%S"
    parsed_date = datetime.strptime(date_string[:-6], fmt)
    parsed_date -= delta
    return parsed_date
# Determine the size and number of analysis windows from the
# configuration; fall back to the defaults (3 months, unlimited)
# when the configuration does not override them.
def get_analysis_windows(conf):
    keys = conf.keys()
    window_size_months = conf["windowSize"] if "windowSize" in keys else 3
    num_window = conf["numWindows"] if "numWindows" in keys else -1
    return window_size_months, num_window
earliest + # first commit does not carry the earliest commit date + revs = [rev.split(",") for rev in revs] + rev_len = len(revs) + if int(revs[0][1]) > int(revs[1][1]): + del revs[0] + + # Extract hash values and dates intro seperate lists + revs_hash = [rev[0] for rev in revs] + revs_date = [rev[2].split(" ")[0] for rev in revs] + + # We cannot detect release canndidate tags in this analysis mode, + # so provide a list with None entries + rcs = [None for x in range(len(revs))] + + return revs_hash, rcs, revs_date From 96c4208360c209245395a11b0d54a5e68ab15ed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Aug 2025 18:46:52 +0200 Subject: [PATCH 03/13] Remove dependency on codeface --- codeface_extraction/codeface_extraction.py | 9 +++++---- codeface_extraction/extractions.py | 13 ++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 3478b1f..d5df6a0 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -22,15 +22,16 @@ import argparse import sys +from logging import getLogger from os.path import abspath -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager - from . 
import extractions from csv_writer import csv_writer +from codeface.dbmanager import DBManager +from codeface.configuration import Configuration +# create logger +log = getLogger(__name__) ## # RUN FOR ALL PROJECTS diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index 081d1be..edeefda 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -26,18 +26,18 @@ import os import unicodedata import re +from logging import getLogger from ftfy import fix_encoding from email.header import decode_header, make_header -from codeface.cli import log from codeface.util import gen_range_path +log = getLogger(__name__) # # GET EXTRACTIONS # - def get_extractions(dbm, conf, resdir, csv_writer, extract_commit_messages, extract_impl, extract_on_range_level): # all extractions are subclasses of Extraction: # instantiate them all! @@ -117,7 +117,7 @@ def __init__(self, dbm, conf, res_dir, csv_writer): def is_project_level(self): """Check if this extraction is on project level (i.e., {revision} is not on the SQL statement).""" - return not ("{revision}" in self.sql) + return "{revision}" not in self.sql def is_generic_extraction(self): """Check if this extraction is generic (i.e., it can be used for several artifacts and, hence, @@ -441,7 +441,7 @@ def __init__(self, dbm, conf, resdir, csv_writer): def get_list(self): result = self._run_sql(None, None) lines = self._reduce_result(result) - return [rev for (rev, date) in lines] + return [rev for (rev, _) in lines] # @@ -737,8 +737,8 @@ def fix_characters_in_string(text): new_text = fix_encoding(text) # remove unicode characters from "Specials" block - # see: https://www.compart.com/en/unicode/block/U+FFF0 - new_text = re.sub(r"\\ufff.", " ", new_text.encode("unicode-escape")) + # see: https://www.compart.com/en/unicode/block/U+FFF0 + new_text = re.sub(r"\\ufff.", " ", new_text).encode("unicode-escape") # remove all kinds of control characters and emojis # see: 
https://www.fileformat.info/info/unicode/category/index.htm @@ -772,5 +772,4 @@ def fix_name_encoding(name): except LookupError: # Encoding not found, return string as is return name - return name From 454efa3094070c8445bfc8dcf58ddb5c7a848881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Fri, 29 Aug 2025 12:15:14 +0200 Subject: [PATCH 04/13] Rename folder for codeface fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- {codeface => codeface_utils}/__init__.py | 0 codeface_utils/cluster/PersonInfo.py | 54 ++++ codeface_utils/cluster/idManager.py | 281 ++++++++++++++++++ {codeface => codeface_utils}/configuration.py | 37 ++- {codeface => codeface_utils}/dbmanager.py | 96 ++++-- {codeface => codeface_utils}/linktype.py | 0 {codeface => codeface_utils}/util.py | 153 ++++++++-- 7 files changed, 561 insertions(+), 60 deletions(-) rename {codeface => codeface_utils}/__init__.py (100%) create mode 100644 codeface_utils/cluster/PersonInfo.py create mode 100644 codeface_utils/cluster/idManager.py rename {codeface => codeface_utils}/configuration.py (86%) rename {codeface => codeface_utils}/dbmanager.py (83%) rename {codeface => codeface_utils}/linktype.py (100%) rename {codeface => codeface_utils}/util.py (77%) diff --git a/codeface/__init__.py b/codeface_utils/__init__.py similarity index 100% rename from codeface/__init__.py rename to codeface_utils/__init__.py diff --git a/codeface_utils/cluster/PersonInfo.py b/codeface_utils/cluster/PersonInfo.py new file mode 100644 index 0000000..5884108 --- /dev/null +++ b/codeface_utils/cluster/PersonInfo.py @@ -0,0 +1,54 @@ +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. 
class PersonInfo:
    """ Information about a commiter, and his relation to other commiters"""

    def __init__(self, ID=None, name="", email=""):
        self.ID = ID
        self.name = name
        self.email = email

    def __str__(self):
        # Render as the conventional "Name <email>" form.
        return "{0} <{1}>".format(self.name, self.email)

    def setID(self, ID):
        self.ID = ID

    def getID(self):
        return self.ID

    def setName(self, name):
        self.name = name

    def getName(self):
        # Fall back to the e-mail address when no name is known.
        return self.email if self.name == "" else self.name

    def setEmail(self, email):
        self.email = email

    def getEmail(self):
        return self.email
class idManager(ABC):
    """Abstract base class that assigns stable numeric IDs to contributors.

    Subclasses implement the actual ID lookup (_query_user_id) and the
    reverse lookup (getPersonFromDB); this class provides request caching,
    address normalisation and the PersonInfo bookkeeping.
    """

    def __init__(self):
        self.subsys_names = []

        # Cache identical requests to the server
        self._cache = {}

        # Map IDs to an instance of PersonInfo
        self.persons = {}

        # Map a name, email address, or a combination of both to the numeric ID
        # assigned to the developer
        self.person_ids = {}

        # "Name <email>" fixup for addresses the stdlib parser cannot handle
        self.fixup_emailPattern = re.compile(r'([^<]+)\s+<([^>]+)>')
        # Detects "Surname, Name" so it can be rewritten as "Name Surname"
        self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)')

    @abstractmethod
    def _query_user_id(self, name, email):
        """Resolve a (name, email) pair to a numeric contributor ID.

        NOTE: the declaration previously took a single ``person_id``
        parameter, but getPersonID and every implementation pass
        (name, email); the signature is aligned with actual usage.
        """
        pass

    @abstractmethod
    def getPersonFromDB(self, person_id):
        """Retrieve the stored person record for the given numeric ID."""
        pass

    def getPersonID(self, addr):
        """Obtain a unique ID from contributor identity credentials.

        The ID mapping -- including handling of multiple identities for
        the same person -- is maintained by the concrete subclass
        (e.g. a REST id service or a csv file).
        """

        (name, email) = self._decompose_addr(addr)
        if (name, email) not in self._cache:
            self._cache[(name, email)] = self._query_user_id(name, email)
        ID = self._cache[(name, email)]

        # Construct a local instance of PersonInfo for the contributor
        # if it is not yet available
        if (ID not in self.persons):
            # PersonInfo takes (ID, name, email); the previous call passed
            # self.subsys_names as the first argument, shifting every
            # parameter by one and raising a TypeError at runtime.
            self.persons[ID] = PersonInfo(ID, name, email)

        return ID

    def getPersons(self):
        return self.persons

    def getPI(self, ID):
        return self.persons[ID]

    # We need the subsystem names because PersonInfo instances
    # are created from this class -- and we want to know in which
    # subsystem(s) a developer is active
    def setSubsysNames(self, subsys_names):
        self.subsys_names = subsys_names

    def getSubsysNames(self):
        return self.subsys_names

    def _cleanName(self, name):
        # Remove or replace characters in names that are known
        # to cause parsing problems in later stages
        name = name.replace('\"', "")
        name = name.replace("\'", "")
        name = name.strip()

        return name

    def _decompose_addr(self, addr):
        """Split an address string into a normalised (name, email) tuple,
        applying several fixup heuristics for malformed addresses."""
        addr = addr.replace("[", "").replace("]", "")
        (name, email) = parseaddr(addr)

        # Handle cases where the name is unknown from commits that potentially
        # predate the era of git, where only an e-mail address was given.
        # In such a case, we set the name to the e-mail address. Otherwise,
        # all authors with unknown name would be matched to one person.
        if (name == "unknown" or name == "unknown (none)" or name == "none"):
            name = email

        # The eMail parser cannot handle Surname, Name properly.
        # Provide a fixup hack for this case
        if (name == "" or email.count("@") == 0):
            m = re.search(self.fixup_emailPattern, addr)
            if m:
                name = m.group(1)
                email = m.group(2)
                m2 = re.search(self.commaNamePattern, name)
                if m2:
                    # Replace "Surname, Name" by "Name Surname"
                    name = "{0} {1}".format(m2.group(2), m2.group(1))
            else:
                # check for the following special format: email@domain.tld <>
                strangePattern = re.compile(r'(.*@.*)\s+(<>)')
                m3 = re.search(strangePattern, addr)
                if m3:
                    # Replace addr by "email "
                    name = m3.group(1).split("@")[0]  # get name before @ symbol
                    email = m3.group(1)
                else:
                    # In this case, no eMail address was specified; generate
                    # a random placeholder so the contributor still gets an ID.
                    name = addr
                    rand_str = "".join(random.choice(string.ascii_lowercase + string.digits)
                                       for i in range(10))
                    email = "could.not.resolve@" + rand_str

        email = email.lower()

        name = self._cleanName(name)
        email = self._cleanName(email)

        return (name, email)
+ self._projectID = self._dbm.getProjectID(conf["project"], + conf["tagging"]) + + # Construct request headers + self.headers = {"Content-type": + "application/x-www-form-urlencoded; charset=utf-8", + "Accept": "text/plain"} + + def _query_user_id(self, name, email): + """Query the ID database for a contributor ID""" + + name = encode_as_utf8(name) + params = six.moves.urllib.parse.urlencode({'projectID': self._projectID, + 'name': name, + 'email': email}) + + try: + self._conn.request("POST", "/post_user_id", params, self.headers) + res = self._conn.getresponse() + except: + retryCount = 0 + successful = False + while (retryCount <= 10 and not successful): + log.warning("Could not reach ID service. Try to reconnect " \ + "(attempt {}).".format(retryCount)); + self._conn.close() + self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + time.sleep(60) + #self._conn.ping(True) + try: + self._conn.request("POST", "/post_user_id", params, self.headers) + res = self._conn.getresponse() + successful = True + except: + if retryCount < 10: + retryCount += 1 + else: + retryCount += 1 + log.exception("Could not reach ID service. Is the server running?\n") + raise + + # TODO: We should handle errors by throwing an exception instead + # of silently ignoring them + result = res.read() + jsond = json.loads(result) + try: + id = jsond["id"] + except KeyError: + raise Exception("Bad response from server: '{}'".format(jsond)) + + return (id) + + def getPersonID(self, addr): + """Obtain a unique ID from contributor identity credentials. + + The IDs are managed by a central database accessed via REST. + Managing multiple identities for the same person is also + handled there. Safety against concurrent access is provided by + the database. 
+ """ + + (name, email) = self._decompose_addr(addr) + if not (name, email) in self._cache: + self._cache[(name, email)] = self._query_user_id(name, email) + ID = self._cache[(name, email)] + + # Construct a local instance of PersonInfo for the contributor + # if it is not yet available + if ID not in self.persons: + self.persons[ID] = PersonInfo(self.subsys_names, ID, name, email) + + return ID + + def getPersonFromDB(self, person_id): + """Query the ID database for a contributor and all corresponding data""" + + try: + self._conn.request("GET", "/getUser/{}".format(person_id), headers=self.headers) + res = self._conn.getresponse() + except: + self._conn.close() + self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + retryCount = 0 + successful = False + while (retryCount <= 10 and not successful): + log.warning("Could not reach ID service. Try to reconnect " \ + "(attempt {}).".format(retryCount)); + self._conn.close() + self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + time.sleep(60) + #self._conn.ping(True) + try: + self._conn.request("GET", "/getUser/{}".format(person_id), headers=self.headers) + res = self._conn.getresponse() + successful = True + except: + if retryCount < 10: + retryCount += 1 + else: + retryCount += 1 + log.exception("Could not reach ID service. 
Is the server running?\n") + raise + + result = res.read() + jsond = json.loads(result)[0] + + return (jsond) + diff --git a/codeface/configuration.py b/codeface_utils/configuration.py similarity index 86% rename from codeface/configuration.py rename to codeface_utils/configuration.py index e1fa874..d43f8d4 100644 --- a/codeface/configuration.py +++ b/codeface_utils/configuration.py @@ -19,15 +19,17 @@ Encapsulates a configuration as an immutable dict ''' +from __future__ import absolute_import import yaml from collections.abc import Mapping +from six.moves import range from logging import getLogger -from tempfile import NamedTemporaryFile -from codeface.linktype import LinkType +from codeface_utils.linktype import LinkType + -# create logger log = getLogger(__name__) +from tempfile import NamedTemporaryFile class ConfigurationError(Exception): '''Raised if any part of the configuration is malformed''' @@ -39,8 +41,8 @@ class Configuration(Mapping): ''' GLOBAL_KEYS = ('dbname', 'dbhost', 'dbuser', 'dbpwd', - 'idServiceHostname', 'idServicePort') - GLOBAL_OPTIONAL_KEYS = ('dbport',) + 'idServiceHostname', 'idServicePort') + GLOBAL_OPTIONAL_KEYS = ('dbport', 'useCsv') PROJECT_KEYS = ('project', 'repo', 'tagging', 'revisions', 'rcs') OPTIONAL_KEYS = ('description', 'ml', 'mailinglists', 'sleepTime', 'proxyHost', 'proxyPort', 'bugsProjectName', @@ -48,7 +50,8 @@ class Configuration(Mapping): 'issueTrackerURL', 'issueTrackerProject', 'issueTrackerUser', 'issueTrackerPassword', 'understand', 'sloccount', 'windowSize', 'numWindows', - 'qualityType', 'communicationType', 'artifactType', 'dependencyType') + 'qualityType', 'communicationType', 'artifactType', 'dependencyType', + 'csvFile', 'csvSeparator') ALL_KEYS = set(GLOBAL_KEYS + GLOBAL_OPTIONAL_KEYS + PROJECT_KEYS + OPTIONAL_KEYS) @@ -64,19 +67,19 @@ def __init__(self): self._conf_file_loc = None @classmethod - def load(cls, global_conffile, local_conffile=None): + def load(self, global_conffile, local_conffile=None): 
''' Load configuration from global/local files ''' c = Configuration() log.info("Loading global configuration file '{}'". format(global_conffile)) - cls._global_conf = c._load(global_conffile) + self._global_conf = c._load(global_conffile) c._conf.update(c._global_conf) if local_conffile: log.info("Loading project configuration file '{}'". format(local_conffile)) - cls._project_conf = c._load(local_conffile) + self._project_conf = c._load(local_conffile) c._conf.update(c._project_conf) else: log.info("Not loading project configuration file!") @@ -87,7 +90,7 @@ def load(cls, global_conffile, local_conffile=None): def _load(self, filename): '''Helper function that checks loading errors and logs them''' try: - return yaml.load(open(filename), Loader=yaml.SafeLoader) + return yaml.load(open(filename, 'r'), Loader=yaml.SafeLoader) except IOError: log.exception("Could not open configuration file '{}'". format(filename)) @@ -100,7 +103,7 @@ def _load(self, filename): def _initialize(self): '''Infer missing values in the configuration''' if "rcs" not in self: - self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] + self._conf["rcs"] = [None for x in range(len(self["revisions"]))] if "mailinglists" not in self: self._conf["mailinglists"] = [] @@ -129,12 +132,12 @@ def _check_sanity(self): raise ConfigurationError('Invalid configuration key.') for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: - if key not in self: + if not key in self: log.critical("Required key '{}' missing in configuration!" 
''.format(key)) raise ConfigurationError('Missing configuration key.') - if self['tagging'] not in LinkType.get_all_link_types(): + if not self['tagging'] in LinkType.get_all_link_types(): log.critical('Unsupported tagging mechanism specified!') raise ConfigurationError('Unsupported tagging mechanism.') @@ -147,6 +150,14 @@ def _check_sanity(self): "candidates.".format(len(self["revisions"]), len(self["rcs"]))) raise ConfigurationError('Malformed configuration.') + if self["useCsv"]: + if not "csvFile" in self: + log.critical("Malformed configuration: useCsv is true, but " + "csvFile is not specified.") + raise ConfigurationError('Malformed configuration.') + if not "csvSeparator" in self: + self["csvSeparator"] = "," + unknown_keys = [k for k in self if k not in self.ALL_KEYS] for key in unknown_keys: log.warning("Unknown key '{}' in configuration.".format(key)) diff --git a/codeface/dbmanager.py b/codeface_utils/dbmanager.py similarity index 83% rename from codeface/dbmanager.py rename to codeface_utils/dbmanager.py index df917ca..4f8895d 100644 --- a/codeface/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -17,10 +17,15 @@ # Thin sql database wrapper +from __future__ import absolute_import +from __future__ import print_function import MySQLdb as mdb -from datetime import datetime, timezone -from logging import getLogger +import time +from datetime import datetime +from logging import getLogger; from contextlib import contextmanager +from six.moves import range +from six.moves import zip # create logger log = getLogger(__name__) @@ -44,13 +49,28 @@ class DBManager: """This class provides an interface to the codeface sql database.""" def __init__(self, conf): + + self.conf = conf + self.__openConnection(conf) + + # max_packet_size = 1024 * 1024 * 512 + # self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) + + def __del__(self): + if self.con != None: + self.con.close() + + def __openConnection(self, conf): try: self.con = None self.con = 
mdb.Connection(host=conf["dbhost"], port=conf["dbport"], user=conf["dbuser"], passwd=conf["dbpwd"], - db=conf["dbname"]) + db=conf["dbname"], + charset="utf8", + use_unicode=True) + self.cur = self.con.cursor() log.debug( "Establishing MySQL connection to " "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, DB '{c[dbname]}'" @@ -62,14 +82,7 @@ def __init__(self, conf): ": {e[1]} ({e[0]})" "".format(c=conf, e=e.args)) raise - self.cur = self.con.cursor() - max_packet_size = 1024 * 1024 * 256 - self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) - - def __del__(self): - if self.con is not None: - self.con.close() def doExec(self, stmt, args=None): with _log_db_error(stmt, args): @@ -87,21 +100,53 @@ def doExec(self, stmt, args=None): if dbe.args[0] == 1213: # Deadlock! retry... log.warning("Recoverable deadlock in MySQL - retrying " \ "(attempt {}).".format(retryCount)) + elif dbe.args[0] == 2003: # Can't connect to MySQL server + log.warning("Can't connect to MySQL server - retrying " \ + "(attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") elif dbe.args[0] == 2006: # Server gone away... log.warning("MySQL Server gone away, trying to reconnect " \ "(attempt {}).".format(retryCount)) - self.con.ping(True) - elif dbe.args[0] == 2013: # Lost connection to MySQL server during query... 
+ time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 2013 or dbe.args[0] == 1053: # Lost connection to MySQL server during query | Server shutdown in progress log.warning("Lost connection to MySQL server during query, " \ "trying to reconnect (attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 1153: # Got a packet bigger than 'max_allowed_packet' bytes + log.warning("Sent a too big packet ({lnos} lines), retrying with smaller packets.".format( + lnos=len(args))) + ## split package into smaller packets of size 'chunk_size' + chunk_size = 100 + args_list = [args[i:i + chunk_size] for i in range(0, len(args), chunk_size)] + ## retrying + time.sleep(60) self.con.ping(True) + for chunk in args_list: + self.doExec(stmt, chunk) else: + self.con.close() raise - # Give up after ten retry attempts and propagate the - # problem to the caller. Callers can either fix the problem with - # a different query, or the analysis fails + # Give up after too many retry attempts and propagate the + # problem to the caller. 
Either it's fixed with a different + # query, or the analysis fails log.error("DB access failed after ten attempts, giving up") + self.con.close() raise def doFetchAll(self): @@ -203,13 +248,18 @@ def getTagID(self, projectID, tag, type): format(tag, type, self.cur.rowcount)) return self.doFetchAll()[0][0] - def getCommitId(self, projectId, commitHash): - self.doExec("SELECT id FROM commit" + - " WHERE commitHash=%s AND projectId=%s" - , (commitHash, projectId)) + def getCommitId(self, projectId, commitHash, releaseRangeID=None): + stmt = "SELECT id FROM commit WHERE commitHash=%s AND projectId=%s" + args = (commitHash, projectId) + + if (releaseRangeID): + stmt += " AND releaseRangeId=%s" + args += (releaseRangeID, ) + + self.doExec(stmt, args) if self.cur.rowcount == 0: - raise Exception("Commit from project {} not found!". - format(projectId)) + raise Exception("Commit {0} from project {1} not found!". + format(commitHash, projectId)) return self.doFetchAll()[0][0] def getRevisionID(self, projectID, tag): @@ -394,7 +444,7 @@ def update_release_timeline(self, project, tagging, revs, rcs, previous_rev = None if len(tags) > 0: previous_rev = tags[-1] - for rev, rc in list(zip(revs, rcs))[len(tags):]: + for rev, rc in zip(revs, rcs)[len(tags):]: self.doExecCommit("INSERT INTO release_timeline " "(type, tag, projectId) " "VALUES (%s, %s, %s)", @@ -428,4 +478,4 @@ def update_release_timeline(self, project, tagging, revs, rcs, def tstamp_to_sql(tstamp): """Convert a Unix timestamp into an SQL compatible DateTime string""" - return (datetime.fromtimestamp(tstamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + return (datetime.utcfromtimestamp(tstamp).strftime("%Y-%m-%d %H:%M:%S")) diff --git a/codeface/linktype.py b/codeface_utils/linktype.py similarity index 100% rename from codeface/linktype.py rename to codeface_utils/linktype.py diff --git a/codeface/util.py b/codeface_utils/util.py similarity index 77% rename from codeface/util.py rename to 
codeface_utils/util.py index 807ecc4..d859dcf 100644 --- a/codeface/util.py +++ b/codeface_utils/util.py @@ -17,6 +17,8 @@ Utility functions for running external commands ''' +from __future__ import absolute_import +import logging; log = logging.getLogger(__name__) import os import os.path import re @@ -24,19 +26,24 @@ import signal import sys import traceback +import unicodedata from collections import OrderedDict, namedtuple from glob import glob from math import sqrt -from multiprocessing import Process, Queue, Lock -from pkg_resources import resource_filename +from multiprocessing import Process, Queue, JoinableQueue, Lock +from pickle import dumps, PicklingError +from importlib.resources import files from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile, mkdtemp from time import sleep from threading import enumerate as threading_enumerate -from queue import Empty +from six.moves.queue import Empty from datetime import timedelta, datetime -import logging -log = logging.getLogger(__name__) +from ftfy import fix_encoding +from six.moves import map +import six +from six.moves import range +from six.moves import zip # Represents a job submitted to the batch pool. BatchJobTuple = namedtuple('BatchJobTuple', ['id', 'func', 'args', 'kwargs', @@ -150,10 +157,10 @@ def join(self): # Terminate and join the workers # Wait 100ms to allow backtraces to be logged sleep(0.1) - log.devinfo("Terminating workers...") + log.info("Terminating workers...") for w in self.workers: w.terminate() - log.devinfo("Workers terminated.") + log.info("Workers terminated.") def batchjob_worker_function(work_queue, done_queue): ''' @@ -205,7 +212,7 @@ def handle_sigint(signal, frame): with l: log.fatal("CTRL-C pressed!") for c in get_stack_dump(): - log.devinfo(c) + log.info(c) # This call raises a SystemExit exception in the # stack frame that was interrupted by the signal # For the main thread, this is what we want. 
@@ -216,12 +223,12 @@ def handle_sigint(signal, frame): def handle_sigint_silent(signal, frame): with l: for c in get_stack_dump(): - log.devinfo(c) + log.info(c) logging.shutdown() # Since we want to terminate worker threads with prejudice, # we use os._exit, which directly terminates the process. # otherwise the worker try/catch will also catch the SystemExit - os.exit_(-1) + os._exit(-1) def handle_sigterm(signal, frame): # Since we want to terminate worker threads with prejudice, @@ -290,7 +297,7 @@ def _convert_dot_file(dotfile): ''' res = [] edges = {} - edge_spec = re.compile("\s+(\d+) -> (\d+);") + edge_spec = re.compile(r"\s+(\d+) -> (\d+);") file = open(dotfile, "r") lines = [line.strip("\n") for line in file] @@ -334,12 +341,12 @@ def layout_graph(filename): os.unlink(out.name) def generate_report(start_rev, end_rev, resdir): - log.devinfo(" -> Generating report") + log.info(" -> Generating report") report_base = "report-{0}_{1}".format(start_rev, end_rev) # Run perl script to generate report LaTeX file cmd = [] - cmd.append(resource_filename(__name__, "perl/create_report.pl")) + cmd.append(files(__package__).joinpath("perl/create_report.pl")) cmd.append(resdir) cmd.append("{0}--{1}".format(start_rev, end_rev)) with open(os.path.join(resdir, report_base + ".tex"), 'w') as f: @@ -376,20 +383,20 @@ def generate_reports(start_rev, end_rev, range_resdir): def check4ctags(): # check if the appropriate ctags is installed on the system - prog_name = 'Exuberant Ctags' - prog_version = 'Exuberant Ctags 5.9~svn20110310' - cmd = "ctags-exuberant --version".split() + prog_name = 'Universal Ctags' + prog_version = 'Universal Ctags 5.9.0, Copyright (C) 2015 Universal Ctags Team' + cmd = "ctags-universal --version".split() res = execute_command(cmd, None) if not(res.startswith(prog_name)): log.error("program '{0}' does not exist".format(prog_name)) - raise Exception("ctags-exuberant not found") + raise Exception("ctags-universal not found") if 
not(res.startswith(prog_version)): # TODO: change this to use standard mechanism for error logging log.error("Ctags version '{0}' not found".format(prog_version)) - raise Exception("Incompatible ctags-exuberant version") + raise Exception("Incompatible ctags-universal version") def check4cppstats(): @@ -454,9 +461,9 @@ def get_analysis_windows(conf): window_size_months = 3 num_window = -1 - if "windowSize" in conf.keys(): + if "windowSize" in list(conf.keys()): window_size_months = conf["windowSize"] - if "numWindows" in conf.keys(): + if "numWindows" in list(conf.keys()): num_window = conf["numWindows"] return window_size_months, num_window @@ -473,11 +480,26 @@ def generate_analysis_windows(repo, window_size_months): latest_date_result = execute_command(cmd_date).splitlines()[0] latest_commit = parse_iso_git_date(latest_date_result) + cmd_root_commit_dates = 'git --git-dir={0} log --max-parents=0 --format=%ad --date=iso8601'\ + .format(repo).split() + root_commit_dates_result = execute_command(cmd_root_commit_dates).splitlines() + earliest_root_commit_date = min([parse_iso_git_date(root_commit) for root_commit in root_commit_dates_result]) + print_fmt = "%Y-%m-%dT%H:%M:%S+0000" month = timedelta(days=30) def get_before_arg(num_months): date = latest_commit - num_months * month + + # Due to a bug in git, broken author information in commit objects can lead to a timestamp of 0 when using the + # --before option although the dates themselves are not broken and can be parsed without problems. + # For more details, see the whole thread conversation on the git mailing list here: + # https://lore.kernel.org/git/7728e059-d58d-cce7-c011-fbc16eb22fb9@cs.uni-saarland.de/ + # To avoid running into an infinite while loop below (due to timestamps being 0), check if the date is earlier + # than the date of the earliest root commit and break if this is the case. 
+ if date < earliest_root_commit_date: + raise ValueError("The before-arg date is earlier than the earliest commit in the repository.") + return '--before=' + date.strftime(print_fmt) revs = [] @@ -491,13 +513,20 @@ def get_before_arg(num_months): revs.extend(rev_end) while start != end: - cmd = cmd_base_max1 + [get_before_arg(start)] - rev_start = execute_command(cmd).splitlines() + + try: + cmd = cmd_base_max1 + [get_before_arg(start)] + rev_start = execute_command(cmd).splitlines() + except ValueError as ve: + rev_start = [] + log.info("rev_start would be earlier than earliest root commit. Start at initial commit instead...") if len(rev_start) == 0: start = end - cmd = cmd_base + ['--reverse'] - rev_start = [execute_command(cmd).splitlines()[0]] + #cmd = cmd_base + ['--reverse'] + #rev_start = [execute_command(cmd).splitlines()[0]] + cmd = cmd_base + ['--max-parents=0'] + rev_start = [execute_command(cmd).splitlines()[-1]] else: end = start start = end + window_size_months @@ -525,3 +554,79 @@ def get_before_arg(num_months): rcs = [None for x in range(len(revs))] return revs_hash, rcs, revs_date + + +def encode_as_utf8(string): + """ + Encode the given string properly in UTF-8, + independent from its internal representation (str or unicode). + + This function removes any control characters and four-byte-encoded unicode characters and replaces them + with " ". (Four-byte-encoded unicode characters do not work with 'utf8' encoding of MySQL.) 
+ + :param string: any string + :return: the UTF-8 encoded string of type str + """ + + try: + string = string.decode("utf-8") + except: + # if we have a string, we transform it to unicode + if isinstance(string, str): + string = six.text_type(string, "unicode-escape", errors="replace") + + ## maybe not a string/unicode at all, return rightaway + if not isinstance(string, six.text_type): + return string + + # convert to real unicode-utf8 encoded string, fix_text ensures proper encoding + new_string = fix_encoding(string) + + # remove unicode characters from "Specials" block + # see: https://www.compart.com/en/unicode/block/U+FFF0 + new_string = re.sub(r"\\ufff.", " ", new_string.encode("unicode-escape")) + + # remove all kinds of control characters and emojis + # see: https://www.fileformat.info/info/unicode/category/index.htm + new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string.decode("unicode-escape")) + + new_string = new_string.encode("utf-8") + + # replace any 4-byte characters with a single space (previously: four_byte_replacement) + try: + # UCS-4 build + four_byte_regex = re.compile(u"[\U00010000-\U0010ffff]") + except re.error: + # UCS-2 build + four_byte_regex = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]") + + four_byte_replacement = r" " # r":4bytereplacement:" + new_string = four_byte_regex.sub(four_byte_replacement, new_string.decode("utf-8")).encode("utf-8") + + return str(new_string) + + +def encode_items_as_utf8(items): + """ + Encode the given list/tuple/dict of strings properly in UTF-8, + independent from its internal representation (str or unicode). + + This function uses encode_as_utf8(string) internally. 
+ + :param string: any string + :return: the UTF-8 encoded string of type str + """ + + # unpack values if we have a dictionary + items_unpacked = items + if isinstance(items, dict): + items_unpacked = list(items.values()) + + # encode each item as UTF-8 properly + items_enc = list(map(encode_as_utf8, items_unpacked)) + + # add key for dict again + if isinstance(items, dict): + items_enc = dict(zip(list(items.keys()), items_enc)) + + return items_enc From 347238451357f5a17dd3e3014f7a4bd25872895a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Fri, 29 Aug 2025 12:17:05 +0200 Subject: [PATCH 05/13] Introduce CSV-based IdManager class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the csvIdManager, we can use all functionality but extraction without a codeface installation. The csvIdManager then replaces the MySQL-based IdManager from codeface. Signed-off-by: Maximilian Löffler --- codeface_utils/cluster/idManager.py | 91 ++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 14 deletions(-) diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index b6f68d3..b319511 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -36,8 +36,6 @@ class idManager(ABC): def __init__(self): - self.subsys_names = [] - # Cache identical requests to the server self._cache = {} @@ -52,7 +50,7 @@ def __init__(self): self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)') @abstractmethod - def _query_user_id(self, person_id): + def _query_user_id(self, name, email): pass @abstractmethod @@ -75,7 +73,7 @@ def getPersonID(self, addr): # Construct a local instance of PersonInfo for the contributor # if it is not yet available if (ID not in self.persons): - self.persons[ID] = PersonInfo(self.subsys_names, ID, name, email) + self.persons[ID] = PersonInfo(ID, name, email) return ID @@ -85,15 +83,6 @@ def getPersons(self): def getPI(self, 
ID): return self.persons[ID] - # We need the subsystem names because PersonInfo instances - # are created from this class -- and we want to know in which - # subsystem(s) a developer is active - def setSubsysNames(self, subsys_names): - self.subsys_names = subsys_names - - def getSubsysNames(self): - return self.subsys_names - def _cleanName(self, name): # Remove or replace characters in names that are known # to cause parsing problems in later stages @@ -240,7 +229,7 @@ def getPersonID(self, addr): # Construct a local instance of PersonInfo for the contributor # if it is not yet available if ID not in self.persons: - self.persons[ID] = PersonInfo(self.subsys_names, ID, name, email) + self.persons[ID] = PersonInfo(ID, name, email) return ID @@ -279,3 +268,77 @@ def getPersonFromDB(self, person_id): return (jsond) + +class csvIdManager(idManager): + """Provide unique IDs for developers. + + This class provides an interface to CSV id files. + """ + def __init__(self, conf): + super().__init__() + + # CSV file containing the IDs + self.csv_file = conf["csvFile"] + self.csv_sep = conf["csvSeparator"] + self.df = self._verifyCsvFile() + + def _verifyCsvFile(self): + with open(self.csv_file, "r") as file: + df = pandas.read_csv(file, sep=self.csv_sep, names=['ID', 'name', 'email']) + return df + + def _addRow(self, name, email): + + # determine next ID + max_id = self.df['ID'].max() + next_id = 0 if bool(pandas.isna(max_id)) else int(max_id) + 1 + + # append new row + self.df = self.df._append({ + 'ID': next_id, + 'name': name, + 'email': email + }, ignore_index=True) + + # dump df to file + file = open(self.csv_file, "w") + self.df.to_csv(file, sep=self.csv_sep, index=False, header=False) + + return next_id + + def _query_user_id(self, name, email): + """Query the ID csv file for a contributor ID""" + + # no name is okay, but no email is not + if not email: + return -1 + + # Match by name and email. 
+ # Disregard random string after "could.not.resolve@" in email + # to avoid creating multiple entries for the same person. + if email.startswith("could.not.resolve@"): + rows = self.df[(self.df['name'] == name) & + (self.df['email'].str.startswith("could.not.resolve@"))] + else: + rows = self.df[(self.df['name'] == name) & + (self.df['email'] == email)] + + if len(rows) == 0: + name = '' if not name else name + return self._addRow(name, email) + + elif len(rows) == 1: + return int(rows['ID'].values[0]) + + else: + raise Exception("Constructed author list is in invalid format. Duplicate entries found") + + def getPersonFromDB(self, person_id): + """Get a PersonInfo instance from the database by ID.""" + if person_id not in self.persons: + rows = self.df[self.df['ID'] == person_id] + if len(rows) == 1: + name = rows['name'].values[0] + email = rows['email'].values[0] + self.persons[person_id] = PersonInfo(person_id, name, email) + return self.persons.get(person_id, None) From 202a188a724548ff7db1ed274ae156efe29e0bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sat, 30 Aug 2025 17:12:54 +0200 Subject: [PATCH 06/13] Adapt usage of codeface fragments in all modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 9 ++-- .../author_postprocessing.py | 8 +-- bot_processing/bot_processing.py | 11 ++-- codeface_extraction/codeface_extraction.py | 4 +- codeface_extraction/extractions.py | 3 +- issue_processing/issue_processing.py | 43 ++++++++-------- issue_processing/jira_issue_processing.py | 51 ++++++++++--------- mbox_parsing/mbox_parsing.py | 22 ++++---- 8 files changed, 82 insertions(+), 69 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index b11ef52..fdddebc 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -29,15 +29,14 @@ import sys from os import 
path, walk, makedirs from os.path import abspath -from shutil import copy - -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from logging import getLogger +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +log = getLogger(__name__) + ## # RUN POSTPROCESSING ## diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index c712ac6..f3b0ca9 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -42,14 +42,14 @@ from os import path, walk, makedirs from os.path import abspath from shutil import copy +from logging import getLogger -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager - +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +log = getLogger(__name__) + ## # RUN POSTPROCESSING ## diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 43ff492..14bdd56 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -25,11 +25,14 @@ import urllib.request, urllib.parse, urllib.error import operator -from codeface.cli import log -from codeface.configuration import Configuration +from logging import getLogger +from codeface_utils.configuration import Configuration from csv_writer import csv_writer + +log = getLogger(__name__) + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-bots-github', description='Codeface extraction') @@ -75,7 +78,7 @@ def load_bot_data(bot_file, header = True): :return: the read bot data """ - log.devinfo("Read bot data from file '{}'...".format(bot_file)) + log.info("Read bot data from file '{}'...".format(bot_file)) # check if file exists and exit early if not if not os.path.exists(bot_file): @@ -99,7 +102,7 @@ def load_user_data(user_data_file): :return: the read user data """ - log.devinfo("Read user data from file '{}'...".format(user_data_file)) + log.info("Read user data from file '{}'...".format(user_data_file)) # check if file exists and exit early if not if not os.path.exists(user_data_file): diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index d5df6a0..62b36ef 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -27,8 +27,8 @@ from . import extractions from csv_writer import csv_writer -from codeface.dbmanager import DBManager -from codeface.configuration import Configuration +from codeface_utils.dbmanager import DBManager +from codeface_utils.configuration import Configuration # create logger log = getLogger(__name__) diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index edeefda..b294be2 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -30,7 +30,8 @@ from ftfy import fix_encoding from email.header import decode_header, make_header -from codeface.util import gen_range_path +from codeface_utils.util import gen_range_path + log = getLogger(__name__) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index ff80d53..adbee53 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -30,16 +30,19 @@ import sys import urllib.request, urllib.parse, urllib.error from datetime import datetime, 
timedelta +from logging import getLogger import operator -from codeface.cli import log -from codeface.cluster.idManager import idManager -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from codeface_utils.cluster.idManager import dbIdManager, csvIdManager +from codeface_utils.configuration import Configuration +from codeface_utils.dbmanager import DBManager from dateutil import parser as dateparser from csv_writer import csv_writer + +log = getLogger(__name__) + # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} @@ -95,7 +98,7 @@ def load(source_folder): """ srcfile = os.path.join(source_folder, "issues.json") - log.devinfo("Loading Github issues from file '{}'...".format(srcfile)) + log.info("Loading Github issues from file '{}'...".format(srcfile)) # check if file exists and exit early if not if not os.path.exists(srcfile): @@ -232,7 +235,7 @@ def reformat_issues(issue_data): :return: the re-arranged issue data """ - log.devinfo("Re-arranging Github issues...") + log.info("Re-arranging Github issues...") # re-process all issues for issue in issue_data: @@ -670,10 +673,13 @@ def insert_user_data(issues, conf, resdir): user_id_buffer = dict() # create buffer for usernames (key: username) username_id_buffer = dict() - # open database connection - dbm = DBManager(conf) - # open ID-service connection - idservice = idManager(dbm, conf) + + # connect to ID service + if conf["useCsv"]: + idservice = csvIdManager(conf) + else: + dbm = DBManager(conf) + idservice = dbIdManager(dbm, conf) def get_user_string(name, email): if not email or email is None: @@ -683,26 +689,23 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - username = str(user["username"]).encode("utf-8") # fix encoding for name and 
e-mail address - if user["name"] is not None: - name = str(user["name"]).encode("utf-8") - else: - name = username - mail = str(user["email"]).encode("utf-8") + name = user["name"] if "name" in user else str(user["username"]) + mail = user["email"] # empty + # construct string for ID service and send query user_string = get_user_string(name, mail) # check buffer to reduce amount of DB queries if user_string in buffer_db_ids: - log.devinfo("Returning person id for user '{}' from buffer.".format(user_string)) + log.info("Returning person id for user '{}' from buffer.".format(user_string)) if username is not None: buffer_usernames[username] = buffer_db_ids[user_string] return buffer_db_ids[user_string] # get person information from ID service - log.devinfo("Passing user '{}' to ID service.".format(user_string)) + log.info("Passing user '{}' to ID service.".format(user_string)) idx = idservice.getPersonID(user_string) # add user information to buffer @@ -719,11 +722,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # check whether user information is in buffer to reduce amount of DB queries if idx in buffer_db: - log.devinfo("Returning user '{}' from buffer.".format(idx)) + log.info("Returning user '{}' from buffer.".format(idx)) return buffer_db[idx] # get person information from ID service - log.devinfo("Passing user id '{}' to ID service.".format(idx)) + log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) user = dict() user["email"] = person["email1"] # column "email1" diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index fa3e826..5d763fa 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -29,15 +29,15 @@ import time import csv import json +from logging import getLogger from xml.dom.minidom import parse from datetime import datetime from dateutil import parser as dateparser -from codeface.cli import log -from 
codeface.cluster.idManager import idManager -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from codeface_utils.cluster.idManager import dbIdManager, csvIdManager +from codeface_utils.configuration import Configuration +from codeface_utils.dbmanager import DBManager from csv_writer import csv_writer @@ -49,6 +49,9 @@ importlib.reload(sys) sys.setdefaultencoding("utf-8") + +log = getLogger(__name__) + # global counter for JIRA requests to make sure to not exceed the request limit jira_request_counter = 0 max_requests = 45000 # 50,000 JIRA requests per 24 hours are allowed @@ -173,7 +176,7 @@ def load_xml(source_folder, xml_file): """ srcfile = os.path.join(source_folder, xml_file) - log.devinfo("Loading issues from file '{}'...".format(srcfile)) + log.info("Loading issues from file '{}'...".format(srcfile)) try: # parse the xml-file @@ -373,7 +376,7 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): text = comment_x.firstChild if text is None: - log.warn("Empty comment in issue " + issue["id"]) + log.warning("Empty comment in issue " + issue["id"]) comment["text"] = "" else: comment["text"] = text.data @@ -441,7 +444,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): api_issue = jira_project.issue(issue["externalId"], expand="changelog") changelog = api_issue.changelog except JIRAError: - log.warn("JIRA Error: Changelog cannot be extracted for issue " + issue["externalId"] + ". History omitted!") + log.warning("JIRA Error: Changelog cannot be extracted for issue " + issue["externalId"] + ". 
History omitted!") changelog = None histories = list() @@ -479,7 +482,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: - log.warn("No author for history: " + str(change.id) + " created at " + str(change.created)) + log.warning("No author for history: " + str(change.id) + " created at " + str(change.created)) user = create_user("","","") history["author"] = merge_user_with_user_from_csv(user, persons) history["date"] = format_time(change.created) @@ -499,7 +502,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: - log.warn("No author for history: " + str(change.id) + " created at " + str(change.created)) + log.warning("No author for history: " + str(change.id) + " created at " + str(change.created)) user = create_user("","","") history["author"] = merge_user_with_user_from_csv(user, persons) history["date"] = format_time(change.created) @@ -591,10 +594,13 @@ def insert_user_data(issues, conf): user_buffer = dict() # create buffer for user ids (key: user string) user_id_buffer = dict() - # open database connection - dbm = DBManager(conf) - # open ID-service connection - idservice = idManager(dbm, conf) + + # connect to ID service + if conf["useCsv"]: + idservice = csvIdManager(conf) + else: + dbm = DBManager(conf) + idservice = dbIdManager(dbm, conf) def get_user_string(name, email): if not email or email is None: @@ -604,22 +610,21 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): + # fix encoding for name and e-mail address - if user["name"] is not None and user["name"] != "": - name = str(user["name"]).encode("utf-8") - else: - name = str(user["username"]).encode("utf-8") - mail = str(user["email"]).encode("utf-8") 
# empty + name = user["name"] if "name" in user else str(user["username"]) + mail = user["email"] + # construct string for ID service and send query user_string = get_user_string(name, mail) # check buffer to reduce amount of DB queries if user_string in buffer_db_ids: - log.devinfo("Returning person id for user '{}' from buffer.".format(user_string)) + log.info("Returning person id for user '{}' from buffer.".format(user_string)) return buffer_db_ids[user_string] # get person information from ID service - log.devinfo("Passing user '{}' to ID service.".format(user_string)) + log.info("Passing user '{}' to ID service.".format(user_string)) idx = idservice.getPersonID(user_string) # add user information to buffer @@ -632,11 +637,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # check whether user information is in buffer to reduce amount of DB queries if idx in buffer_db: - log.devinfo("Returning user '{}' from buffer.".format(idx)) + log.info("Returning user '{}' from buffer.".format(idx)) return buffer_db[idx] # get person information from ID service - log.devinfo("Passing user id '{}' to ID service.".format(idx)) + log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) user = dict() user["email"] = person["email1"] # column "email1" @@ -1021,7 +1026,7 @@ def find_first_existing(source_folder, filenames): log.error("Person files '{}' do not exist! 
Exiting early...".format(person_files)) sys.exit(-1) - log.devinfo("Loading person csv from file '{}'...".format(srcfile)) + log.info("Loading person csv from file '{}'...".format(srcfile)) with open(srcfile, "r") as f: person_data = csv.DictReader(f, delimiter=",", skipinitialspace=True) persons_by_username = {} diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index 92d8cb9..ef9ae41 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -28,18 +28,20 @@ import shutil import sys from os.path import abspath +from logging import getLogger -from codeface.cli import log -from codeface.configuration import Configuration from joblib import Parallel, delayed from whoosh import index # import create_in, open_dir, exists_in from whoosh.analysis import StandardAnalyzer from whoosh.fields import Schema, TEXT, ID from whoosh.qparser import QueryParser +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +log = getLogger(__name__) + def __get_index(mbox, mbox_path, results_folder, schema, reindex): """Initialize the search index (and create it, if needed @@ -56,13 +58,13 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): index_path = os.path.join(results_folder, "mbox-index", os.path.basename(mbox_path)) # 1) if reindexing, remove the index folder if os.path.exists(index_path) and reindex: - log.devinfo("Removing index from path '{}'...".format(index_path)) + log.info("Removing index from path '{}'...".format(index_path)) shutil.rmtree(index_path) # 2) Check if we need to create the index for Whoosh full-text search - log.devinfo("Checking for index in results folder...") + log.info("Checking for index in results folder...") if (not os.path.exists(index_path)) or (not index.exists_in(index_path)): # 2.1) create index - log.devinfo("Creating index for text search in results folder.") + log.info("Creating index for text search in results folder.") os.makedirs(index_path) # 
create path index.create_in(index_path, schema) # initialize as index path ix = index.open_dir(index_path) # open as index path @@ -71,10 +73,10 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): for message in mbox: writer.add_document(messageID=str(message['message-id']), content=__mbox_getbody(message)) writer.commit() - log.devinfo("Index created, parsing will begin now.") + log.info("Index created, parsing will begin now.") else: # 2.2) load index - log.devinfo("Index has already been created, parsing will begin right away.") + log.info("Index has already been created, parsing will begin right away.") ix = index.open_dir(index_path) return ix @@ -131,8 +133,8 @@ def __mbox_getbody(message): body = message.get_payload(decode=True) if body is None: - log.devinfo(message.get_content_type()) - log.devinfo( + log.info(message.get_content_type()) + log.info( "An image or some other content has been found that cannot be indexed. Message is given an empty body.") body = ' ' @@ -149,7 +151,7 @@ def __parse_execute(artifact, schema, my_index, include_filepath): :return: a match list of tuples (file name, artifact, message ID) """ - log.devinfo("Searching for artifact ({}, {})...".format(artifact[0], artifact[1])) + log.info("Searching for artifact ({}, {})...".format(artifact[0], artifact[1])) result = [] From 9df79c0b67742378795325effeaa511d4e34d25c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sat, 30 Aug 2025 17:14:16 +0200 Subject: [PATCH 07/13] Increase consistency with python3 conventions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 10 +++--- .../author_postprocessing.py | 24 +++++++------- bot_processing/bot_processing.py | 8 ++--- codeface_utils/cluster/idManager.py | 33 ++++++++++--------- codeface_utils/configuration.py | 21 ++++++------ codeface_utils/dbmanager.py | 11 +++----
csv_writer/csv_writer.py | 7 ++-- issue_processing/issue_processing.py | 23 ++++++------- issue_processing/jira_issue_processing.py | 9 ++--- 9 files changed, 67 insertions(+), 79 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index fdddebc..bcd76b1 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -103,13 +103,13 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # Don't anonymize the deleted user as this one might be needed for filtering (but add it to the dictionary) if orig_author == "Deleted user" and orig_email == "ghost@github.com": - if not (orig_author, orig_email) in author_to_anonymized_author: + if (orig_author, orig_email) not in author_to_anonymized_author: author_to_anonymized_author[(orig_author, orig_email)] = (orig_author, orig_email) else: # check whether (name, e-mail) pair isn't already present in the dictionary - if not (orig_author, orig_email) in author_to_anonymized_author: + if (orig_author, orig_email) not in author_to_anonymized_author: # check if just the name (without e-mail address) isn't already present in the dictionary - if not orig_author in author_to_anonymized_author: + if orig_author not in author_to_anonymized_author: # if the author has an empty name, only anonymize their e-mail address if not author[1] == "": author[1] = ("developer" + str(i)) @@ -140,7 +140,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # Check for all files in the result directory of the project whether they need to be anonymized - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Anonymize authors lists if authors_list in filenames: @@ -169,7 +169,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # anonymize authors author_data, i, author_to_anonymized_author = \ anonymize_authors(author_data, i, 
author_to_anonymized_author) - + author_data_gender, i_gender, author_to_anonymized_author_gender = \ anonymize_authors(author_data_gender, i_gender, author_to_anonymized_author_gender, name_only = True) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index f3b0ca9..4ec0600 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -67,7 +67,7 @@ def perform_data_backup(results_path, results_path_backup): log.info("Backup folder already exists. No backup is to be performed.") return - for filepath, dirnames, filenames in walk(results_path): + for filepath, _, filenames in walk(results_path): for filename in filenames: if filename.endswith(".list"): current_file = path.join(filepath, filename) @@ -119,7 +119,7 @@ def is_github_noreply_author(name, email): # Check for all files in the result directory of the project whether they need to be adjusted - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Remove author 'GitHub ' from authors list if authors_list in filenames: @@ -148,7 +148,7 @@ def is_github_noreply_author(name, email): if not is_github_noreply_author(email[0], email[1]): email_data_new.append(email) else: - log.warn("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) + log.warning("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) csv_writer.write_to_csv(f, email_data_new) @@ -198,19 +198,19 @@ def is_github_noreply_author(name, email): # ignore merge commits in the commit data, we consistently ignore them also if they are added # to a pull request. Hence, the corresponding "commit_added" event will be removed now (i.e., # not added to the new issue data any more). - log.warn("Commit %s is added in the GitHub issue data, but not part of the commit data. 
" + - "Remove the corresponding 'commit_added' event from the issue data...", commit_hash) + log.warning("Commit %s is added in the GitHub issue data, but not part of the commit data. " + + "Remove the corresponding 'commit_added' event from the issue data...", commit_hash) elif is_github_noreply_author(event[9], event[10]): # the event is authored by 'GitHub ', but is not a "commit_added" event, so we # neglect this event and remove it now (i.e., not add it to the new issue data any more). - log.warn("Event %s is authored by %s <%s>. Remove this event form the issue data...", - event[8], event[9], event[10]) + log.warning("Event %s is authored by %s <%s>. Remove this event form the issue data...", + event[8], event[9], event[10]) elif (is_github_noreply_author(event[12], event[13][1:-1]) and (event[8] == mentioned_event or event[8] == subscribed_event)): # the event references 'GitHub ', so we neglect this event and remove it now # (i.e., not add it to the new issue data any more). - log.warn("Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...", - event[8], event[9], event[10], event[12], event[13]) + log.warning("Event %s by %s <%s> references %s <%s>. 
Remove this event from the issue data...", + event[8], event[9], event[10], event[12], event[13]) else: issue_data_new.append(event) @@ -229,7 +229,7 @@ def is_github_noreply_author(name, email): if not is_github_noreply_author(entry[0], entry[1]): bot_data_new.append(entry) else: - log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) + log.warning("Remove entry %s <%s> from bots list.", entry[0], entry[1]) csv_writer.write_to_csv(f, bot_data_new) @@ -285,7 +285,7 @@ def run_postprocessing(conf, resdir, backup_data): return # Check for all files in the result directory of the project whether they need to be adjusted - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Adjust authors lists if authors_list in filenames: @@ -302,7 +302,7 @@ def run_postprocessing(conf, resdir, backup_data): for author in author_data: # keep author entry only if it should not be removed - if not author in author_data_to_remove: + if author not in author_data_to_remove: author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 14bdd56..00cf099 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -19,12 +19,8 @@ """ import argparse -import http.client import os import sys -import urllib.request, urllib.parse, urllib.error - -import operator from logging import getLogger from codeface_utils.configuration import Configuration @@ -200,7 +196,7 @@ def add_user_data(bot_data, user_data, known_bots_file): bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: - log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) + log.warning("User '{}' in bot data does not occur in GitHub user data. 
Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) @@ -227,7 +223,7 @@ def print_to_disk(bot_data, results_folder): user["user"]["email"], user["prediction"] ) - if not entry in lines: + if entry not in lines: lines.append(entry) # write to output file diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index b319511..52cd04b 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -18,21 +18,22 @@ from __future__ import absolute_import import re from email.utils import parseaddr -from logging import getLogger; log = getLogger(__name__) -import six.moves.http_client -import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error +from logging import getLogger +import http.client as http_client +import urllib.parse as urlparse import json import string import random import time -from ..util import encode_as_utf8 -from six.moves import range from abc import ABC, abstractmethod import pandas from codeface_utils.cluster.PersonInfo import PersonInfo +from ..util import encode_as_utf8 +log = getLogger(__name__) + class idManager(ABC): def __init__(self): @@ -130,7 +131,7 @@ def _decompose_addr(self, addr): # print("Fixup for email required, but FAILED for {0}".format(addr)) name = addr rand_str = "".join(random.choice(string.ascii_lowercase + string.digits) - for i in range(10)) + for _ in range(10)) email = "could.not.resolve@" + rand_str email = email.lower() @@ -153,7 +154,7 @@ def __init__(self, dbm, conf): self._idMgrServer = conf["idServiceHostname"] self._idMgrPort = conf["idServicePort"] - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) # Create a project ID self._dbm = 
dbm @@ -172,9 +173,9 @@ def _query_user_id(self, name, email): """Query the ID database for a contributor ID""" name = encode_as_utf8(name) - params = six.moves.urllib.parse.urlencode({'projectID': self._projectID, - 'name': name, - 'email': email}) + params = urlparse.urlencode({'projectID': self._projectID, + 'name': name, + 'email': email}) try: self._conn.request("POST", "/post_user_id", params, self.headers) @@ -184,9 +185,9 @@ def _query_user_id(self, name, email): successful = False while (retryCount <= 10 and not successful): log.warning("Could not reach ID service. Try to reconnect " \ - "(attempt {}).".format(retryCount)); + "(attempt {}).".format(retryCount)) self._conn.close() - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) time.sleep(60) #self._conn.ping(True) try: @@ -222,7 +223,7 @@ def getPersonID(self, addr): """ (name, email) = self._decompose_addr(addr) - if not (name, email) in self._cache: + if (name, email) not in self._cache: self._cache[(name, email)] = self._query_user_id(name, email) ID = self._cache[(name, email)] @@ -241,14 +242,14 @@ def getPersonFromDB(self, person_id): res = self._conn.getresponse() except: self._conn.close() - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) retryCount = 0 successful = False while (retryCount <= 10 and not successful): log.warning("Could not reach ID service. 
Try to reconnect " \ - "(attempt {}).".format(retryCount)); + "(attempt {}).".format(retryCount)) self._conn.close() - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) time.sleep(60) #self._conn.ping(True) try: diff --git a/codeface_utils/configuration.py b/codeface_utils/configuration.py index d43f8d4..9b437b6 100644 --- a/codeface_utils/configuration.py +++ b/codeface_utils/configuration.py @@ -20,16 +20,15 @@ ''' from __future__ import absolute_import -import yaml +from tempfile import NamedTemporaryFile from collections.abc import Mapping -from six.moves import range from logging import getLogger +import yaml from codeface_utils.linktype import LinkType log = getLogger(__name__) -from tempfile import NamedTemporaryFile class ConfigurationError(Exception): '''Raised if any part of the configuration is malformed''' @@ -67,19 +66,19 @@ def __init__(self): self._conf_file_loc = None @classmethod - def load(self, global_conffile, local_conffile=None): + def load(cls, global_conffile, local_conffile=None): ''' Load configuration from global/local files ''' c = Configuration() log.info("Loading global configuration file '{}'". format(global_conffile)) - self._global_conf = c._load(global_conffile) + cls._global_conf = c._load(global_conffile) c._conf.update(c._global_conf) if local_conffile: log.info("Loading project configuration file '{}'". 
format(local_conffile)) - self._project_conf = c._load(local_conffile) + cls._project_conf = c._load(local_conffile) c._conf.update(c._project_conf) else: log.info("Not loading project configuration file!") @@ -103,7 +102,7 @@ def _load(self, filename): def _initialize(self): '''Infer missing values in the configuration''' if "rcs" not in self: - self._conf["rcs"] = [None for x in range(len(self["revisions"]))] + self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] if "mailinglists" not in self: self._conf["mailinglists"] = [] @@ -132,12 +131,12 @@ def _check_sanity(self): raise ConfigurationError('Invalid configuration key.') for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: - if not key in self: + if key not in self: log.critical("Required key '{}' missing in configuration!" ''.format(key)) raise ConfigurationError('Missing configuration key.') - if not self['tagging'] in LinkType.get_all_link_types(): + if self['tagging'] not in LinkType.get_all_link_types(): log.critical('Unsupported tagging mechanism specified!') raise ConfigurationError('Unsupported tagging mechanism.') @@ -151,11 +150,11 @@ def _check_sanity(self): raise ConfigurationError('Malformed configuration.') if self["useCsv"]: - if not "csvFile" in self: + if "csvFile" not in self: log.critical("Malformed configuration: useCsv is true, but " "csvFile is not specified.") raise ConfigurationError('Malformed configuration.') - if not "csvSeparator" in self: + if "csvSeparator" not in self: self["csvSeparator"] = "," unknown_keys = [k for k in self if k not in self.ALL_KEYS] diff --git a/codeface_utils/dbmanager.py b/codeface_utils/dbmanager.py index 4f8895d..8170d12 100644 --- a/codeface_utils/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -21,11 +21,10 @@ from __future__ import print_function import MySQLdb as mdb import time -from datetime import datetime -from logging import getLogger; +from datetime import datetime, timezone +from logging import getLogger from contextlib import 
contextmanager -from six.moves import range -from six.moves import zip + # create logger log = getLogger(__name__) @@ -57,7 +56,7 @@ def __init__(self, conf): # self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) def __del__(self): - if self.con != None: + if self.con is not None: self.con.close() def __openConnection(self, conf): @@ -478,4 +477,4 @@ def update_release_timeline(self, project, tagging, revs, rcs, def tstamp_to_sql(tstamp): """Convert a Unix timestamp into an SQL compatible DateTime string""" - return (datetime.utcfromtimestamp(tstamp).strftime("%Y-%m-%d %H:%M:%S")) + return datetime.fromtimestamp(tstamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index b41463a..fac9205 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -45,14 +45,13 @@ def write_to_csv(file_path, lines, append=False): :param append: Flag if lines shall be appended to file or overwrite file """ - open_mode = "a+b" if append else "wb" + open_mode = "a" if append else "w" - with open(file_path, open_mode) as csv_file: + with open(file_path, mode=open_mode, encoding="utf-8") as csv_file: wr = csv.writer(csv_file, delimiter=';', lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC) # encode in proper UTF-8 before writing to file for line in lines: - line_encoded = __encode(line) - wr.writerow(line_encoded) + wr.writerow(line) def read_from_csv(file_path, delimiter=";"): """ diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index adbee53..0274bd8 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -24,15 +24,12 @@ """ import argparse -import http.client import json import os import sys -import urllib.request, urllib.parse, urllib.error from datetime import datetime, timedelta from logging import getLogger -import operator from codeface_utils.cluster.idManager import dbIdManager, csvIdManager from 
codeface_utils.configuration import Configuration from codeface_utils.dbmanager import DBManager @@ -194,7 +191,7 @@ def lookup_user(user_dict, user): user["email"] is None or user["email"] == ""): # lookup user only if username is not None and not empty - if not user["username"] is None and not user["username"] == "": + if user["username"] is not None and not user["username"] == "": user = user_dict[user["username"]] return user @@ -213,8 +210,8 @@ def update_user_dict(user_dict, user): if user is None: user = create_deleted_user() - if not user["username"] in list(user_dict.keys()): - if not user["username"] is None and not user["username"] == "": + if user["username"] not in list(user_dict.keys()): + if user["username"] is not None and not user["username"] == "": user_dict[user["username"]] = user else: user_in_dict = user_dict[user["username"]] @@ -425,7 +422,7 @@ def merge_issue_events(issue_data): # add dismissal comments to the list of comments for event in issue["eventsList"]: - if (event["event"] == "review_dismissed" and not event["dismissalMessage"] is None + if (event["event"] == "review_dismissed" and event["dismissalMessage"] is not None and not event["dismissalMessage"] == ""): dismissalComment = dict() dismissalComment["event"] = "commented" @@ -503,7 +500,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # updates all the issues by the temporarily stored referenced_by events - for key, value in issue_data_to_update.items(): + for _, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] @@ -538,7 +535,7 @@ def reformat_events(issue_data): users = update_user_dict(users, event["user"]) # 3) add or update users which are ref_target of the current event - if not event["ref_target"] is None and not event["ref_target"] == "": + if event["ref_target"] is not None and not 
event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) # as the user dictionary is created, start re-formating the event information of all issues @@ -639,7 +636,7 @@ def reformat_events(issue_data): event["event_info_1"] = issue["state_new"] event["event_info_2"] = issue["resolution"] - elif event["event"] == "referenced" and not event["commit"] is None: + elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits # as they are handled as referenced commit events_to_remove.append(event) @@ -729,9 +726,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) user = dict() - user["email"] = person["email1"] # column "email1" - user["name"] = person["name"] # column "name" - user["id"] = person["id"] # column "id" + user["email"] = person.getEmail() # column "email1" + user["name"] = person.getName() # column "name" + user["id"] = person.getID() # column "id" # add user information to buffer buffer_db[idx] = user diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 5d763fa..032bb86 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -26,13 +26,11 @@ import argparse import os import sys -import time import csv import json from logging import getLogger from xml.dom.minidom import parse -from datetime import datetime from dateutil import parser as dateparser from codeface_utils.cluster.idManager import dbIdManager, csvIdManager @@ -47,7 +45,6 @@ import importlib importlib.reload(sys) -sys.setdefaultencoding("utf-8") log = getLogger(__name__) @@ -294,7 +291,7 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): resolved = issue_x.getElementsByTagName("resolved") issue["resolveDate"] = "" - if (len(resolved) > 0) and (not resolved[0] is None): + if (len(resolved) > 0) 
and (resolved[0] is not None): resolveDate = resolved[0].firstChild.data issue["resolveDate"] = format_time(resolveDate) @@ -1032,10 +1029,10 @@ def find_first_existing(source_folder, filenames): persons_by_username = {} persons_by_name = {} for row in person_data: - if not row["AuthorID"] in list(persons_by_username.keys()): + if row["AuthorID"] not in list(persons_by_username.keys()): author_id_utf8 = str(row["AuthorID"]).encode("utf-8") persons_by_username[author_id_utf8] = (row["AuthorName"], row["userEmail"]) - if not row["AuthorName"] in list(persons_by_name.keys()): + if row["AuthorName"] not in list(persons_by_name.keys()): author_name_utf8 = str(row["AuthorName"]).encode("utf-8") persons_by_name[author_name_utf8] = (row["AuthorName"], row["userEmail"]) From e9372fede49d86248701e1fd5536b0ae0a633838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 18 Sep 2025 16:20:16 +0200 Subject: [PATCH 08/13] Update copyright headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 1 + .../author_postprocessing.py | 1 + bot_processing/bot_processing.py | 1 + codeface_extraction/codeface_extraction.py | 1 + codeface_extraction/extractions.py | 1 + codeface_utils/cluster/PersonInfo.py | 9 +++-- codeface_utils/cluster/idManager.py | 12 +++++-- codeface_utils/configuration.py | 11 ++++-- codeface_utils/dbmanager.py | 1 + codeface_utils/linktype.py | 35 ++++++++++--------- codeface_utils/util.py | 35 +++++++++++-------- csv_writer/csv_writer.py | 1 + issue_processing/issue_processing.py | 1 + issue_processing/jira_issue_processing.py | 2 +- mbox_parsing/mbox_parsing.py | 1 + 15 files changed, 72 insertions(+), 41 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index bcd76b1..251675b 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -15,6 +15,7 @@ # Copyright 
2015-2017 by Claus Hunsen # Copyright 2021 by Thomas Bock # Copyright 2022 by Christian Hechtl +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to anonymize authors and issue titles after the extraction from the Codeface database was performed. diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 4ec0600..53caeb2 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 00cf099..a4b56c5 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 62b36ef..05d42c5 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -15,6 +15,7 @@ # Copyright 2015-2017 by Claus Hunsen # Copyright 2016, 2018-2019 by Thomas Bock # Copyright 2018 by Barbara Eckl +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract developer--artifact relations from the Codeface database. 
diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index b294be2..9c636dd 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -17,6 +17,7 @@ # Copyright 2019, 2021 by Thomas Bock # Copyright 2018 by Barbara Eckl # Copyright 2018 by Tina Schuh +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file provides the class 'Extraction' and all of its subclasses. diff --git a/codeface_utils/cluster/PersonInfo.py b/codeface_utils/cluster/PersonInfo.py index 5884108..69fe1bb 100644 --- a/codeface_utils/cluster/PersonInfo.py +++ b/codeface_utils/cluster/PersonInfo.py @@ -1,6 +1,6 @@ -# This file is part of Codeface. Codeface is free software: you can -# redistribute it and/or modify it under the terms of the GNU General Public -# License as published by the Free Software Foundation, version 2. +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS @@ -14,6 +14,9 @@ # Copyright 2010, 2011 by Wolfgang Mauerer # Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer # All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/cluster/PersonInfo.py from __future__ import absolute_import diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 52cd04b..132ae28 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -1,6 +1,6 @@ -# This file is part of Codeface. 
Codeface is free software: you can -# redistribute it and/or modify it under the terms of the GNU General Public -# License as published by the Free Software Foundation, version 2. +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS @@ -13,7 +13,13 @@ # # Copyright 2010, 2011 by Wolfgang Mauerer # Copyright 2012, 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/cluster/idManager.py +# We inherit the 'idManager' and 'dbIdManager' classes from codeface. +# The 'csvManager' class is original. from __future__ import absolute_import import re diff --git a/codeface_utils/configuration.py b/codeface_utils/configuration.py index 9b437b6..78bdc6f 100644 --- a/codeface_utils/configuration.py +++ b/codeface_utils/configuration.py @@ -1,6 +1,6 @@ -# This file is part of Codeface. Codeface is free software: you can -# redistribute it and/or modify it under the terms of the GNU General Public -# License as published by the Free Software Foundation, version 2. +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. 
# # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS @@ -12,7 +12,12 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Copyright 2013 by Siemens AG, Johannes Ebke +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/configuration.py + ''' Configuration module for codeface diff --git a/codeface_utils/dbmanager.py b/codeface_utils/dbmanager.py index 8170d12..aecc172 100644 --- a/codeface_utils/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -13,6 +13,7 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. # Thin sql database wrapper diff --git a/codeface_utils/linktype.py b/codeface_utils/linktype.py index db08a80..c9b0dfb 100644 --- a/codeface_utils/linktype.py +++ b/codeface_utils/linktype.py @@ -1,19 +1,22 @@ -## This file is part of Codeface. Codeface is free software: you can -## redistribute it and/or modify it under the terms of the GNU General Public -## License as published by the Free Software Foundation, version 2. -## -## This program is distributed in the hope that it will be useful, but WITHOUT -## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -## details. -## -## You should have received a copy of the GNU General Public License -## along with this program; if not, write to the Free Software -## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -## -## Copyright 2013 by Siemens AG, Wolfgang Mauerer -## Copyright 2014 by Matthias Dittrich -## All Rights Reserved. 
+# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2014 by Matthias Dittrich +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/linktype.py __author__ = 'drag0on' diff --git a/codeface_utils/util.py b/codeface_utils/util.py index d859dcf..82ad09c 100644 --- a/codeface_utils/util.py +++ b/codeface_utils/util.py @@ -1,18 +1,23 @@ -## This file is part of Codeface. Codeface is free software: you can -## redistribute it and/or modify it under the terms of the GNU General Public -## License as published by the Free Software Foundation, version 2. -## -## This program is distributed in the hope that it will be useful, but WITHOUT -## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -## details. -## -## You should have received a copy of the GNU General Public License -## along with this program; if not, write to the Free Software -## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -## -## Copyright 2013 by Siemens AG, Wolfgang Mauerer -## All Rights Reserved. 
+# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/util.py + ''' Utility functions for running external commands ''' diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index fac9205..f950694 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -15,6 +15,7 @@ # Copyright 2017 by Claus Hunsen # Copyright 2018 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file provides the needed functions for standardized CSV writing diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 0274bd8..3b53cab 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract Github issue data from json files. 
diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 032bb86..bfc2214 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -17,7 +17,7 @@ # Copyright 2018 by Barbara Eckl # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock -# Copyright 2023 by Maximilian Löffler +# Copyright 2023, 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index ef9ae41..be337fa 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -15,6 +15,7 @@ # Copyright 2017 by Raphael Nömmer # Copyright 2017-2019 by Claus Hunsen # Copyright 2018-2019 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract artifact occurrences in e-mail within mbox files. From eef2c8ba66f4d32c6319c3a0bcdcd3eae5babec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 18 Sep 2025 18:02:51 +0200 Subject: [PATCH 09/13] Remove unused code artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- codeface_utils/cluster/PersonInfo.py | 57 --- codeface_utils/cluster/idManager.py | 47 +-- codeface_utils/linktype.py | 2 - codeface_utils/util.py | 562 +-------------------------- csv_writer/csv_writer.py | 13 - 5 files changed, 16 insertions(+), 665 deletions(-) delete mode 100644 codeface_utils/cluster/PersonInfo.py diff --git a/codeface_utils/cluster/PersonInfo.py b/codeface_utils/cluster/PersonInfo.py deleted file mode 100644 index 69fe1bb..0000000 --- a/codeface_utils/cluster/PersonInfo.py +++ /dev/null @@ -1,57 +0,0 @@ -# This file is part of codeface-extraction, which is free software: you -# can redistribute it and/or modify it under the terms of the GNU General -# Public 
License as published by the Free Software Foundation, version 2. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# Copyright 2010, 2011 by Wolfgang Mauerer -# Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer -# All Rights Reserved. -# -# The code in this file originates from: -# https://github.com/siemens/codeface/blob/master/codeface/cluster/PersonInfo.py - -from __future__ import absolute_import - - -class PersonInfo: - """ Information about a commiter, and his relation to other commiters""" - - def __init__(self, ID=None, name="", email=""): - self.ID = ID - self.name = name - self.email = email - - def __str__(self): - return self.name + " <" + self.email + ">" - - def setID(self, ID): - self.ID = ID - def getID(self): - return self.ID - - def setName(self, name): - self.name = name - def getName(self): - if self.name == "": - return self.email - return self.name - - def setEmail(self, email): - self.email = email - def getEmail(self): - return self.email - - -############################ Test cases ######################### -if __name__ == "__main__": - personInfo = PersonInfo("sepp") - -# TODO: Implement a couple of test cases diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 132ae28..5e64c90 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -34,7 +34,6 @@ from abc import ABC, abstractmethod import pandas -from codeface_utils.cluster.PersonInfo import PersonInfo from ..util import encode_as_utf8 @@ -46,13 +45,6 @@ def __init__(self): # Cache identical requests to 
the server self._cache = {} - # Map IDs to an instance of PersonInfo - self.persons = {} - - # Map a name, email address, or a combination of both to the numeric ID - # assigned to the developer - self.person_ids = {} - self.fixup_emailPattern = re.compile(r'([^<]+)\s+<([^>]+)>') self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)') @@ -65,31 +57,15 @@ def getPersonFromDB(self, person_id): pass def getPersonID(self, addr): - """Obtain a unique ID from contributor identity credentials. - - The IDs are managed by a central csv file. - Managing multiple identities for the same person is also - handled there. - """ + """Obtain a unique ID from contributor identity credentials.""" (name, email) = self._decompose_addr(addr) if (name, email) not in self._cache: self._cache[(name, email)] = self._query_user_id(name, email) ID = self._cache[(name, email)] - # Construct a local instance of PersonInfo for the contributor - # if it is not yet available - if (ID not in self.persons): - self.persons[ID] = PersonInfo(ID, name, email) - return ID - def getPersons(self): - return self.persons - - def getPI(self, ID): - return self.persons[ID] - def _cleanName(self, name): # Remove or replace characters in names that are known # to cause parsing problems in later stages @@ -219,27 +195,6 @@ def _query_user_id(self, name, email): return (id) - def getPersonID(self, addr): - """Obtain a unique ID from contributor identity credentials. - - The IDs are managed by a central database accessed via REST. - Managing multiple identities for the same person is also - handled there. Safety against concurrent access is provided by - the database. 
- """ - - (name, email) = self._decompose_addr(addr) - if (name, email) not in self._cache: - self._cache[(name, email)] = self._query_user_id(name, email) - ID = self._cache[(name, email)] - - # Construct a local instance of PersonInfo for the contributor - # if it is not yet available - if ID not in self.persons: - self.persons[ID] = PersonInfo(ID, name, email) - - return ID - def getPersonFromDB(self, person_id): """Query the ID database for a contributor and all corresponding data""" diff --git a/codeface_utils/linktype.py b/codeface_utils/linktype.py index c9b0dfb..617d11f 100644 --- a/codeface_utils/linktype.py +++ b/codeface_utils/linktype.py @@ -18,8 +18,6 @@ # The code in this file originates from: # https://github.com/siemens/codeface/blob/master/codeface/linktype.py -__author__ = 'drag0on' - #enum-like class to distinguish between the various #methods used to link individuals diff --git a/codeface_utils/util.py b/codeface_utils/util.py index 82ad09c..1a1d673 100644 --- a/codeface_utils/util.py +++ b/codeface_utils/util.py @@ -18,184 +18,19 @@ # The code in this file originates from: # https://github.com/siemens/codeface/blob/master/codeface/util.py -''' -Utility functions for running external commands -''' - from __future__ import absolute_import -import logging; log = logging.getLogger(__name__) +import logging import os import os.path import re -import shutil -import signal import sys import traceback import unicodedata -from collections import OrderedDict, namedtuple -from glob import glob -from math import sqrt -from multiprocessing import Process, Queue, JoinableQueue, Lock -from pickle import dumps, PicklingError -from importlib.resources import files -from subprocess import Popen, PIPE -from tempfile import NamedTemporaryFile, mkdtemp -from time import sleep from threading import enumerate as threading_enumerate -from six.moves.queue import Empty -from datetime import timedelta, datetime from ftfy import fix_encoding -from six.moves import map 
-import six -from six.moves import range -from six.moves import zip - -# Represents a job submitted to the batch pool. -BatchJobTuple = namedtuple('BatchJobTuple', ['id', 'func', 'args', 'kwargs', - 'deps', 'startmsg', 'endmsg']) -class BatchJob(BatchJobTuple): - def __init__(self, *args, **kwargs): - super(BatchJob, self).__init__(*args, **kwargs) - self.done = False - self.submitted = False - -class BatchJobPool(object): - ''' - Implementation of a dependency-respecting batch pool - - This system uses a pool of N worker processes to run jobs. Since the - multiprocessing module is used, all functions, args and kwargs must be - pickleable. Specifically, this means that only functions defined at - top-level in a module can be used here. - - Jobs can be created using pool.add(function, args, kwargs, deps=deps)) - where deps can be a list of job handles previously returned by - pool.add. If multiprocessing is disabled, the functions are run - immediately and None is returned. - - Call pool.join() to start execution and wait until all jobs are complete. - If a work item raises an exception, the join() will terminate with - that exception, if pickleable, or a generic Exception if otherwise. - ''' - def __init__(self, n_cores): - self.n_cores = n_cores - self.next_id = 1 - self.jobs = OrderedDict() # Dictionary of jobs (ordered for repeatability) - # Initialize workers and their work and done queues - self.work_queue, self.done_queues, self.workers = Queue(), [], [] - if n_cores > 1: - # When n_cores is 1 we doen't use the process anyway. - # However the pycharm debugger goes crasy when we start the - # process, so as a workaround don't start anything when - # n_core is 1. 
- for i in range(n_cores): - dq = Queue() - w = Process(target=batchjob_worker_function, args=(self.work_queue, dq)) - self.done_queues.append(dq) - self.workers.append(w) - w.start() - - def _is_ready(self, job): - '''Returns true if the job is ready for submission''' - if job.done or job.submitted: - return False - return all(self.jobs[j].done for j in job.deps if j is not None) - - def _submit(self, job): - '''Submit the job if it is ready''' - if self._is_ready(job): - self.work_queue.put(job) - job.submitted = True - - def add(self, func, args, kwargs={}, deps=(), startmsg=None, endmsg=None): - ''' - Add a job that executes func(*args, **kwargs) and depends on the - jobs with the ids listed in deps. - This function returns a job ID which can be used as a dependency - in other calls to add. - If n_cores is 1; this call immediately executes the given function - and returns None - ''' - if self.n_cores == 1: - log.info(startmsg) - func(*args, **kwargs) - log.info(endmsg) - return None - job_id = self.next_id - self.next_id += 1 - j = BatchJob(job_id, func, args, kwargs, deps, startmsg, endmsg) - self.jobs[job_id] = j - return job_id - - def join(self): - ''' - Submit jobs and wait for all jobs to finish. 
- ''' - try: - while not all(j.done for j in self.jobs.values()): - # Put jobs that are ready onto the work queue - for j in self.jobs.values(): - self._submit(j) - # Wait for a result from the done_queues - for dq in self.done_queues: - try: - res = dq.get(block=False) - except Empty: - continue - if res is None: - log.fatal("Uncaught exception in worker thread!") - raise Exception("Failure in Batch Pool") - if isinstance(res, Exception): - log.fatal("Uncaught exception in worker thread:") - raise res - log.debug("Job {} has finished!".format(res)) - self.jobs[res].done = True - # Check if workers died - for w in self.workers: - if not w.is_alive(): - w.join() - raise Exception("A Worker died unexpectedly!") - sleep(0.01) - finally: - # Terminate and join the workers - # Wait 100ms to allow backtraces to be logged - sleep(0.1) - log.info("Terminating workers...") - for w in self.workers: - w.terminate() - log.info("Workers terminated.") - -def batchjob_worker_function(work_queue, done_queue): - ''' - Worker function executed in a separate process. - This function pulls work items off the work queue; terminates if there - is no item for 0.5s; otherwise executes the work item. 
Any exception - is reraised after putting a None onto the done_queue (triggering an - exception in the main process) - ''' - # Silently quit on CTRL+C - signal.signal(signal.SIGINT, handle_sigint_silent) - while True: - try: - job = work_queue.get(block=True) - except ValueError as ve: - # This happens when the main loop stops before we do - return - log.debug("Starting job id {}".format(job.id)) - try: - if job.startmsg: - log.info(job.startmsg) - job.func(*job.args, **job.kwargs) - if job.endmsg: - log.info(job.endmsg) - log.debug("Finished work id {}".format(job.id)) - done_queue.put(job.id) - except Exception as e: - log.debug("Failed work id {}".format(job.id)) - done_queue.put(Exception(e.__class__.__name__ + ": " + - str(e) + "\n" + traceback.format_exc())) +log = logging.getLogger(__name__) # Function to dump the stacks of all threads def get_stack_dump(): @@ -210,228 +45,6 @@ def get_stack_dump(): code.append(" %s" % (line.strip())) return code -# Signal handler that dumps all stacks and terminates -# Lock l dis-interleaves the stack traces of processes -l = Lock() -def handle_sigint(signal, frame): - with l: - log.fatal("CTRL-C pressed!") - for c in get_stack_dump(): - log.info(c) - # This call raises a SystemExit exception in the - # stack frame that was interrupted by the signal - # For the main thread, this is what we want. - sys.exit(-1) - -# Signal handler that dumps all stacks and terminates silently -# Also uses the Lock l to dis-interleave the stack traces -def handle_sigint_silent(signal, frame): - with l: - for c in get_stack_dump(): - log.info(c) - logging.shutdown() - # Since we want to terminate worker threads with prejudice, - # we use os._exit, which directly terminates the process. - # otherwise the worker try/catch will also catch the SystemExit - os._exit(-1) - -def handle_sigterm(signal, frame): - # Since we want to terminate worker threads with prejudice, - # we use os._exit, which directly terminates the process. 
- # otherwise the worker try/catch will also catch the SystemExit - logging.shutdown() - os._exit(-1) - -def handle_sigusr1(signal, frame): - for c in get_stack_dump(): - log.info(c) - -# Dump all the stacks in case of CTRL-C -signal.signal(signal.SIGINT, handle_sigint) -# Also dump on sigterm -signal.signal(signal.SIGTERM, handle_sigterm) -# Also dump on sigusr1, but do not terminate -signal.signal(signal.SIGUSR1, handle_sigusr1) - -def execute_command(cmd, ignore_errors=False, direct_io=False, cwd=None, silent_errors=False): - ''' - Execute the command `cmd` specified as a list of ['program', 'arg', ...] - If ignore_errors is true, a non-zero exit code will be ignored (and a warning - messages will be issued), otherwise an exception is raised. If silent_errors is True, - no messages will be emitted even in case of an error (but exceptions will still be raised). - If direct_io is True, do not capture the stdin and stdout of the command. - Returns the stdout of the command. - ''' - jcmd = " ".join(cmd) - log.debug("Running command: {}".format(jcmd)) - try: - if direct_io: - pipe = Popen(cmd, cwd=cwd) - else: - pipe = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd) - stdout, stderr = pipe.communicate() - except OSError: - log.error("Error executing command {}!".format(jcmd)) - raise - - if pipe.returncode != 0: - if ignore_errors: - if not(silent_errors): - log.warning("Command '{}' failed with exit code {}. Ignored.". - format(jcmd, pipe.returncode)) - else: - if not(direct_io) and not(silent_errors): - log.info("Command '{}' stdout:".format(jcmd)) - for line in stdout.splitlines(): - log.info(line) - log.info("Command '{}' stderr:".format(jcmd)) - for line in stderr.splitlines(): - log.info(line) - msg = "Command '{}' failed with exit code {}. 
\n" \ - "(stdout: {}\nstderr: {})"\ - .format(jcmd, pipe.returncode, stdout, stderr) - if not(silent_errors): - log.error(msg) - raise Exception(msg) - return stdout - -def _convert_dot_file(dotfile): - ''' - Convert duplicate edges in the given dot file into edges with - a larger pen width. - ''' - res = [] - edges = {} - edge_spec = re.compile(r"\s+(\d+) -> (\d+);") - - file = open(dotfile, "r") - lines = [line.strip("\n") for line in file] - # Modify the header (copyright line + digraph) - lines[0] = "digraph {" - lines[1] = "node[fontsize=30, shape=\"box\"];" - - lines[len(lines)-1] = "" # Skip closing brace - - for line in lines: - m = re.match(edge_spec, line) - if m: - a, b = m.group(1), m.group(2) - edges[(a,b)] = edges.get((a,b), 0) + 1 - else: - res.append(line + "\n") - - # sort the edges for reproducibility - for ((a, b), count) in sorted(edges.items()): - res.append("{0} -> {1} [weight={2} penwidth={3}];\n". - format(a,b,count, sqrt(float(count)))) - - res.append("overlap=prism;\n") - res.append("splines=true;\n") - res.append("}\n") - return res - -def layout_graph(filename): - out = NamedTemporaryFile(mode="w", delete=False) - out.writelines(_convert_dot_file(filename)) - out.close() # flushes the cache - cmd = [] - cmd.append("dot") - cmd.append("-Kfdp") - cmd.append("-Tpdf") - cmd.append("-Gcharset=utf-8") - cmd.append("-o{0}.pdf".format(os.path.splitext(filename)[0])) - cmd.append(out.name) - execute_command(cmd, ignore_errors=True) - # Manually remove the temporary file - os.unlink(out.name) - -def generate_report(start_rev, end_rev, resdir): - log.info(" -> Generating report") - report_base = "report-{0}_{1}".format(start_rev, end_rev) - - # Run perl script to generate report LaTeX file - cmd = [] - cmd.append(files(__package__).joinpath("perl/create_report.pl")) - cmd.append(resdir) - cmd.append("{0}--{1}".format(start_rev, end_rev)) - with open(os.path.join(resdir, report_base + ".tex"), 'w') as f: - f.write(execute_command(cmd)) - - # Compile 
report with lualatex - cmd = [] - cmd.append("lualatex") - cmd.append("-interaction=nonstopmode") - cmd.append(os.path.join(resdir, report_base + ".tex")) - - # We run latex in a temporary directory so that it's easy to - # get rid of the log files etc. created during the run that are - # not relevant for the final result - orig_wd = os.getcwd() - tmpdir = mkdtemp() - - os.chdir(tmpdir) - execute_command(cmd, ignore_errors=True) - try: - shutil.copy(report_base + ".pdf", resdir) - except IOError: - log.warning("Could not copy report PDF (missing input data?)") - - os.chdir(orig_wd) - shutil.rmtree(tmpdir) - -def generate_reports(start_rev, end_rev, range_resdir): - files = glob(os.path.join(range_resdir, "*.dot")) - log.info(" -> Generating Reports...") - for file in files: - layout_graph(file) - generate_report(start_rev, end_rev, range_resdir) - -def check4ctags(): - # check if the appropriate ctags is installed on the system - prog_name = 'Universal Ctags' - prog_version = 'Universal Ctags 5.9.0, Copyright (C) 2015 Universal Ctags Team' - cmd = "ctags-universal --version".split() - - res = execute_command(cmd, None) - - if not(res.startswith(prog_name)): - log.error("program '{0}' does not exist".format(prog_name)) - raise Exception("ctags-universal not found") - - if not(res.startswith(prog_version)): - # TODO: change this to use standard mechanism for error logging - log.error("Ctags version '{0}' not found".format(prog_version)) - raise Exception("Incompatible ctags-universal version") - - -def check4cppstats(): - """ - check if the appropriate cppstats is installed on the system. - """ - # We can not check the version directly as there is no version switch - # on cppstats We just check if the first line is OK. - line = "cppstats v0.9." 
- cmd = "/usr/bin/env cppstats --version".split() - res = execute_command(cmd) - if not (res.startswith(line)): - error_message = "expected the first line to start with '{0}' but "\ - "got '{1}'".format(line, res[0]) - log.error("program cppstats does not exist, or it is not working " - "as expected ({0}" - .format(error_message)) - raise Exception("no working cppstats found ({0})" - .format(error_message)) - - -def gen_prefix(i, num_ranges, start_rev, end_rev): - if (len(start_rev) == 40): - # When revisions are given by commit hashes, shorten them since - # they don't carry any meaning - start_rev = start_rev[0:6] - end_rev = end_rev[0:6] - return(" -> Revision range {0}/{1} ({2}..{3}): ".format(i, num_ranges, - start_rev, end_rev)) - def gen_range_path(base_path, i, start_rev, end_rev): if (len(start_rev) == 40): # Same logic as above, but construct a file system path @@ -440,127 +53,6 @@ def gen_range_path(base_path, i, start_rev, end_rev): return(os.path.join(base_path, "{0}--{1}-{2}". format(str(i).zfill(3), start_rev, end_rev))) - -def parse_iso_git_date(date_string): - # from http://stackoverflow.com/questions/526406/python-time-to-age-part-2-timezones - try: - offset = int(date_string[-5:]) - except: - log.error("could not extract timezone info from \"{0}\"" - .format(date_string)) - raise - minutes = (offset if offset > 0 else -offset) % 100 - delta = timedelta(hours=offset / 100, - minutes=minutes if offset > 0 else -minutes) - # In future python versions we can use "%Y-%m-%d %H:%M:%S %z" - # this way we don't need the above workaround, currently %z isn't - # working as documented - fmt = "%Y-%m-%d %H:%M:%S" - parsed_date = datetime.strptime(date_string[:-6], fmt) - parsed_date -= delta - return parsed_date - -# Determine settings for the size and amount of analysis windows. 
If nothing -# specific is provided, use default settings -def get_analysis_windows(conf): - window_size_months = 3 - num_window = -1 - - if "windowSize" in list(conf.keys()): - window_size_months = conf["windowSize"] - if "numWindows" in list(conf.keys()): - num_window = conf["numWindows"] - - return window_size_months, num_window - -def generate_analysis_windows(repo, window_size_months): - """ - Generates a list of revisions (commit hash) in increments of the window_size - parameter. The window_size parameter specifies the number of months between - revisions. This function is useful when the git repository has no tags - referencing releases. - """ - cmd_date = 'git --git-dir={0} show --format=%ad --date=iso8601'\ - .format(repo).split() - latest_date_result = execute_command(cmd_date).splitlines()[0] - latest_commit = parse_iso_git_date(latest_date_result) - - cmd_root_commit_dates = 'git --git-dir={0} log --max-parents=0 --format=%ad --date=iso8601'\ - .format(repo).split() - root_commit_dates_result = execute_command(cmd_root_commit_dates).splitlines() - earliest_root_commit_date = min([parse_iso_git_date(root_commit) for root_commit in root_commit_dates_result]) - - print_fmt = "%Y-%m-%dT%H:%M:%S+0000" - month = timedelta(days=30) - - def get_before_arg(num_months): - date = latest_commit - num_months * month - - # Due to a bug in git, broken author information in commit objects can lead to a timestamp of 0 when using the - # --before option although the dates themselves are not broken and can be parsed without problems. - # For more details, see the whole thread conversation on the git mailing list here: - # https://lore.kernel.org/git/7728e059-d58d-cce7-c011-fbc16eb22fb9@cs.uni-saarland.de/ - # To avoid running into an infinite while loop below (due to timestamps being 0), check if the date is earlier - # than the date of the earliest root commit and break if this is the case. 
- if date < earliest_root_commit_date: - raise ValueError("The before-arg date is earlier than the earliest commit in the repository.") - - return '--before=' + date.strftime(print_fmt) - - revs = [] - start = window_size_months # Window size time ago - end = 0 # Present time - cmd_base = 'git --git-dir={0} log --no-merges --format=%H,%ct,%ci'\ - .format(repo).split() - cmd_base_max1 = cmd_base + ['--max-count=1'] - cmd = cmd_base_max1 + [get_before_arg(end)] - rev_end = execute_command(cmd).splitlines() - revs.extend(rev_end) - - while start != end: - - try: - cmd = cmd_base_max1 + [get_before_arg(start)] - rev_start = execute_command(cmd).splitlines() - except ValueError as ve: - rev_start = [] - log.info("rev_start would be earlier than earliest root commit. Start at initial commit instead...") - - if len(rev_start) == 0: - start = end - #cmd = cmd_base + ['--reverse'] - #rev_start = [execute_command(cmd).splitlines()[0]] - cmd = cmd_base + ['--max-parents=0'] - rev_start = [execute_command(cmd).splitlines()[-1]] - else: - end = start - start = end + window_size_months - - # Check if any commits occurred since the last analysis window - if rev_start[0] != revs[0]: - revs = rev_start + revs - # else: no commit happened since last window, don't add duplicate - # revisions - # End while - - # Check that commit dates are monotonic, in some cases the earliest - # first commit does not carry the earliest commit date - revs = [rev.split(",") for rev in revs] - rev_len = len(revs) - if int(revs[0][1]) > int(revs[1][1]): - del revs[0] - - # Extract hash values and dates intro seperate lists - revs_hash = [rev[0] for rev in revs] - revs_date = [rev[2].split(" ")[0] for rev in revs] - - # We cannot detect release canndidate tags in this analysis mode, - # so provide a list with None entries - rcs = [None for x in range(len(revs))] - - return revs_hash, rcs, revs_date - - def encode_as_utf8(string): """ Encode the given string properly in UTF-8, @@ -573,27 +65,28 @@ def 
encode_as_utf8(string): :return: the UTF-8 encoded string of type str """ - try: - string = string.decode("utf-8") - except: - # if we have a string, we transform it to unicode - if isinstance(string, str): - string = six.text_type(string, "unicode-escape", errors="replace") - - ## maybe not a string/unicode at all, return rightaway - if not isinstance(string, six.text_type): + # Normalize to str first + if isinstance(string, bytes): + try: + text = string.decode("utf-8") + except UnicodeDecodeError: + text = string.decode("utf-8", errors="replace") + elif isinstance(string, str): + text = string + else: + # not string-like, return as-is return string # convert to real unicode-utf8 encoded string, fix_text ensures proper encoding - new_string = fix_encoding(string) + new_string = fix_encoding(text) # remove unicode characters from "Specials" block # see: https://www.compart.com/en/unicode/block/U+FFF0 - new_string = re.sub(r"\\ufff.", " ", new_string.encode("unicode-escape")) + new_string = re.sub(r"\ufff.", " ", new_string) # remove all kinds of control characters and emojis # see: https://www.fileformat.info/info/unicode/category/index.htm - new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string.decode("unicode-escape")) + new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string) new_string = new_string.encode("utf-8") @@ -610,28 +103,3 @@ def encode_as_utf8(string): return str(new_string) - -def encode_items_as_utf8(items): - """ - Encode the given list/tuple/dict of strings properly in UTF-8, - independent from its internal representation (str or unicode). - - This function uses encode_as_utf8(string) internally. 
- - :param string: any string - :return: the UTF-8 encoded string of type str - """ - - # unpack values if we have a dictionary - items_unpacked = items - if isinstance(items, dict): - items_unpacked = list(items.values()) - - # encode each item as UTF-8 properly - items_enc = list(map(encode_as_utf8, items_unpacked)) - - # add key for dict again - if isinstance(items, dict): - items_enc = dict(zip(list(items.keys()), items_enc)) - - return items_enc diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index f950694..ca453be 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -24,19 +24,6 @@ import csv -def __encode(line): - """Encode the given line (a tuple of columns) properly in UTF-8.""" - - lineres = () # re-encode column if it is unicode - for column in line: - if type(column) is str: - lineres += (column.encode("utf-8"),) - else: - lineres += (column,) - - return lineres - - def write_to_csv(file_path, lines, append=False): """ Write the given lines to the file with the given file path. 
From f6bc6a2c02159480b2a492a8c54e8e667eda2882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 18 Sep 2025 18:50:21 +0200 Subject: [PATCH 10/13] Ensure string represenation of all user data in 'get_id_and_update_user' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- issue_processing/issue_processing.py | 7 ++++--- issue_processing/jira_issue_processing.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 3b53cab..1b5a823 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -688,9 +688,10 @@ def get_user_string(name, email): def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - # fix encoding for name and e-mail address - name = user["name"] if "name" in user else str(user["username"]) - mail = user["email"] # empty + # ensure string representation for name and e-mail address + username = str(user["username"]) + name = str(user["name"]) if "name" in user else username + mail = str(user["email"]) # construct string for ID service and send query user_string = get_user_string(name, mail) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index bfc2214..97f1711 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -608,9 +608,9 @@ def get_user_string(name, email): def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): - # fix encoding for name and e-mail address - name = user["name"] if "name" in user else str(user["username"]) - mail = user["email"] + # ensure string representation for name and e-mail address + name = str(user["name"]) if "name" in user else str(user["username"]) + mail = str(user["email"]) # may be empty # construct string for ID service and 
send query user_string = get_user_string(name, mail) From 41822da520c61b4a0775d1427dbaa04c368bcd9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Tue, 30 Sep 2025 13:46:56 +0200 Subject: [PATCH 11/13] Use the same return format in all implementations of 'getPersonFromDB' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- codeface_utils/cluster/idManager.py | 15 +++++++-------- issue_processing/issue_processing.py | 9 +++++---- issue_processing/jira_issue_processing.py | 9 +++++---- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 5e64c90..43a4be5 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -296,11 +296,10 @@ def _query_user_id(self, name, email): raise Exception("Constructed author list is in invalid format. Duplicate entries found") def getPersonFromDB(self, person_id): - """Get a PersonInfo instance from the database by ID.""" - if person_id not in self.persons: - rows = self.df[self.df['ID'] == person_id] - if len(rows) == 1: - name = rows['name'].values[0] - email = rows['email'].values[0] - self.persons[person_id] = PersonInfo(person_id, name, email) - return self.persons.get(person_id, None) + rows = self.df[self.df['ID'] == person_id] + if len(rows) == 1: + return { + 'name': rows['name'].values[0], + 'email1': rows['email'].values[0], + 'id': person_id + } diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1b5a823..013fd8c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -727,10 +727,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # get person information from ID service log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) - user = dict() - user["email"] = 
person.getEmail() # column "email1" - user["name"] = person.getName() # column "name" - user["id"] = person.getID() # column "id" + user = { + "name": person["name"], + "email": person["email1"], + "id": person["id"] + } # add user information to buffer buffer_db[idx] = user diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 97f1711..4974707 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -640,10 +640,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # get person information from ID service log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) - user = dict() - user["email"] = person["email1"] # column "email1" - user["name"] = person["name"] # column "name" - user["id"] = person["id"] # column "id" + user = { + "name": person["name"], + "email": person["email1"], + "id": person["id"] + } # add user information to buffer buffer_db[idx] = user From 9e33e834d63ebe729b1d2e0ce252d5ac5219281d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 14:46:36 +0100 Subject: [PATCH 12/13] Move changes from PR53 to python 3 Moved all prior changes. Needs testing. Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 164 +++++++++++++++++++++++---- 1 file changed, 145 insertions(+), 19 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 013fd8c..1e65d09 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -19,6 +19,7 @@ # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock # Copyright 2025 by Maximilian Löffler +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. 
@@ -42,7 +43,10 @@ log = getLogger(__name__) # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} + +# Copilot username to be assigned in specific copilot events +copilot_username = "Copilot" # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -75,13 +79,14 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -239,7 +244,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] @@ -264,6 +272,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet if issue["closed_at"] is None: issue["closed_at"] = "" @@ -280,10 +292,10 @@ def 
reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. @@ -294,6 +306,7 @@ def merge_issue_events(issue_data): log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -362,6 +375,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -477,6 +491,12 @@ def merge_issue_events(issue_data): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_username + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: @@ -489,6 +509,32 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in list(connected_events.keys()) and 
connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in list(connected_events.keys()) \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in list(connected_events.keys()) \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + # if there is no connected event yet at this timestamp, create a new entry for this event + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = event["user"] + connected_events[event["created_at"]] = connected_info + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -500,16 +546,53 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the 
external_connected_event dict later + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) + # updates all the issues by the temporarily stored referenced_by events for _, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data - - -def reformat_events(issue_data): + return filtered_connected_events + +def filter_connected_events(key, value, external_connected_events): + num_issues = len(value["issues"]) + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurances.values(): + for sub_key, sub_value in occurances.items(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == (num_issues + 1)/2: + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # 
no other variants can be easily matched + return False + +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. @@ -539,6 +622,35 @@ def reformat_events(issue_data): if event["ref_target"] is not None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + # and we only have 2 issues in the list, connect to the other issue + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + # and we have more than two issues, count each issue's occurences + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurances, key = 
occurances.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -556,13 +668,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": @@ -570,7 +685,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList @@ -635,7 +750,11 @@ def reformat_events(issue_data): # "state_new" and "resolution" of the issue give the information about the state and the resolution of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + # if event is a review comment, it can contain suggestions + if "contains_suggestion" in event: + event["event_info_2"] = event["contains_suggestion"] + else: + event["event_info_2"] = False elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits @@ -649,7 +768,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -748,6 +867,9 @@ def 
get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -761,6 +883,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) @@ -781,7 +907,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): @@ -808,7 +934,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps([issue["subIssues"]]), # components event["event"], event["user"]["name"], event["user"]["email"], From c4d82bcf86f29125449a872ee73f74f106d97c33 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:37:12 +0100 Subject: [PATCH 13/13] Add changes from PR53 to author postprocessing All older changes transcribed, needs etsting Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 53caeb2..cee17c5 100644 --- 
a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -15,6 +15,7 @@ # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock # Copyright 2025 by Maximilian Löffler +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -51,6 +52,16 @@ log = getLogger(__name__) +## +# GLOBAL VARIABLES +## + +# global variable containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + + ## # RUN POSTPROCESSING ## @@ -79,7 +90,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -90,7 +101,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. + by this author. This method also unifies all known copilot users into a single user if desired. 
:param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -98,6 +109,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ github_user = "GitHub" github_email = "noreply@github.com" @@ -179,7 +191,7 @@ def is_github_noreply_author(name, email): commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] for event in issue_data: @@ -187,12 +199,16 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. As we