From fd0a8edea98dbe322df3e9f0f64eb935ee51b416 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Aug 2025 17:50:21 +0200 Subject: [PATCH 01/13] Automatically convert to python3 (using 2to3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 4 +- .../author_postprocessing.py | 2 +- bot_processing/bot_processing.py | 8 ++-- codeface_extraction/codeface_extraction.py | 4 +- codeface_extraction/extractions.py | 8 ++-- csv_writer/csv_writer.py | 2 +- issue_processing/issue_processing.py | 18 ++++---- issue_processing/jira_issue_processing.py | 45 ++++++++++--------- mbox_parsing/mbox_parsing.py | 6 +-- 9 files changed, 49 insertions(+), 48 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index a2e49f0..b11ef52 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -343,7 +343,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F gender_data_new = [] for author in gender_data: - if author[0] in author_to_anonymized_author_gender.keys(): + if author[0] in list(author_to_anonymized_author_gender.keys()): new_person = author_to_anonymized_author_gender[author[0]] author[0] = new_person[0] gender_data_new.append(author) @@ -395,7 +395,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) # load configuration __conf = Configuration.load(__codeface_conf, __project_conf) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 13b1e38..c712ac6 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -469,7 +469,7 @@ 
def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) __backup_data = args.backup # load configuration diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..43ff492 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -19,10 +19,10 @@ """ import argparse -import httplib +import http.client import os import sys -import urllib +import urllib.request, urllib.parse, urllib.error import operator from codeface.cli import log @@ -39,7 +39,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -192,7 +192,7 @@ def add_user_data(bot_data, user_data, known_bots_file): continue # get user information if available - if user[0] in user_buffer.keys(): + if user[0] in list(user_buffer.keys()): bot_reduced["user"] = user_buffer[user[0]] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 7cf24ea..3478b1f 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -28,7 +28,7 @@ from codeface.configuration import Configuration from codeface.dbmanager import DBManager -import extractions +from . 
import extractions from csv_writer import csv_writer @@ -119,7 +119,7 @@ def run(): # process arguments # - First make all the args absolute __resdir = abspath(args.resdir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) __extract_commit_messages = args.commit_messages __extract_impl = args.implementation __extract_on_range_level = args.range diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index 081a353..081d1be 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -723,7 +723,7 @@ def _reduce_result(self, result): def fix_characters_in_string(text): """ - Removes control characters such as \r\n \x1b \ufffd from string impl and returns a unicode + Removes control characters such as \r\n \x1b \\ufffd from string impl and returns a unicode string where all control characters have been replaced by a space. 
:param text: expects a unicode string :return: unicode string @@ -742,7 +742,7 @@ def fix_characters_in_string(text): # remove all kinds of control characters and emojis # see: https://www.fileformat.info/info/unicode/category/index.htm - new_text = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape")) + new_text = "".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape")) return new_text @@ -765,10 +765,10 @@ def fix_name_encoding(name): try: # Apply correct encoding and return unicode string - return unicode(make_header(info)) + return str(make_header(info)) except UnicodeDecodeError: # Undo utf-8 encoding and return unicode string - return unicode(name.decode('utf-8')) + return str(name.decode('utf-8')) except LookupError: # Encoding not found, return string as is return name diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index 2804081..b41463a 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -28,7 +28,7 @@ def __encode(line): lineres = () # re-encode column if it is unicode for column in line: - if type(column) is unicode: + if type(column) is str: lineres += (column.encode("utf-8"),) else: lineres += (column,) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..ff80d53 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -24,11 +24,11 @@ """ import argparse -import httplib +import http.client import json import os import sys -import urllib +import urllib.request, urllib.parse, urllib.error from datetime import datetime, timedelta import operator @@ -61,7 +61,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf 
= Configuration.load(__codeface_conf, __project_conf) @@ -210,7 +210,7 @@ def update_user_dict(user_dict, user): if user is None: user = create_deleted_user() - if not user["username"] in user_dict.keys(): + if not user["username"] in list(user_dict.keys()): if not user["username"] is None and not user["username"] == "": user_dict[user["username"]] = user else: @@ -340,7 +340,7 @@ def merge_issue_events(issue_data): # as we cannot update the referenced issue during iterating over all issues, we need to save the # referenced_by event for the referenced issue temporarily - if rel_issue["number"] in issue_data_to_update.keys(): + if rel_issue["number"] in list(issue_data_to_update.keys()): issue_data_to_update[rel_issue["number"]]["eventsList"].append(referenced_issue_event) else: ref = dict() @@ -500,7 +500,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # updates all the issues by the temporarily stored referenced_by events - for key, value in issue_data_to_update.iteritems(): + for key, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] @@ -683,14 +683,14 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - username = unicode(user["username"]).encode("utf-8") + username = str(user["username"]).encode("utf-8") # fix encoding for name and e-mail address if user["name"] is not None: - name = unicode(user["name"]).encode("utf-8") + name = str(user["name"]).encode("utf-8") else: name = username - mail = unicode(user["email"]).encode("utf-8") + mail = str(user["email"]).encode("utf-8") # construct string for ID service and send query user_string = get_user_string(name, mail) diff --git a/issue_processing/jira_issue_processing.py 
b/issue_processing/jira_issue_processing.py index d9748ae..fa3e826 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -44,8 +44,9 @@ from jira import JIRA from jira.exceptions import JIRAError from time import sleep +import importlib -reload(sys) +importlib.reload(sys) sys.setdefaultencoding("utf-8") # global counter for JIRA requests to make sure to not exceed the request limit @@ -65,7 +66,7 @@ def run(): # parse arguments args = parser.parse_args(sys.argv[1:]) - __codeface_conf, __project_conf = map(os.path.abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(os.path.abspath, (args.config, args.project))) # create configuration __conf = Configuration.load(__codeface_conf, __project_conf) @@ -114,9 +115,9 @@ def run(): processed_issues.extend(issues) # 4) insert referenced_by events into issue histories - for issue_id in referenced_bys.keys(): + for issue_id in list(referenced_bys.keys()): # obtain list of issues which have the current issue id - referenced_issue = list(filter(lambda issue: issue["externalId"] == issue_id, processed_issues)) + referenced_issue = list([issue for issue in processed_issues if issue["externalId"] == issue_id]) if len(referenced_issue) > 0: if len(referenced_issue) > 1: log.warning("Ambiguous issue id " + issue_id + " found in the issue list.") @@ -235,21 +236,21 @@ def merge_user_with_user_from_csv(user, persons): """ new_user = dict() - name_utf8 = unicode(user["name"]).encode("utf-8") - username_utf8 = unicode(user["username"].lower()).encode("utf-8") + name_utf8 = str(user["name"]).encode("utf-8") + username_utf8 = str(user["username"].lower()).encode("utf-8") - if username_utf8 in persons["by_username"].keys(): + if username_utf8 in list(persons["by_username"].keys()): new_user["username"] = username_utf8 - new_user["name"] = unicode(persons["by_username"].get(username_utf8)[0]).encode("utf-8") - new_user["email"] = 
unicode(persons["by_username"].get(username_utf8)[1]).encode("utf-8") - elif name_utf8 in persons["by_name"].keys(): + new_user["name"] = str(persons["by_username"].get(username_utf8)[0]).encode("utf-8") + new_user["email"] = str(persons["by_username"].get(username_utf8)[1]).encode("utf-8") + elif name_utf8 in list(persons["by_name"].keys()): new_user["username"] = username_utf8 - new_user["name"] = unicode(persons["by_name"].get(name_utf8)[0]).encode("utf-8") - new_user["email"] = unicode(persons["by_name"].get(name_utf8)[1]).encode("utf-8") + new_user["name"] = str(persons["by_name"].get(name_utf8)[0]).encode("utf-8") + new_user["email"] = str(persons["by_name"].get(name_utf8)[1]).encode("utf-8") else: new_user["username"] = username_utf8 new_user["name"] = name_utf8 - new_user["email"] = unicode(user["email"]).encode("utf-8") + new_user["email"] = str(user["email"]).encode("utf-8") log.warning("User not in csv-file: " + str(user)) log.info("current User: " + str(user) + ", new user: " + str(new_user)) @@ -605,10 +606,10 @@ def get_user_string(name, email): def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): # fix encoding for name and e-mail address if user["name"] is not None and user["name"] != "": - name = unicode(user["name"]).encode("utf-8") + name = str(user["name"]).encode("utf-8") else: - name = unicode(user["username"]).encode("utf-8") - mail = unicode(user["email"]).encode("utf-8") # empty + name = str(user["username"]).encode("utf-8") + mail = str(user["email"]).encode("utf-8") # empty # construct string for ID service and send query user_string = get_user_string(name, mail) @@ -1000,8 +1001,8 @@ def find_first_existing(source_folder, filenames): :return: the first existing file name, None otherwise """ - filenames = map(lambda fi: os.path.join(source_folder, fi), filenames) - existing = map(lambda fi: os.path.exists(fi), filenames) + filenames = [os.path.join(source_folder, fi) for fi in filenames] + existing = [os.path.exists(fi) for fi 
in filenames] first = next((i for (i, x) in enumerate(existing) if x), None) if first is not None: @@ -1026,11 +1027,11 @@ def find_first_existing(source_folder, filenames): persons_by_username = {} persons_by_name = {} for row in person_data: - if not row["AuthorID"] in persons_by_username.keys(): - author_id_utf8 = unicode(row["AuthorID"]).encode("utf-8") + if not row["AuthorID"] in list(persons_by_username.keys()): + author_id_utf8 = str(row["AuthorID"]).encode("utf-8") persons_by_username[author_id_utf8] = (row["AuthorName"], row["userEmail"]) - if not row["AuthorName"] in persons_by_name.keys(): - author_name_utf8 = unicode(row["AuthorName"]).encode("utf-8") + if not row["AuthorName"] in list(persons_by_name.keys()): + author_name_utf8 = str(row["AuthorName"]).encode("utf-8") persons_by_name[author_name_utf8] = (row["AuthorName"], row["userEmail"]) persons = dict() diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index fd9fd59..92d8cb9 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -69,7 +69,7 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): writer = ix.writer() # add all messages to index for message in mbox: - writer.add_document(messageID=unicode(message['message-id']), content=__mbox_getbody(message)) + writer.add_document(messageID=str(message['message-id']), content=__mbox_getbody(message)) writer.commit() log.devinfo("Index created, parsing will begin now.") else: @@ -136,7 +136,7 @@ def __mbox_getbody(message): "An image or some other content has been found that cannot be indexed. 
Message is given an empty body.") body = ' ' - return unicode(body, errors="replace") + return str(body, errors="replace") def __parse_execute(artifact, schema, my_index, include_filepath): @@ -247,7 +247,7 @@ def run(): args = parser.parse_args(sys.argv[1:]) __resdir = abspath(args.resdir) __maildir = abspath(args.maildir) - __codeface_conf, __project_conf = map(abspath, (args.config, args.project)) + __codeface_conf, __project_conf = list(map(abspath, (args.config, args.project))) # initialize configuration __conf = Configuration.load(__codeface_conf, __project_conf) From d64832e943ead1480d39469deac1c06f1538bd1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Aug 2025 18:45:15 +0200 Subject: [PATCH 02/13] Import codeface files required for extraction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- codeface/__init__.py | 1 + codeface/configuration.py | 198 ++++++++++++++ codeface/dbmanager.py | 431 +++++++++++++++++++++++++++++++ codeface/linktype.py | 41 +++ codeface/util.py | 527 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 1198 insertions(+) create mode 100644 codeface/__init__.py create mode 100644 codeface/configuration.py create mode 100644 codeface/dbmanager.py create mode 100644 codeface/linktype.py create mode 100644 codeface/util.py diff --git a/codeface/__init__.py b/codeface/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/codeface/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/codeface/configuration.py b/codeface/configuration.py new file mode 100644 index 0000000..e1fa874 --- /dev/null +++ b/codeface/configuration.py @@ -0,0 +1,198 @@ +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. 
+# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Johannes Ebke +# All Rights Reserved. +''' +Configuration module for codeface + +Encapsulates a configuration as an immutable dict +''' + +import yaml +from collections.abc import Mapping +from logging import getLogger + +from tempfile import NamedTemporaryFile +from codeface.linktype import LinkType + +# create logger +log = getLogger(__name__) + +class ConfigurationError(Exception): + '''Raised if any part of the configuration is malformed''' + pass + +class Configuration(Mapping): + ''' + Encapsulates the codeface configuration + ''' + + GLOBAL_KEYS = ('dbname', 'dbhost', 'dbuser', 'dbpwd', + 'idServiceHostname', 'idServicePort') + GLOBAL_OPTIONAL_KEYS = ('dbport',) + PROJECT_KEYS = ('project', 'repo', 'tagging', 'revisions', 'rcs') + OPTIONAL_KEYS = ('description', 'ml', 'mailinglists', 'sleepTime', + 'proxyHost', 'proxyPort', 'bugsProjectName', + 'productAsProject', 'issueTrackerType', + 'issueTrackerURL', 'issueTrackerProject', + 'issueTrackerUser', 'issueTrackerPassword', + 'understand', 'sloccount', 'windowSize', 'numWindows', + 'qualityType', 'communicationType', 'artifactType', 'dependencyType') + ALL_KEYS = set(GLOBAL_KEYS + GLOBAL_OPTIONAL_KEYS + PROJECT_KEYS + + OPTIONAL_KEYS) + + def __init__(self): + ''' + Initialize an empty configuration object with the default values + ''' + self._conf = { + 'idServiceHostname' : '127.0.0.1', + 'idServicePort' : 8080 + } + + self._conf_file_loc = None + + @classmethod + def load(cls, global_conffile, local_conffile=None): + ''' + 
Load configuration from global/local files + ''' + c = Configuration() + log.info("Loading global configuration file '{}'". + format(global_conffile)) + cls._global_conf = c._load(global_conffile) + c._conf.update(c._global_conf) + if local_conffile: + log.info("Loading project configuration file '{}'". + format(local_conffile)) + cls._project_conf = c._load(local_conffile) + c._conf.update(c._project_conf) + else: + log.info("Not loading project configuration file!") + c._initialize() + c._check_sanity() + return c + + def _load(self, filename): + '''Helper function that checks loading errors and logs them''' + try: + return yaml.load(open(filename), Loader=yaml.SafeLoader) + except IOError: + log.exception("Could not open configuration file '{}'". + format(filename)) + raise + except yaml.YAMLError: + log.exception("Could not parse configuration file '{}'". + format(filename)) + raise + + def _initialize(self): + '''Infer missing values in the configuration''' + if "rcs" not in self: + self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] + + if "mailinglists" not in self: + self._conf["mailinglists"] = [] + if "ml" in self: + self._conf["mailinglists"].append({"name": self["ml"]}) + for ml in self._conf["mailinglists"]: + ml.setdefault("type", "dev") + ml.setdefault("source", "gmane") + + if "dbport" not in self: + self._conf["dbport"] = 3306 + else: + self._conf["dbport"] = int(self._conf["dbport"]) + + def _check_sanity(self): + ''' + Check that the configuration makes sense. + :raise ConfigurationError + ''' + + # Some elementary sanity checks + for key in self.GLOBAL_KEYS: + if self._project_conf and key in self._project_conf: + log.critical("The key '{}' may not be overridden in the " + "project configuration file".format(key)) + raise ConfigurationError('Invalid configuration key.') + + for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: + if key not in self: + log.critical("Required key '{}' missing in configuration!" 
+ ''.format(key)) + raise ConfigurationError('Missing configuration key.') + + if self['tagging'] not in LinkType.get_all_link_types(): + log.critical('Unsupported tagging mechanism specified!') + raise ConfigurationError('Unsupported tagging mechanism.') + + if len(self["revisions"]) < 2: + log.info("No revision range specified in configuration, using auto-generated windows") + + if len(self["revisions"]) != len(self["rcs"]): + log.critical("Malformed configuration: revision and rcs list " + "lengths differ! Found {0} revisions and {1} release " + "candidates.".format(len(self["revisions"]), len(self["rcs"]))) + raise ConfigurationError('Malformed configuration.') + + unknown_keys = [k for k in self if k not in self.ALL_KEYS] + for key in unknown_keys: + log.warning("Unknown key '{}' in configuration.".format(key)) + + def write(self): + conf_file = NamedTemporaryFile(mode='w', prefix=self._conf['project'], + delete=False) + yaml.dump(self._conf, conf_file) + self._conf_file_loc = conf_file.name + conf_file.close() + + def get_conf_file_loc(self): + return self._conf_file_loc + + # Function for the Configuration object to function as a dict + def __getitem__(self, key): + return self._conf[key] + + def __setitem__(self, key, value): + self._conf[key] = value + + def __len__(self): + return len(self._conf) + + def __iter__(self): + return iter(self._conf) + + def __keys__(self): + return list(self._conf.keys()) + + def __str__(self): + ''' + Return a pretty string for display and logging + ''' + r = [] + r.append("--- # global codeface configuration") + for key in self.GLOBAL_KEYS: + if key in self: + r.append("{}: {}".format(key, repr(self[key]))) + r.append("# codeface project configuration") + for key in self.PROJECT_KEYS + self.OPTIONAL_KEYS: + if key in self: + r.append("{}: {}".format(key, repr(self[key]))) + unknown = [k for k in self if k not in self.ALL_KEYS] + if unknown: + r.append("# Unknown keys") + for key in unknown: + r.append("{}: {}".format(key, 
repr(self[key]))) + return "\n".join(r) diff --git a/codeface/dbmanager.py b/codeface/dbmanager.py new file mode 100644 index 0000000..df917ca --- /dev/null +++ b/codeface/dbmanager.py @@ -0,0 +1,431 @@ +#! /usr/bin/env python +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# All Rights Reserved. + +# Thin sql database wrapper + +import MySQLdb as mdb +from datetime import datetime, timezone +from logging import getLogger +from contextlib import contextmanager + +# create logger +log = getLogger(__name__) + +@contextmanager +def _log_db_error(action, args=None): + try: + yield + except mdb.Error as e: + if args: + try: + action = action % args + except: + pass + log.critical('MySQL error {e[0]} during "{action}": {e[1]}' + ''.format(e=e.args, action=action)) + raise + + +class DBManager: + """This class provides an interface to the codeface sql database.""" + + def __init__(self, conf): + try: + self.con = None + self.con = mdb.Connection(host=conf["dbhost"], + port=conf["dbport"], + user=conf["dbuser"], + passwd=conf["dbpwd"], + db=conf["dbname"]) + log.debug( + "Establishing MySQL connection to " + "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, DB '{c[dbname]}'" + .format(c=conf)) + except mdb.Error as e: + log.critical( + "Failed to establish MySQL connection to " + "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, 
DB '{c[dbname]}'" + ": {e[1]} ({e[0]})" + "".format(c=conf, e=e.args)) + raise + self.cur = self.con.cursor() + + max_packet_size = 1024 * 1024 * 256 + self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) + + def __del__(self): + if self.con is not None: + self.con.close() + + def doExec(self, stmt, args=None): + with _log_db_error(stmt, args): + retryCount = 0 + while retryCount < 10: + try: + if isinstance(args, list): + res = self.cur.executemany(stmt, args) + else: + res = self.cur.execute(stmt, args) + return res + except mdb.OperationalError as dbe: + retryCount += 1 + log.info("DBE args: " + str(dbe.args)) + if dbe.args[0] == 1213: # Deadlock! retry... + log.warning("Recoverable deadlock in MySQL - retrying " \ + "(attempt {}).".format(retryCount)) + elif dbe.args[0] == 2006: # Server gone away... + log.warning("MySQL Server gone away, trying to reconnect " \ + "(attempt {}).".format(retryCount)) + self.con.ping(True) + elif dbe.args[0] == 2013: # Lost connection to MySQL server during query... + log.warning("Lost connection to MySQL server during query, " \ + "trying to reconnect (attempt {}).".format(retryCount)) + self.con.ping(True) + else: + raise + + # Give up after ten retry attempts and propagate the + # problem to the caller. Callers can either fix the problem with + # a different query, or the analysis fails + log.error("DB access failed after ten attempts, giving up") + raise + + def doFetchAll(self): + with _log_db_error("fetchall"): + return self.cur.fetchall() + + def doCommit(self): + with _log_db_error("commit"): + return self.con.commit() + + def doExecCommit(self, stmt, args=None): + self.doExec(stmt, args) + self.doCommit() + + # NOTE: We don't provide any synchronisation since by assumption, + # a single project is never analysed from two threads. + def getProjectID(self, name, analysisMethod): + """ + Return the project ID of the given name/analysisMethod combination. 
+ If the project does not exist yet in the database, it is created. + """ + self.doExec("SELECT id FROM project WHERE name=%s " + "AND analysisMethod=%s", (name, analysisMethod)) + if self.cur.rowcount == 0: + # Project is not contained in the database + log.info("Creating new project {}/{}". + format(name, analysisMethod)) + self.doExecCommit("INSERT INTO project (name, analysisMethod) " + + "VALUES (%s, %s);", (name, analysisMethod)) + self.doExec("SELECT id FROM project WHERE name=%s;", (name,)) + elif self.cur.rowcount > 1: + raise Exception("Duplicate projects {}/{} in database!". + format(name, analysisMethod)) + pid = self.doFetchAll()[0][0] + log.info("Using project {}/{} with ID {}". + format(name, analysisMethod, pid)) + return pid + + def get_project(self, pid): + self.doExec("SELECT name, analysisMethod FROM project" + " WHERE id=%s", pid) + if self.cur.rowcount == 0: + raise Exception("Project id {} not found!".format(pid)) + return self.doFetchAll()[0] + + def get_edgelist(self, cid): + self.doExec("SELECT fromId, toId, weight FROM edgelist \ + WHERE clusterId={}".format(cid)) + if self.cur.rowcount == 0: + raise Exception("Cluster id {} not found!".format(cid)) + return self.doFetchAll() + + def get_file_dev(self, project_id, range_id): + self.doExec("SELECT * FROM (SELECT id, commitHash, commitDate, author, description " \ + "FROM commit WHERE projectId={} AND releaseRangeId={}) AS Commits " \ + "INNER JOIN (SELECT file, commitId, SUM(size) AS fileSize " \ + "FROM commit_dependency GROUP BY commitId, file) AS commitFileLOC " \ + "ON Commits.id=commitFileLOC.commitId ORDER BY " \ + "commitFileLOC.file, commitFileLOC.commitId".format(project_id, range_id)) + + if self.cur.rowcount == 0: + raise Exception("Could not obtain file-dev information for project {} "\ + "(release range {}!".format(project_id, range_id)) + return self.doFetchAll() + + def get_release_ranges(self, project_id): + self.doExec("SELECT id FROM release_range \ + WHERE 
projectId={}".format(project_id)) + if self.cur.rowcount == 0: + raise Exception("No release ranges found for project {}!" + .format(project_id)) + return [range_entry[0] for range_entry in self.doFetchAll()] + + def get_cluster_id(self, pid, release_range_id=None): + if release_range_id: + self.doExec("SELECT id FROM cluster WHERE clusterNumber=-1 \ + AND projectId={} AND releaseRangeId={}" + .format(pid, release_range_id)) + else: + self.doExec("SELECT id FROM cluster WHERE clusterNumber=-1 \ + AND projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("Cluster from project {} not found!".format(pid)) + return self.doFetchAll()[0][0] + + def get_project_persons(self, pid): + self.doExec("SELECT id, name FROM person \ + WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("Persons from project {} not found!".format(pid)) + return (self.doFetchAll()) + + def getTagID(self, projectID, tag, type): + """Determine the ID of a tag, given its textual form and the type""" + self.doExec("SELECT id FROM release_timeline WHERE projectId=%s " + + "AND tag=%s AND type=%s", (projectID, tag, type)) + if self.cur.rowcount != 1: + raise Exception("Tag '{}' of type {} is {} times in the DB!". + format(tag, type, self.cur.rowcount)) + return self.doFetchAll()[0][0] + + def getCommitId(self, projectId, commitHash): + self.doExec("SELECT id FROM commit" + + " WHERE commitHash=%s AND projectId=%s" + , (commitHash, projectId)) + if self.cur.rowcount == 0: + raise Exception("Commit from project {} not found!". 
+ format(projectId)) + return self.doFetchAll()[0][0] + + def getRevisionID(self, projectID, tag): + return self.getTagID(projectID, tag, "release") + + def getRCID(self, projectID, tag): + return self.getTagID(projectID, tag, "rc") + + def getReleaseRangeID(self, projectID, revisionIDs): + """Given a pair of release IDs, determine the release range ID""" + self.doExec("SELECT id FROM release_range WHERE projectId=%s " + + "AND releaseStartId=%s AND releaseEndId=%s", + (projectID, revisionIDs[0], revisionIDs[1])) + if self.cur.rowcount != 1: + raise Exception("Release range from '{r[0]}' to '{r[1]}' is {c} " + "times in the DB!". + format(r=revisionIDs, c=self.cur.rowcount)) + return self.doFetchAll()[0][0] + + def getProjectTimeRange(self, pid): + """Given a project ID, determine the start and end date of available VCS data. + Returns a tuple with start end end date in the form YYYY-MM-DD""" + self.doExec("SELECT MIN(date_start) FROM revisions_view " + "WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("No start date for pid {} found!".format(pid)) + date_start = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + self.doExec("SELECT MAX(date_end) FROM revisions_view " + "WHERE projectId={}".format(pid)) + if self.cur.rowcount == 0: + raise Exception("No end date for pid {} found!".format(pid)) + date_end = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + return (date_start, date_end) + + def get_commit_cdate(self, pid, hash): + """Given a project ID and a commit hash, obtain the commit date + in format YYYY-MM-DD""" + self.doExec("SELECT commitDate FROM commit " + "WHERE projectId={} and commitHash='{}'".format(pid, hash)) + if self.cur.rowcount == 0: + raise Exception("No date found for commit {} (pid {}) found!".format(hash, pid)) + date = self.doFetchAll()[0][0].strftime("%Y-%m-%d") + + return (date) + + def get_release_range(self, project_id, range_id): + self.doExec( + "SELECT st.tag, nd.tag, rc.tag FROM release_range " + "LEFT 
JOIN release_timeline AS st ON st.id=releaseStartId " + "LEFT JOIN release_timeline AS nd ON nd.id=releaseEndId " + "LEFT JOIN release_timeline AS rc ON rc.id=releaseRCStartId " + "WHERE release_range.projectId=%s AND release_range.id=%s", + (project_id, range_id)) + ranges = self.doFetchAll() + if self.cur.rowcount == 0: + raise Exception("Range id {} not found!".format(project_id)) + return ranges[0] + + def get_num_commits_in_range(self, range_id): + self.doExec("SELECT COUNT(*) FROM commit WHERE releaseRangeId={}".format(range_id)) + if self.cur.rowcount == 0: + raise Exception("Range id {} not found in get_num_commits_in_range!".format(range_id)) + return self.doFetchAll()[0][0] + + def update_release_timeline(self, project, tagging, revs, rcs, + recreate_project=False): + ''' + For a project, update the release timeline table with the given + revisions. If existing releases/rcs from the timeline are not in + order, the conservative approach is taken and the whole project is + recreated to avoid inconsistencies. + + Returns true if the project had to be recreated. + ''' + assert len(revs) >= 2 + assert len(revs) == len(rcs) + rcs = [rc if rc else rev for rc, rev in zip(rcs, revs)] + pid = self.getProjectID(project, tagging) + + if not recreate_project: + # First check if the release timeline is sane and in order + self.doExec("SELECT tag FROM release_timeline WHERE projectId=%s " + "AND type='release' ORDER BY id", (pid,)) + tags = [tag for (tag,) in self.doFetchAll()] + if len(set(tags)) != len(tags): + log.error("Database corrupted: Duplicate release entries in " + "release_timeline! Recreating project.") + recreate_project = True + if len(tags) == 0: + recreate_project = True + + # Check that the tags are in the same order + if not recreate_project: + for i, tag in enumerate(tags): + if i >= len(revs): + log.warning("List of revisions to analyse was shortened.") + break + if revs[i] != tag: + log.error("Release number {} changed tag from {} to " + "{}. 
Recreating project.". + format(i, tag, revs[i])) + recreate_project = True + break + + # Check that the RC tags are in order + if not recreate_project: + self.doExec("SELECT tag FROM release_timeline WHERE " + "projectId=%s AND type='rc' ORDER BY id", (pid,)) + rctags = [tag for (tag,) in self.doFetchAll()] + if len(set(rctags)) != len(rctags): + log.error("Database corrupted: Duplicate RC entries in " + "release_timeline! Recreating project.") + recreate_project = True + + # Check for changes in release candidates + # Note that the first RC is unused, since it refers to the end + # of a previous period + if not recreate_project: + for i, tag in enumerate(rctags): + if i + 1 >= len(rcs): + log.warning("List of release candidates to analyse " + "was shortened.") + break + if rcs[i + 1] != tag: + log.error("Release candidate number {} changed tag " + "from {} to {}. Recreating project.". + format(i, tag, rcs[i + 1])) + recreate_project = True + break + + # Go through the release ranges and check if they have changed + if not recreate_project: + self.doExec( + "SELECT st.tag, nd.tag, rc.tag FROM release_range " + "LEFT JOIN release_timeline AS st ON st.id=releaseStartId " + "LEFT JOIN release_timeline AS nd ON nd.id=releaseEndId " + "LEFT JOIN release_timeline AS rc ON rc.id=releaseRCStartId " + "WHERE release_range.projectId=%s ORDER BY release_range.id", + (pid,)) + ranges = self.doFetchAll() + if len(set(ranges)) != len(tags) - 1: + log.error("Database corrupted: Number of release ranges" + " does not match number of release tags!") + recreate_project = True + + for i, (start, end, rc) in enumerate(self.doFetchAll()): + if i + 1 >= len(revs) or recreate_project: + # List of revisions to analyse was shortened + break + if (start, end) != (revs[i], revs[i + 1]): + log.error("Release range {} changed from {} to {}." + " Recreating project.". 
+ format(i, (start, end), (revs[i], revs[i + 1]))) + recreate_project = True + break + if rc != rcs[i + 1]: + log.error("Release candidate {} changed from {} to {}." + " Recreating project.". + format(i, rc, rcs[i + 1])) + recreate_project = True + break + + # Recreate project if necessary + if recreate_project: + # This should ripple through the database and delete + # all referencing entries for project + log.warning("Deleting and re-creating project {}/{}.". + format(project, tagging)) + self.doExecCommit("DELETE FROM `project` WHERE id=%s", (pid,)) + pid = self.getProjectID(project, tagging) + tags = [] + rctags = [] + + # at this point we have verified that the first len(tags) + # entries are identical + new_ranges_to_process = [] + if len(revs) > len(tags): + n_new = len(revs) - len(tags) + log.info("Adding {} new releases...".format(n_new)) + previous_rev = None + if len(tags) > 0: + previous_rev = tags[-1] + for rev, rc in list(zip(revs, rcs))[len(tags):]: + self.doExecCommit("INSERT INTO release_timeline " + "(type, tag, projectId) " + "VALUES (%s, %s, %s)", + ("release", rev, pid)) + + if previous_rev is not None and rc: + self.doExecCommit("INSERT INTO release_timeline " + "(type, tag, projectId) " + "VALUES (%s, %s, %s)", + ("rc", rc, pid)) + + if previous_rev is not None: + startID = self.getRevisionID(pid, previous_rev) + endID = self.getRevisionID(pid, rev) + if rc: + rcID = self.getRCID(pid, rc) + else: + rcID = "NULL" + self.doExecCommit("INSERT INTO release_range " + "(releaseStartId, releaseEndId, " + "projectId, releaseRCStartId) " + "VALUES (%s, %s, %s, %s)", + (startID, endID, pid, rcID)) + new_ranges_to_process.append(self.getReleaseRangeID(pid, + (startID, endID))) + previous_rev = rev + # now we are in a well-defined state. 
def tstamp_to_sql(tstamp):
    """Convert a Unix timestamp into an SQL compatible DateTime string"""
    # Interpret the timestamp in UTC so the result is timezone-independent.
    utc_time = datetime.fromtimestamp(tstamp, tz=timezone.utc)
    return utc_time.strftime("%Y-%m-%d %H:%M:%S")
# Enum-like class that names the different methods used to
# link individual developers to one another.
class LinkType:
    tag = "tag"
    proximity = "proximity"
    committer2author = "committer2author"
    file = "file"
    feature = "feature"
    feature_file = "feature_file"

    _all_link_types = (tag, proximity, committer2author,
                       file, feature, feature_file)

    @staticmethod
    def get_all_link_types():
        """Return every known link type as a tuple."""
        return LinkType._all_link_types

    @staticmethod
    def get_tag_types():
        """Return the tag names recognised in commit message footers."""
        return ["Signed-off-by", "Acked-by", "CC", "Reviewed-by",
                "Reported-by", "Tested-by", "Patch"]
# Represents a job submitted to the batch pool.
BatchJobTuple = namedtuple('BatchJobTuple', ['id', 'func', 'args', 'kwargs',
                                             'deps', 'startmsg', 'endmsg'])


class BatchJob(BatchJobTuple):
    # A BatchJobTuple with mutable scheduling state attached; the
    # namedtuple fields themselves stay immutable.
    def __init__(self, *args, **kwargs):
        super(BatchJob, self).__init__(*args, **kwargs)
        self.done = False       # set once a worker reports completion
        self.submitted = False  # set once the job is on the work queue


class BatchJobPool(object):
    '''
    Implementation of a dependency-respecting batch pool

    This system uses a pool of N worker processes to run jobs. Since the
    multiprocessing module is used, all functions, args and kwargs must be
    pickleable. Specifically, this means that only functions defined at
    top-level in a module can be used here.

    Jobs can be created using pool.add(function, args, kwargs, deps=deps))
    where deps can be a list of job handles previously returned by
    pool.add. If multiprocessing is disabled, the functions are run
    immediately and None is returned.

    Call pool.join() to start execution and wait until all jobs are complete.
    If a work item raises an exception, the join() will terminate with
    that exception, if pickleable, or a generic Exception if otherwise.
    '''

    def __init__(self, n_cores):
        self.n_cores = n_cores
        self.next_id = 1
        self.jobs = OrderedDict()  # Dictionary of jobs (ordered for repeatability)

        # Initialize workers and their work and done queues
        self.work_queue, self.done_queues, self.workers = Queue(), [], []
        if n_cores > 1:
            # When n_cores is 1 we don't use the worker processes anyway.
            # However, the pycharm debugger goes crazy when we start the
            # process, so as a workaround don't start anything when
            # n_cores is 1.
            for i in range(n_cores):
                dq = Queue()
                w = Process(target=batchjob_worker_function,
                            args=(self.work_queue, dq))
                self.done_queues.append(dq)
                self.workers.append(w)
                w.start()

    def _is_ready(self, job):
        '''Returns true if the job is ready for submission'''
        if job.done or job.submitted:
            return False
        return all(self.jobs[j].done for j in job.deps if j is not None)

    def _submit(self, job):
        '''Submit the job if it is ready'''
        if self._is_ready(job):
            self.work_queue.put(job)
            job.submitted = True

    def add(self, func, args, kwargs=None, deps=(), startmsg=None, endmsg=None):
        '''
        Add a job that executes func(*args, **kwargs) and depends on the
        jobs with the ids listed in deps.
        This function returns a job ID which can be used as a dependency
        in other calls to add.
        If n_cores is 1; this call immediately executes the given function
        and returns None
        '''
        # A mutable default argument ({}) would be shared between calls;
        # use the None sentinel instead.
        if kwargs is None:
            kwargs = {}
        if self.n_cores == 1:
            log.info(startmsg)
            func(*args, **kwargs)
            log.info(endmsg)
            return None
        job_id = self.next_id
        self.next_id += 1
        j = BatchJob(job_id, func, args, kwargs, deps, startmsg, endmsg)
        self.jobs[job_id] = j
        return job_id

    def join(self):
        '''
        Submit jobs and wait for all jobs to finish.
        '''
        try:
            while not all(j.done for j in self.jobs.values()):
                # Put jobs that are ready onto the work queue
                for j in self.jobs.values():
                    self._submit(j)
                # Wait for a result from the done_queues
                for dq in self.done_queues:
                    try:
                        res = dq.get(block=False)
                    except Empty:
                        continue
                    if res is None:
                        log.fatal("Uncaught exception in worker thread!")
                        raise Exception("Failure in Batch Pool")
                    if isinstance(res, Exception):
                        log.fatal("Uncaught exception in worker thread:")
                        raise res
                    log.debug("Job {} has finished!".format(res))
                    self.jobs[res].done = True
                # Check if workers died
                for w in self.workers:
                    if not w.is_alive():
                        w.join()
                        raise Exception("A Worker died unexpectedly!")
                sleep(0.01)
        finally:
            # Terminate and join the workers
            # Wait 100ms to allow backtraces to be logged
            sleep(0.1)
            log.devinfo("Terminating workers...")
            for w in self.workers:
                w.terminate()
            log.devinfo("Workers terminated.")
# Collect the stack of every live thread as a list of text lines,
# suitable for logging from a signal handler.
def get_stack_dump():
    names_by_ident = {t.ident: t.name for t in threading_enumerate()}
    dump = ["Stack dump:"]
    for ident, frame in sys._current_frames().items():
        dump.append("")
        dump.append("# Thread: %s(%d)" % (names_by_ident.get(ident, ""), ident))
        for fname, lineno, func, text in traceback.extract_stack(frame):
            dump.append('File: "%s", line %d, in %s' % (fname, lineno, func))
            if text:
                dump.append(" %s" % (text.strip()))
    return dump
# Signal handler that dumps all stacks and terminates silently
# Also uses the Lock l to dis-interleave the stack traces
def handle_sigint_silent(signal, frame):
    with l:
        for c in get_stack_dump():
            log.devinfo(c)
    logging.shutdown()
    # Since we want to terminate worker threads with prejudice,
    # we use os._exit, which directly terminates the process.
    # otherwise the worker try/catch will also catch the SystemExit
    # Fixed: the previous call was os.exit_(-1), which does not exist
    # and raised AttributeError instead of terminating (cf. handle_sigterm,
    # which already uses os._exit correctly).
    os._exit(-1)
+ ''' + jcmd = " ".join(cmd) + log.debug("Running command: {}".format(jcmd)) + try: + if direct_io: + pipe = Popen(cmd, cwd=cwd) + else: + pipe = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd) + stdout, stderr = pipe.communicate() + except OSError: + log.error("Error executing command {}!".format(jcmd)) + raise + + if pipe.returncode != 0: + if ignore_errors: + if not(silent_errors): + log.warning("Command '{}' failed with exit code {}. Ignored.". + format(jcmd, pipe.returncode)) + else: + if not(direct_io) and not(silent_errors): + log.info("Command '{}' stdout:".format(jcmd)) + for line in stdout.splitlines(): + log.info(line) + log.info("Command '{}' stderr:".format(jcmd)) + for line in stderr.splitlines(): + log.info(line) + msg = "Command '{}' failed with exit code {}. \n" \ + "(stdout: {}\nstderr: {})"\ + .format(jcmd, pipe.returncode, stdout, stderr) + if not(silent_errors): + log.error(msg) + raise Exception(msg) + return stdout + +def _convert_dot_file(dotfile): + ''' + Convert duplicate edges in the given dot file into edges with + a larger pen width. + ''' + res = [] + edges = {} + edge_spec = re.compile("\s+(\d+) -> (\d+);") + + file = open(dotfile, "r") + lines = [line.strip("\n") for line in file] + # Modify the header (copyright line + digraph) + lines[0] = "digraph {" + lines[1] = "node[fontsize=30, shape=\"box\"];" + + lines[len(lines)-1] = "" # Skip closing brace + + for line in lines: + m = re.match(edge_spec, line) + if m: + a, b = m.group(1), m.group(2) + edges[(a,b)] = edges.get((a,b), 0) + 1 + else: + res.append(line + "\n") + + # sort the edges for reproducibility + for ((a, b), count) in sorted(edges.items()): + res.append("{0} -> {1} [weight={2} penwidth={3}];\n". 
+ format(a,b,count, sqrt(float(count)))) + + res.append("overlap=prism;\n") + res.append("splines=true;\n") + res.append("}\n") + return res + +def layout_graph(filename): + out = NamedTemporaryFile(mode="w", delete=False) + out.writelines(_convert_dot_file(filename)) + out.close() # flushes the cache + cmd = [] + cmd.append("dot") + cmd.append("-Kfdp") + cmd.append("-Tpdf") + cmd.append("-Gcharset=utf-8") + cmd.append("-o{0}.pdf".format(os.path.splitext(filename)[0])) + cmd.append(out.name) + execute_command(cmd, ignore_errors=True) + # Manually remove the temporary file + os.unlink(out.name) + +def generate_report(start_rev, end_rev, resdir): + log.devinfo(" -> Generating report") + report_base = "report-{0}_{1}".format(start_rev, end_rev) + + # Run perl script to generate report LaTeX file + cmd = [] + cmd.append(resource_filename(__name__, "perl/create_report.pl")) + cmd.append(resdir) + cmd.append("{0}--{1}".format(start_rev, end_rev)) + with open(os.path.join(resdir, report_base + ".tex"), 'w') as f: + f.write(execute_command(cmd)) + + # Compile report with lualatex + cmd = [] + cmd.append("lualatex") + cmd.append("-interaction=nonstopmode") + cmd.append(os.path.join(resdir, report_base + ".tex")) + + # We run latex in a temporary directory so that it's easy to + # get rid of the log files etc. 
def check4ctags():
    """Verify that the required ctags-exuberant binary is installed.

    Raises an Exception when the program is missing or reports an
    incompatible version.
    """
    # check if the appropriate ctags is installed on the system
    prog_name = 'Exuberant Ctags'
    prog_version = 'Exuberant Ctags 5.9~svn20110310'
    cmd = "ctags-exuberant --version".split()

    # Previously a positional None was passed for ignore_errors; the
    # default (False) has the same effect and is less confusing.
    res = execute_command(cmd)
    # Popen pipes return bytes on Python 3; decode before comparing with
    # the expected version strings (str.startswith(bytes) raised TypeError).
    if isinstance(res, bytes):
        res = res.decode("utf-8", errors="replace")

    if not(res.startswith(prog_name)):
        log.error("program '{0}' does not exist".format(prog_name))
        raise Exception("ctags-exuberant not found")

    if not(res.startswith(prog_version)):
        # TODO: change this to use standard mechanism for error logging
        log.error("Ctags version '{0}' not found".format(prog_version))
        raise Exception("Incompatible ctags-exuberant version")
def parse_iso_git_date(date_string):
    """Parse a git ISO-8601 date string ("YYYY-MM-DD HH:MM:SS +HHMM")
    into a naive datetime normalised to UTC.

    The trailing UTC offset is applied manually; %z was not reliably
    available when this code was written.

    Raises ValueError when the trailing offset cannot be parsed.
    """
    # from http://stackoverflow.com/questions/526406/python-time-to-age-part-2-timezones
    try:
        offset = int(date_string[-5:])
    except ValueError:
        log.error("could not extract timezone info from \"{0}\""
                  .format(date_string))
        raise
    # Python 3 fix: "offset / 100" is true division, so an offset such as
    # +0530 produced fractional hours (5.3) and a wrong delta. Split the
    # absolute offset into integer hour/minute parts and apply the sign
    # to the whole delta (also fixes negative half-hour offsets).
    hours, minutes = divmod(abs(offset), 100)
    delta = timedelta(hours=hours, minutes=minutes)
    if offset < 0:
        delta = -delta
    # In future python versions we can use "%Y-%m-%d %H:%M:%S %z"
    # directly; currently %z isn't working as documented.
    fmt = "%Y-%m-%d %H:%M:%S"
    parsed_date = datetime.strptime(date_string[:-6], fmt)
    parsed_date -= delta
    return parsed_date
# Determine the size and number of analysis windows from the
# configuration; fall back to the defaults (3 months, unlimited)
# when the configuration does not override them.
def get_analysis_windows(conf):
    keys = conf.keys()
    window_size_months = conf["windowSize"] if "windowSize" in keys else 3
    num_window = conf["numWindows"] if "numWindows" in keys else -1
    return window_size_months, num_window
earliest + # first commit does not carry the earliest commit date + revs = [rev.split(",") for rev in revs] + rev_len = len(revs) + if int(revs[0][1]) > int(revs[1][1]): + del revs[0] + + # Extract hash values and dates intro seperate lists + revs_hash = [rev[0] for rev in revs] + revs_date = [rev[2].split(" ")[0] for rev in revs] + + # We cannot detect release canndidate tags in this analysis mode, + # so provide a list with None entries + rcs = [None for x in range(len(revs))] + + return revs_hash, rcs, revs_date From 96c4208360c209245395a11b0d54a5e68ab15ed8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Mon, 4 Aug 2025 18:46:52 +0200 Subject: [PATCH 03/13] Remove dependency on codeface --- codeface_extraction/codeface_extraction.py | 9 +++++---- codeface_extraction/extractions.py | 13 ++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 3478b1f..d5df6a0 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -22,15 +22,16 @@ import argparse import sys +from logging import getLogger from os.path import abspath -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager - from . 
import extractions from csv_writer import csv_writer +from codeface.dbmanager import DBManager +from codeface.configuration import Configuration +# create logger +log = getLogger(__name__) ## # RUN FOR ALL PROJECTS diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index 081d1be..edeefda 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -26,18 +26,18 @@ import os import unicodedata import re +from logging import getLogger from ftfy import fix_encoding from email.header import decode_header, make_header -from codeface.cli import log from codeface.util import gen_range_path +log = getLogger(__name__) # # GET EXTRACTIONS # - def get_extractions(dbm, conf, resdir, csv_writer, extract_commit_messages, extract_impl, extract_on_range_level): # all extractions are subclasses of Extraction: # instantiate them all! @@ -117,7 +117,7 @@ def __init__(self, dbm, conf, res_dir, csv_writer): def is_project_level(self): """Check if this extraction is on project level (i.e., {revision} is not on the SQL statement).""" - return not ("{revision}" in self.sql) + return "{revision}" not in self.sql def is_generic_extraction(self): """Check if this extraction is generic (i.e., it can be used for several artifacts and, hence, @@ -441,7 +441,7 @@ def __init__(self, dbm, conf, resdir, csv_writer): def get_list(self): result = self._run_sql(None, None) lines = self._reduce_result(result) - return [rev for (rev, date) in lines] + return [rev for (rev, _) in lines] # @@ -737,8 +737,8 @@ def fix_characters_in_string(text): new_text = fix_encoding(text) # remove unicode characters from "Specials" block - # see: https://www.compart.com/en/unicode/block/U+FFF0 - new_text = re.sub(r"\\ufff.", " ", new_text.encode("unicode-escape")) + # see: https://www.compart.com/en/unicode/block/U+FFF0 + new_text = re.sub(r"\\ufff.", " ", new_text).encode("unicode-escape") # remove all kinds of control characters and emojis # see: 
https://www.fileformat.info/info/unicode/category/index.htm @@ -772,5 +772,4 @@ def fix_name_encoding(name): except LookupError: # Encoding not found, return string as is return name - return name From 454efa3094070c8445bfc8dcf58ddb5c7a848881 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Fri, 29 Aug 2025 12:15:14 +0200 Subject: [PATCH 04/13] Rename folder for codeface fragments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- {codeface => codeface_utils}/__init__.py | 0 codeface_utils/cluster/PersonInfo.py | 54 ++++ codeface_utils/cluster/idManager.py | 281 ++++++++++++++++++ {codeface => codeface_utils}/configuration.py | 37 ++- {codeface => codeface_utils}/dbmanager.py | 96 ++++-- {codeface => codeface_utils}/linktype.py | 0 {codeface => codeface_utils}/util.py | 153 ++++++++-- 7 files changed, 561 insertions(+), 60 deletions(-) rename {codeface => codeface_utils}/__init__.py (100%) create mode 100644 codeface_utils/cluster/PersonInfo.py create mode 100644 codeface_utils/cluster/idManager.py rename {codeface => codeface_utils}/configuration.py (86%) rename {codeface => codeface_utils}/dbmanager.py (83%) rename {codeface => codeface_utils}/linktype.py (100%) rename {codeface => codeface_utils}/util.py (77%) diff --git a/codeface/__init__.py b/codeface_utils/__init__.py similarity index 100% rename from codeface/__init__.py rename to codeface_utils/__init__.py diff --git a/codeface_utils/cluster/PersonInfo.py b/codeface_utils/cluster/PersonInfo.py new file mode 100644 index 0000000..5884108 --- /dev/null +++ b/codeface_utils/cluster/PersonInfo.py @@ -0,0 +1,54 @@ +# This file is part of Codeface. Codeface is free software: you can +# redistribute it and/or modify it under the terms of the GNU General Public +# License as published by the Free Software Foundation, version 2. 
class PersonInfo:
    """ Information about a commiter, and his relation to other commiters"""

    def __init__(self, ID=None, name="", email=""):
        self.ID = ID
        self.name = name
        self.email = email

    def __str__(self):
        # Render as the conventional "Name <email>" form.
        return "{0} <{1}>".format(self.name, self.email)

    def setID(self, ID):
        self.ID = ID

    def getID(self):
        return self.ID

    def setName(self, name):
        self.name = name

    def getName(self):
        # Fall back to the e-mail address when no name is known.
        return self.email if self.name == "" else self.name

    def setEmail(self, email):
        self.email = email

    def getEmail(self):
        return self.email
class idManager(ABC):
    """Abstract base class that assigns stable numeric IDs to contributors.

    Subclasses implement the actual ID lookup (_query_user_id) and the
    reverse lookup (getPersonFromDB); this class provides request caching,
    address normalisation and the PersonInfo bookkeeping.
    """

    def __init__(self):
        self.subsys_names = []

        # Cache identical requests to the server
        self._cache = {}

        # Map IDs to an instance of PersonInfo
        self.persons = {}

        # Map a name, email address, or a combination of both to the numeric ID
        # assigned to the developer
        self.person_ids = {}

        # "Name <email>" fixup for addresses the stdlib parser cannot handle
        self.fixup_emailPattern = re.compile(r'([^<]+)\s+<([^>]+)>')
        # Detects "Surname, Name" so it can be rewritten as "Name Surname"
        self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)')

    @abstractmethod
    def _query_user_id(self, name, email):
        """Resolve a (name, email) pair to a numeric contributor ID.

        NOTE: the declaration previously took a single ``person_id``
        parameter, but getPersonID and every implementation pass
        (name, email); the signature is aligned with actual usage.
        """
        pass

    @abstractmethod
    def getPersonFromDB(self, person_id):
        """Retrieve the stored person record for the given numeric ID."""
        pass

    def getPersonID(self, addr):
        """Obtain a unique ID from contributor identity credentials.

        The ID mapping -- including handling of multiple identities for
        the same person -- is maintained by the concrete subclass
        (e.g. a REST id service or a csv file).
        """

        (name, email) = self._decompose_addr(addr)
        if (name, email) not in self._cache:
            self._cache[(name, email)] = self._query_user_id(name, email)
        ID = self._cache[(name, email)]

        # Construct a local instance of PersonInfo for the contributor
        # if it is not yet available
        if (ID not in self.persons):
            # PersonInfo takes (ID, name, email); the previous call passed
            # self.subsys_names as the first argument, shifting every
            # parameter by one and raising a TypeError at runtime.
            self.persons[ID] = PersonInfo(ID, name, email)

        return ID

    def getPersons(self):
        return self.persons

    def getPI(self, ID):
        return self.persons[ID]

    # We need the subsystem names because PersonInfo instances
    # are created from this class -- and we want to know in which
    # subsystem(s) a developer is active
    def setSubsysNames(self, subsys_names):
        self.subsys_names = subsys_names

    def getSubsysNames(self):
        return self.subsys_names

    def _cleanName(self, name):
        # Remove or replace characters in names that are known
        # to cause parsing problems in later stages
        name = name.replace('\"', "")
        name = name.replace("\'", "")
        name = name.strip()

        return name

    def _decompose_addr(self, addr):
        """Split an address string into a normalised (name, email) tuple,
        applying several fixup heuristics for malformed addresses."""
        addr = addr.replace("[", "").replace("]", "")
        (name, email) = parseaddr(addr)

        # Handle cases where the name is unknown from commits that potentially
        # predate the era of git, where only an e-mail address was given.
        # In such a case, we set the name to the e-mail address. Otherwise,
        # all authors with unknown name would be matched to one person.
        if (name == "unknown" or name == "unknown (none)" or name == "none"):
            name = email

        # The eMail parser cannot handle Surname, Name properly.
        # Provide a fixup hack for this case
        if (name == "" or email.count("@") == 0):
            m = re.search(self.fixup_emailPattern, addr)
            if m:
                name = m.group(1)
                email = m.group(2)
                m2 = re.search(self.commaNamePattern, name)
                if m2:
                    # Replace "Surname, Name" by "Name Surname"
                    name = "{0} {1}".format(m2.group(2), m2.group(1))
            else:
                # check for the following special format: email@domain.tld <>
                strangePattern = re.compile(r'(.*@.*)\s+(<>)')
                m3 = re.search(strangePattern, addr)
                if m3:
                    # Replace addr by "email "
                    name = m3.group(1).split("@")[0]  # get name before @ symbol
                    email = m3.group(1)
                else:
                    # In this case, no eMail address was specified; generate
                    # a random placeholder so the contributor still gets an ID.
                    name = addr
                    rand_str = "".join(random.choice(string.ascii_lowercase + string.digits)
                                       for i in range(10))
                    email = "could.not.resolve@" + rand_str

        email = email.lower()

        name = self._cleanName(name)
        email = self._cleanName(email)

        return (name, email)
+ self._projectID = self._dbm.getProjectID(conf["project"], + conf["tagging"]) + + # Construct request headers + self.headers = {"Content-type": + "application/x-www-form-urlencoded; charset=utf-8", + "Accept": "text/plain"} + + def _query_user_id(self, name, email): + """Query the ID database for a contributor ID""" + + name = encode_as_utf8(name) + params = six.moves.urllib.parse.urlencode({'projectID': self._projectID, + 'name': name, + 'email': email}) + + try: + self._conn.request("POST", "/post_user_id", params, self.headers) + res = self._conn.getresponse() + except: + retryCount = 0 + successful = False + while (retryCount <= 10 and not successful): + log.warning("Could not reach ID service. Try to reconnect " \ + "(attempt {}).".format(retryCount)); + self._conn.close() + self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + time.sleep(60) + #self._conn.ping(True) + try: + self._conn.request("POST", "/post_user_id", params, self.headers) + res = self._conn.getresponse() + successful = True + except: + if retryCount < 10: + retryCount += 1 + else: + retryCount += 1 + log.exception("Could not reach ID service. Is the server running?\n") + raise + + # TODO: We should handle errors by throwing an exception instead + # of silently ignoring them + result = res.read() + jsond = json.loads(result) + try: + id = jsond["id"] + except KeyError: + raise Exception("Bad response from server: '{}'".format(jsond)) + + return (id) + + def getPersonID(self, addr): + """Obtain a unique ID from contributor identity credentials. + + The IDs are managed by a central database accessed via REST. + Managing multiple identities for the same person is also + handled there. Safety against concurrent access is provided by + the database. 
+ """ + + (name, email) = self._decompose_addr(addr) + if not (name, email) in self._cache: + self._cache[(name, email)] = self._query_user_id(name, email) + ID = self._cache[(name, email)] + + # Construct a local instance of PersonInfo for the contributor + # if it is not yet available + if ID not in self.persons: + self.persons[ID] = PersonInfo(self.subsys_names, ID, name, email) + + return ID + + def getPersonFromDB(self, person_id): + """Query the ID database for a contributor and all corresponding data""" + + try: + self._conn.request("GET", "/getUser/{}".format(person_id), headers=self.headers) + res = self._conn.getresponse() + except: + self._conn.close() + self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + retryCount = 0 + successful = False + while (retryCount <= 10 and not successful): + log.warning("Could not reach ID service. Try to reconnect " \ + "(attempt {}).".format(retryCount)); + self._conn.close() + self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + time.sleep(60) + #self._conn.ping(True) + try: + self._conn.request("GET", "/getUser/{}".format(person_id), headers=self.headers) + res = self._conn.getresponse() + successful = True + except: + if retryCount < 10: + retryCount += 1 + else: + retryCount += 1 + log.exception("Could not reach ID service. 
Is the server running?\n") + raise + + result = res.read() + jsond = json.loads(result)[0] + + return (jsond) + diff --git a/codeface/configuration.py b/codeface_utils/configuration.py similarity index 86% rename from codeface/configuration.py rename to codeface_utils/configuration.py index e1fa874..d43f8d4 100644 --- a/codeface/configuration.py +++ b/codeface_utils/configuration.py @@ -19,15 +19,17 @@ Encapsulates a configuration as an immutable dict ''' +from __future__ import absolute_import import yaml from collections.abc import Mapping +from six.moves import range from logging import getLogger -from tempfile import NamedTemporaryFile -from codeface.linktype import LinkType +from codeface_utils.linktype import LinkType + -# create logger log = getLogger(__name__) +from tempfile import NamedTemporaryFile class ConfigurationError(Exception): '''Raised if any part of the configuration is malformed''' @@ -39,8 +41,8 @@ class Configuration(Mapping): ''' GLOBAL_KEYS = ('dbname', 'dbhost', 'dbuser', 'dbpwd', - 'idServiceHostname', 'idServicePort') - GLOBAL_OPTIONAL_KEYS = ('dbport',) + 'idServiceHostname', 'idServicePort') + GLOBAL_OPTIONAL_KEYS = ('dbport', 'useCsv') PROJECT_KEYS = ('project', 'repo', 'tagging', 'revisions', 'rcs') OPTIONAL_KEYS = ('description', 'ml', 'mailinglists', 'sleepTime', 'proxyHost', 'proxyPort', 'bugsProjectName', @@ -48,7 +50,8 @@ class Configuration(Mapping): 'issueTrackerURL', 'issueTrackerProject', 'issueTrackerUser', 'issueTrackerPassword', 'understand', 'sloccount', 'windowSize', 'numWindows', - 'qualityType', 'communicationType', 'artifactType', 'dependencyType') + 'qualityType', 'communicationType', 'artifactType', 'dependencyType', + 'csvFile', 'csvSeparator') ALL_KEYS = set(GLOBAL_KEYS + GLOBAL_OPTIONAL_KEYS + PROJECT_KEYS + OPTIONAL_KEYS) @@ -64,19 +67,19 @@ def __init__(self): self._conf_file_loc = None @classmethod - def load(cls, global_conffile, local_conffile=None): + def load(self, global_conffile, local_conffile=None): 
''' Load configuration from global/local files ''' c = Configuration() log.info("Loading global configuration file '{}'". format(global_conffile)) - cls._global_conf = c._load(global_conffile) + self._global_conf = c._load(global_conffile) c._conf.update(c._global_conf) if local_conffile: log.info("Loading project configuration file '{}'". format(local_conffile)) - cls._project_conf = c._load(local_conffile) + self._project_conf = c._load(local_conffile) c._conf.update(c._project_conf) else: log.info("Not loading project configuration file!") @@ -87,7 +90,7 @@ def load(cls, global_conffile, local_conffile=None): def _load(self, filename): '''Helper function that checks loading errors and logs them''' try: - return yaml.load(open(filename), Loader=yaml.SafeLoader) + return yaml.load(open(filename, 'r'), Loader=yaml.SafeLoader) except IOError: log.exception("Could not open configuration file '{}'". format(filename)) @@ -100,7 +103,7 @@ def _load(self, filename): def _initialize(self): '''Infer missing values in the configuration''' if "rcs" not in self: - self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] + self._conf["rcs"] = [None for x in range(len(self["revisions"]))] if "mailinglists" not in self: self._conf["mailinglists"] = [] @@ -129,12 +132,12 @@ def _check_sanity(self): raise ConfigurationError('Invalid configuration key.') for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: - if key not in self: + if not key in self: log.critical("Required key '{}' missing in configuration!" 
''.format(key)) raise ConfigurationError('Missing configuration key.') - if self['tagging'] not in LinkType.get_all_link_types(): + if not self['tagging'] in LinkType.get_all_link_types(): log.critical('Unsupported tagging mechanism specified!') raise ConfigurationError('Unsupported tagging mechanism.') @@ -147,6 +150,14 @@ def _check_sanity(self): "candidates.".format(len(self["revisions"]), len(self["rcs"]))) raise ConfigurationError('Malformed configuration.') + if self["useCsv"]: + if not "csvFile" in self: + log.critical("Malformed configuration: useCsv is true, but " + "csvFile is not specified.") + raise ConfigurationError('Malformed configuration.') + if not "csvSeparator" in self: + self["csvSeparator"] = "," + unknown_keys = [k for k in self if k not in self.ALL_KEYS] for key in unknown_keys: log.warning("Unknown key '{}' in configuration.".format(key)) diff --git a/codeface/dbmanager.py b/codeface_utils/dbmanager.py similarity index 83% rename from codeface/dbmanager.py rename to codeface_utils/dbmanager.py index df917ca..4f8895d 100644 --- a/codeface/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -17,10 +17,15 @@ # Thin sql database wrapper +from __future__ import absolute_import +from __future__ import print_function import MySQLdb as mdb -from datetime import datetime, timezone -from logging import getLogger +import time +from datetime import datetime +from logging import getLogger; from contextlib import contextmanager +from six.moves import range +from six.moves import zip # create logger log = getLogger(__name__) @@ -44,13 +49,28 @@ class DBManager: """This class provides an interface to the codeface sql database.""" def __init__(self, conf): + + self.conf = conf + self.__openConnection(conf) + + # max_packet_size = 1024 * 1024 * 512 + # self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) + + def __del__(self): + if self.con != None: + self.con.close() + + def __openConnection(self, conf): try: self.con = None self.con = 
mdb.Connection(host=conf["dbhost"], port=conf["dbport"], user=conf["dbuser"], passwd=conf["dbpwd"], - db=conf["dbname"]) + db=conf["dbname"], + charset="utf8", + use_unicode=True) + self.cur = self.con.cursor() log.debug( "Establishing MySQL connection to " "{c[dbuser]}@{c[dbhost]}:{c[dbport]}, DB '{c[dbname]}'" @@ -62,14 +82,7 @@ def __init__(self, conf): ": {e[1]} ({e[0]})" "".format(c=conf, e=e.args)) raise - self.cur = self.con.cursor() - max_packet_size = 1024 * 1024 * 256 - self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) - - def __del__(self): - if self.con is not None: - self.con.close() def doExec(self, stmt, args=None): with _log_db_error(stmt, args): @@ -87,21 +100,53 @@ def doExec(self, stmt, args=None): if dbe.args[0] == 1213: # Deadlock! retry... log.warning("Recoverable deadlock in MySQL - retrying " \ "(attempt {}).".format(retryCount)) + elif dbe.args[0] == 2003: # Can't connect to MySQL server + log.warning("Can't connect to MySQL server - retrying " \ + "(attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") elif dbe.args[0] == 2006: # Server gone away... log.warning("MySQL Server gone away, trying to reconnect " \ "(attempt {}).".format(retryCount)) - self.con.ping(True) - elif dbe.args[0] == 2013: # Lost connection to MySQL server during query... 
+ time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 2013 or dbe.args[0] == 1053: # Lost connection to MySQL server during query | Server shutdown in progress log.warning("Lost connection to MySQL server during query, " \ "trying to reconnect (attempt {}).".format(retryCount)) + time.sleep(60) + log.warning("Try opening new connection") + self.con.close() + log.warning("Connection successfully closed") + self.__openConnection(self.conf) + log.warning("Opening new connection successful") + elif dbe.args[0] == 1153: # Got a packet bigger than 'max_allowed_packet' bytes + log.warning("Sent a too big packet ({lnos} lines), retrying with smaller packets.".format( + lnos=len(args))) + ## split package into smaller packets of size 'chunk_size' + chunk_size = 100 + args_list = [args[i:i + chunk_size] for i in range(0, len(args), chunk_size)] + ## retrying + time.sleep(60) self.con.ping(True) + for chunk in args_list: + self.doExec(stmt, chunk) else: + self.con.close() raise - # Give up after ten retry attempts and propagate the - # problem to the caller. Callers can either fix the problem with - # a different query, or the analysis fails + # Give up after too many retry attempts and propagate the + # problem to the caller. 
Either it's fixed with a different + # query, or the analysis fails log.error("DB access failed after ten attempts, giving up") + self.con.close() raise def doFetchAll(self): @@ -203,13 +248,18 @@ def getTagID(self, projectID, tag, type): format(tag, type, self.cur.rowcount)) return self.doFetchAll()[0][0] - def getCommitId(self, projectId, commitHash): - self.doExec("SELECT id FROM commit" + - " WHERE commitHash=%s AND projectId=%s" - , (commitHash, projectId)) + def getCommitId(self, projectId, commitHash, releaseRangeID=None): + stmt = "SELECT id FROM commit WHERE commitHash=%s AND projectId=%s" + args = (commitHash, projectId) + + if (releaseRangeID): + stmt += " AND releaseRangeId=%s" + args += (releaseRangeID, ) + + self.doExec(stmt, args) if self.cur.rowcount == 0: - raise Exception("Commit from project {} not found!". - format(projectId)) + raise Exception("Commit {0} from project {1} not found!". + format(commitHash, projectId)) return self.doFetchAll()[0][0] def getRevisionID(self, projectID, tag): @@ -394,7 +444,7 @@ def update_release_timeline(self, project, tagging, revs, rcs, previous_rev = None if len(tags) > 0: previous_rev = tags[-1] - for rev, rc in list(zip(revs, rcs))[len(tags):]: + for rev, rc in zip(revs, rcs)[len(tags):]: self.doExecCommit("INSERT INTO release_timeline " "(type, tag, projectId) " "VALUES (%s, %s, %s)", @@ -428,4 +478,4 @@ def update_release_timeline(self, project, tagging, revs, rcs, def tstamp_to_sql(tstamp): """Convert a Unix timestamp into an SQL compatible DateTime string""" - return (datetime.fromtimestamp(tstamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S")) + return (datetime.utcfromtimestamp(tstamp).strftime("%Y-%m-%d %H:%M:%S")) diff --git a/codeface/linktype.py b/codeface_utils/linktype.py similarity index 100% rename from codeface/linktype.py rename to codeface_utils/linktype.py diff --git a/codeface/util.py b/codeface_utils/util.py similarity index 77% rename from codeface/util.py rename to 
codeface_utils/util.py index 807ecc4..d859dcf 100644 --- a/codeface/util.py +++ b/codeface_utils/util.py @@ -17,6 +17,8 @@ Utility functions for running external commands ''' +from __future__ import absolute_import +import logging; log = logging.getLogger(__name__) import os import os.path import re @@ -24,19 +26,24 @@ import signal import sys import traceback +import unicodedata from collections import OrderedDict, namedtuple from glob import glob from math import sqrt -from multiprocessing import Process, Queue, Lock -from pkg_resources import resource_filename +from multiprocessing import Process, Queue, JoinableQueue, Lock +from pickle import dumps, PicklingError +from importlib.resources import files from subprocess import Popen, PIPE from tempfile import NamedTemporaryFile, mkdtemp from time import sleep from threading import enumerate as threading_enumerate -from queue import Empty +from six.moves.queue import Empty from datetime import timedelta, datetime -import logging -log = logging.getLogger(__name__) +from ftfy import fix_encoding +from six.moves import map +import six +from six.moves import range +from six.moves import zip # Represents a job submitted to the batch pool. BatchJobTuple = namedtuple('BatchJobTuple', ['id', 'func', 'args', 'kwargs', @@ -150,10 +157,10 @@ def join(self): # Terminate and join the workers # Wait 100ms to allow backtraces to be logged sleep(0.1) - log.devinfo("Terminating workers...") + log.info("Terminating workers...") for w in self.workers: w.terminate() - log.devinfo("Workers terminated.") + log.info("Workers terminated.") def batchjob_worker_function(work_queue, done_queue): ''' @@ -205,7 +212,7 @@ def handle_sigint(signal, frame): with l: log.fatal("CTRL-C pressed!") for c in get_stack_dump(): - log.devinfo(c) + log.info(c) # This call raises a SystemExit exception in the # stack frame that was interrupted by the signal # For the main thread, this is what we want. 
@@ -216,12 +223,12 @@ def handle_sigint(signal, frame): def handle_sigint_silent(signal, frame): with l: for c in get_stack_dump(): - log.devinfo(c) + log.info(c) logging.shutdown() # Since we want to terminate worker threads with prejudice, # we use os._exit, which directly terminates the process. # otherwise the worker try/catch will also catch the SystemExit - os.exit_(-1) + os._exit(-1) def handle_sigterm(signal, frame): # Since we want to terminate worker threads with prejudice, @@ -290,7 +297,7 @@ def _convert_dot_file(dotfile): ''' res = [] edges = {} - edge_spec = re.compile("\s+(\d+) -> (\d+);") + edge_spec = re.compile(r"\s+(\d+) -> (\d+);") file = open(dotfile, "r") lines = [line.strip("\n") for line in file] @@ -334,12 +341,12 @@ def layout_graph(filename): os.unlink(out.name) def generate_report(start_rev, end_rev, resdir): - log.devinfo(" -> Generating report") + log.info(" -> Generating report") report_base = "report-{0}_{1}".format(start_rev, end_rev) # Run perl script to generate report LaTeX file cmd = [] - cmd.append(resource_filename(__name__, "perl/create_report.pl")) + cmd.append(files(__package__).joinpath("perl/create_report.pl")) cmd.append(resdir) cmd.append("{0}--{1}".format(start_rev, end_rev)) with open(os.path.join(resdir, report_base + ".tex"), 'w') as f: @@ -376,20 +383,20 @@ def generate_reports(start_rev, end_rev, range_resdir): def check4ctags(): # check if the appropriate ctags is installed on the system - prog_name = 'Exuberant Ctags' - prog_version = 'Exuberant Ctags 5.9~svn20110310' - cmd = "ctags-exuberant --version".split() + prog_name = 'Universal Ctags' + prog_version = 'Universal Ctags 5.9.0, Copyright (C) 2015 Universal Ctags Team' + cmd = "ctags-universal --version".split() res = execute_command(cmd, None) if not(res.startswith(prog_name)): log.error("program '{0}' does not exist".format(prog_name)) - raise Exception("ctags-exuberant not found") + raise Exception("ctags-universal not found") if 
not(res.startswith(prog_version)): # TODO: change this to use standard mechanism for error logging log.error("Ctags version '{0}' not found".format(prog_version)) - raise Exception("Incompatible ctags-exuberant version") + raise Exception("Incompatible ctags-universal version") def check4cppstats(): @@ -454,9 +461,9 @@ def get_analysis_windows(conf): window_size_months = 3 num_window = -1 - if "windowSize" in conf.keys(): + if "windowSize" in list(conf.keys()): window_size_months = conf["windowSize"] - if "numWindows" in conf.keys(): + if "numWindows" in list(conf.keys()): num_window = conf["numWindows"] return window_size_months, num_window @@ -473,11 +480,26 @@ def generate_analysis_windows(repo, window_size_months): latest_date_result = execute_command(cmd_date).splitlines()[0] latest_commit = parse_iso_git_date(latest_date_result) + cmd_root_commit_dates = 'git --git-dir={0} log --max-parents=0 --format=%ad --date=iso8601'\ + .format(repo).split() + root_commit_dates_result = execute_command(cmd_root_commit_dates).splitlines() + earliest_root_commit_date = min([parse_iso_git_date(root_commit) for root_commit in root_commit_dates_result]) + print_fmt = "%Y-%m-%dT%H:%M:%S+0000" month = timedelta(days=30) def get_before_arg(num_months): date = latest_commit - num_months * month + + # Due to a bug in git, broken author information in commit objects can lead to a timestamp of 0 when using the + # --before option although the dates themselves are not broken and can be parsed without problems. + # For more details, see the whole thread conversation on the git mailing list here: + # https://lore.kernel.org/git/7728e059-d58d-cce7-c011-fbc16eb22fb9@cs.uni-saarland.de/ + # To avoid running into an infinite while loop below (due to timestamps being 0), check if the date is earlier + # than the date of the earliest root commit and break if this is the case. 
+ if date < earliest_root_commit_date: + raise ValueError("The before-arg date is earlier than the earliest commit in the repository.") + return '--before=' + date.strftime(print_fmt) revs = [] @@ -491,13 +513,20 @@ def get_before_arg(num_months): revs.extend(rev_end) while start != end: - cmd = cmd_base_max1 + [get_before_arg(start)] - rev_start = execute_command(cmd).splitlines() + + try: + cmd = cmd_base_max1 + [get_before_arg(start)] + rev_start = execute_command(cmd).splitlines() + except ValueError as ve: + rev_start = [] + log.info("rev_start would be earlier than earliest root commit. Start at initial commit instead...") if len(rev_start) == 0: start = end - cmd = cmd_base + ['--reverse'] - rev_start = [execute_command(cmd).splitlines()[0]] + #cmd = cmd_base + ['--reverse'] + #rev_start = [execute_command(cmd).splitlines()[0]] + cmd = cmd_base + ['--max-parents=0'] + rev_start = [execute_command(cmd).splitlines()[-1]] else: end = start start = end + window_size_months @@ -525,3 +554,79 @@ def get_before_arg(num_months): rcs = [None for x in range(len(revs))] return revs_hash, rcs, revs_date + + +def encode_as_utf8(string): + """ + Encode the given string properly in UTF-8, + independent from its internal representation (str or unicode). + + This function removes any control characters and four-byte-encoded unicode characters and replaces them + with " ". (Four-byte-encoded unicode characters do not work with 'utf8' encoding of MySQL.) 
+ + :param string: any string + :return: the UTF-8 encoded string of type str + """ + + try: + string = string.decode("utf-8") + except: + # if we have a string, we transform it to unicode + if isinstance(string, str): + string = six.text_type(string, "unicode-escape", errors="replace") + + ## maybe not a string/unicode at all, return rightaway + if not isinstance(string, six.text_type): + return string + + # convert to real unicode-utf8 encoded string, fix_text ensures proper encoding + new_string = fix_encoding(string) + + # remove unicode characters from "Specials" block + # see: https://www.compart.com/en/unicode/block/U+FFF0 + new_string = re.sub(r"\\ufff.", " ", new_string.encode("unicode-escape")) + + # remove all kinds of control characters and emojis + # see: https://www.fileformat.info/info/unicode/category/index.htm + new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string.decode("unicode-escape")) + + new_string = new_string.encode("utf-8") + + # replace any 4-byte characters with a single space (previously: four_byte_replacement) + try: + # UCS-4 build + four_byte_regex = re.compile(u"[\U00010000-\U0010ffff]") + except re.error: + # UCS-2 build + four_byte_regex = re.compile(u"[\uD800-\uDBFF][\uDC00-\uDFFF]") + + four_byte_replacement = r" " # r":4bytereplacement:" + new_string = four_byte_regex.sub(four_byte_replacement, new_string.decode("utf-8")).encode("utf-8") + + return str(new_string) + + +def encode_items_as_utf8(items): + """ + Encode the given list/tuple/dict of strings properly in UTF-8, + independent from its internal representation (str or unicode). + + This function uses encode_as_utf8(string) internally. 
+ + :param string: any string + :return: the UTF-8 encoded string of type str + """ + + # unpack values if we have a dictionary + items_unpacked = items + if isinstance(items, dict): + items_unpacked = list(items.values()) + + # encode each item as UTF-8 properly + items_enc = list(map(encode_as_utf8, items_unpacked)) + + # add key for dict again + if isinstance(items, dict): + items_enc = dict(zip(list(items.keys()), items_enc)) + + return items_enc From 347238451357f5a17dd3e3014f7a4bd25872895a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Fri, 29 Aug 2025 12:17:05 +0200 Subject: [PATCH 05/13] Introduce CSV-based IdManager class MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using the csvIdManager, we can use all functionality but extraction without a codeface installation. The csvIdManager then replaces the MySQL-based IdManager from codeface. Signed-off-by: Maximilian Löffler --- codeface_utils/cluster/idManager.py | 91 ++++++++++++++++++++++++----- 1 file changed, 77 insertions(+), 14 deletions(-) diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index b6f68d3..b319511 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -36,8 +36,6 @@ class idManager(ABC): def __init__(self): - self.subsys_names = [] - # Cache identical requests to the server self._cache = {} @@ -52,7 +50,7 @@ def __init__(self): self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)') @abstractmethod - def _query_user_id(self, person_id): + def _query_user_id(self, name, email): pass @abstractmethod @@ -75,7 +73,7 @@ def getPersonID(self, addr): # Construct a local instance of PersonInfo for the contributor # if it is not yet available if (ID not in self.persons): - self.persons[ID] = PersonInfo(self.subsys_names, ID, name, email) + self.persons[ID] = PersonInfo(ID, name, email) return ID @@ -85,15 +83,6 @@ def getPersons(self): def getPI(self, 
ID): return self.persons[ID] - # We need the subsystem names because PersonInfo instances - # are created from this class -- and we want to know in which - # subsystem(s) a developer is active - def setSubsysNames(self, subsys_names): - self.subsys_names = subsys_names - - def getSubsysNames(self): - return self.subsys_names - def _cleanName(self, name): # Remove or replace characters in names that are known # to cause parsing problems in later stages @@ -240,7 +229,7 @@ def getPersonID(self, addr): # Construct a local instance of PersonInfo for the contributor # if it is not yet available if ID not in self.persons: - self.persons[ID] = PersonInfo(self.subsys_names, ID, name, email) + self.persons[ID] = PersonInfo(ID, name, email) return ID @@ -279,3 +268,77 @@ def getPersonFromDB(self, person_id): return (jsond) + +class csvIdManager(idManager): + """Provide unique IDs for developers. + + This class provides an interface to CSV id files. + """ + def __init__(self, conf): + super().__init__() + + # CSV file containing the IDs + self.csv_file = conf["csvFile"] + self.csv_sep = conf["csvSeparator"] + self.df = self._verifyCsvFile() + + def _verifyCsvFile(self): + with open(self.csv_file, "r") as file: + df = pandas.read_csv(file, sep=self.csv_sep, names=['ID', 'name', 'email']) + return df + + def _addRow(self, name, email): + + # determine next ID + max_id = self.df['ID'].max() + next_id = 0 if bool(pandas.isna(max_id)) else int(max_id) + 1 + + # append new row + self.df = self.df._append({ + 'ID': next_id, + 'name': name, + 'email': email + }, ignore_index=True) + + # dump df to file + file = open(self.csv_file, "w") + self.df.to_csv(file, sep=self.csv_sep, index=False, header=False) + + return next_id + + def _query_user_id(self, name, email): + """Query the ID csv file for a contributor ID""" + + # no name is okay, but no email is not + if not email: + return -1 + + # Match by name and email. 
+ # Disregard random string after "could.not.resolve@" in email + # to avoid creating multiple entries for the same person. + if email.startswith("could.not.resolve@"): + rows = self.df[(self.df['name'] == name) & + (self.df['email'].str.startswith("could.not.resolve@"))] + else: + rows = self.df[(self.df['name'] == name) & + (self.df['email'] == email)] + + if len(rows) == 0: + name = '' if not name else name + return self._addRow(name, email) + + elif len(rows) == 1: + return int(rows['ID'].values[0]) + + else: + raise Exception("Constructed author list is in invalid format. Duplicate entries found") + + def getPersonFromDB(self, person_id): + """Get a PersonInfo instance from the database by ID.""" + if person_id not in self.persons: + rows = self.df[self.df['ID'] == person_id] + if len(rows) == 1: + name = rows['name'].values[0] + email = rows['email'].values[0] + self.persons[person_id] = PersonInfo(person_id, name, email) + return self.persons.get(person_id, None) From 202a188a724548ff7db1ed274ae156efe29e0bad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sat, 30 Aug 2025 17:12:54 +0200 Subject: [PATCH 06/13] Adapt usage of codeface fragments in all modules MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 9 ++-- .../author_postprocessing.py | 8 +-- bot_processing/bot_processing.py | 11 ++-- codeface_extraction/codeface_extraction.py | 4 +- codeface_extraction/extractions.py | 3 +- issue_processing/issue_processing.py | 43 ++++++++-------- issue_processing/jira_issue_processing.py | 51 ++++++++++--------- mbox_parsing/mbox_parsing.py | 22 ++++---- 8 files changed, 82 insertions(+), 69 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index b11ef52..fdddebc 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -29,15 +29,14 @@ import sys from os import 
path, walk, makedirs from os.path import abspath -from shutil import copy - -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from logging import getLogger +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +log = getLogger(__name__) + ## # RUN POSTPROCESSING ## diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index c712ac6..f3b0ca9 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -42,14 +42,14 @@ from os import path, walk, makedirs from os.path import abspath from shutil import copy +from logging import getLogger -from codeface.cli import log -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager - +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +log = getLogger(__name__) + ## # RUN POSTPROCESSING ## diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 43ff492..14bdd56 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -25,11 +25,14 @@ import urllib.request, urllib.parse, urllib.error import operator -from codeface.cli import log -from codeface.configuration import Configuration +from logging import getLogger +from codeface_utils.configuration import Configuration from csv_writer import csv_writer + +log = getLogger(__name__) + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-bots-github', description='Codeface extraction') @@ -75,7 +78,7 @@ def load_bot_data(bot_file, header = True): :return: the read bot data """ - log.devinfo("Read bot data from file '{}'...".format(bot_file)) + log.info("Read bot data from file '{}'...".format(bot_file)) # check if file exists and exit early if not if not os.path.exists(bot_file): @@ -99,7 +102,7 @@ def load_user_data(user_data_file): :return: the read user data """ - log.devinfo("Read user data from file '{}'...".format(user_data_file)) + log.info("Read user data from file '{}'...".format(user_data_file)) # check if file exists and exit early if not if not os.path.exists(user_data_file): diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index d5df6a0..62b36ef 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -27,8 +27,8 @@ from . import extractions from csv_writer import csv_writer -from codeface.dbmanager import DBManager -from codeface.configuration import Configuration +from codeface_utils.dbmanager import DBManager +from codeface_utils.configuration import Configuration # create logger log = getLogger(__name__) diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index edeefda..b294be2 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -30,7 +30,8 @@ from ftfy import fix_encoding from email.header import decode_header, make_header -from codeface.util import gen_range_path +from codeface_utils.util import gen_range_path + log = getLogger(__name__) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index ff80d53..adbee53 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -30,16 +30,19 @@ import sys import urllib.request, urllib.parse, urllib.error from datetime import datetime, 
timedelta +from logging import getLogger import operator -from codeface.cli import log -from codeface.cluster.idManager import idManager -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from codeface_utils.cluster.idManager import dbIdManager, csvIdManager +from codeface_utils.configuration import Configuration +from codeface_utils.dbmanager import DBManager from dateutil import parser as dateparser from csv_writer import csv_writer + +log = getLogger(__name__) + # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} @@ -95,7 +98,7 @@ def load(source_folder): """ srcfile = os.path.join(source_folder, "issues.json") - log.devinfo("Loading Github issues from file '{}'...".format(srcfile)) + log.info("Loading Github issues from file '{}'...".format(srcfile)) # check if file exists and exit early if not if not os.path.exists(srcfile): @@ -232,7 +235,7 @@ def reformat_issues(issue_data): :return: the re-arranged issue data """ - log.devinfo("Re-arranging Github issues...") + log.info("Re-arranging Github issues...") # re-process all issues for issue in issue_data: @@ -670,10 +673,13 @@ def insert_user_data(issues, conf, resdir): user_id_buffer = dict() # create buffer for usernames (key: username) username_id_buffer = dict() - # open database connection - dbm = DBManager(conf) - # open ID-service connection - idservice = idManager(dbm, conf) + + # connect to ID service + if conf["useCsv"]: + idservice = csvIdManager(conf) + else: + dbm = DBManager(conf) + idservice = dbIdManager(dbm, conf) def get_user_string(name, email): if not email or email is None: @@ -683,26 +689,23 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - username = str(user["username"]).encode("utf-8") # fix encoding for name and 
e-mail address - if user["name"] is not None: - name = str(user["name"]).encode("utf-8") - else: - name = username - mail = str(user["email"]).encode("utf-8") + name = user["name"] if "name" in user else str(user["username"]) + mail = user["email"] # empty + # construct string for ID service and send query user_string = get_user_string(name, mail) # check buffer to reduce amount of DB queries if user_string in buffer_db_ids: - log.devinfo("Returning person id for user '{}' from buffer.".format(user_string)) + log.info("Returning person id for user '{}' from buffer.".format(user_string)) if username is not None: buffer_usernames[username] = buffer_db_ids[user_string] return buffer_db_ids[user_string] # get person information from ID service - log.devinfo("Passing user '{}' to ID service.".format(user_string)) + log.info("Passing user '{}' to ID service.".format(user_string)) idx = idservice.getPersonID(user_string) # add user information to buffer @@ -719,11 +722,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # check whether user information is in buffer to reduce amount of DB queries if idx in buffer_db: - log.devinfo("Returning user '{}' from buffer.".format(idx)) + log.info("Returning user '{}' from buffer.".format(idx)) return buffer_db[idx] # get person information from ID service - log.devinfo("Passing user id '{}' to ID service.".format(idx)) + log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) user = dict() user["email"] = person["email1"] # column "email1" diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index fa3e826..5d763fa 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -29,15 +29,15 @@ import time import csv import json +from logging import getLogger from xml.dom.minidom import parse from datetime import datetime from dateutil import parser as dateparser -from codeface.cli import log -from 
codeface.cluster.idManager import idManager -from codeface.configuration import Configuration -from codeface.dbmanager import DBManager +from codeface_utils.cluster.idManager import dbIdManager, csvIdManager +from codeface_utils.configuration import Configuration +from codeface_utils.dbmanager import DBManager from csv_writer import csv_writer @@ -49,6 +49,9 @@ importlib.reload(sys) sys.setdefaultencoding("utf-8") + +log = getLogger(__name__) + # global counter for JIRA requests to make sure to not exceed the request limit jira_request_counter = 0 max_requests = 45000 # 50,000 JIRA requests per 24 hours are allowed @@ -173,7 +176,7 @@ def load_xml(source_folder, xml_file): """ srcfile = os.path.join(source_folder, xml_file) - log.devinfo("Loading issues from file '{}'...".format(srcfile)) + log.info("Loading issues from file '{}'...".format(srcfile)) try: # parse the xml-file @@ -373,7 +376,7 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): text = comment_x.firstChild if text is None: - log.warn("Empty comment in issue " + issue["id"]) + log.warning("Empty comment in issue " + issue["id"]) comment["text"] = "" else: comment["text"] = text.data @@ -441,7 +444,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): api_issue = jira_project.issue(issue["externalId"], expand="changelog") changelog = api_issue.changelog except JIRAError: - log.warn("JIRA Error: Changelog cannot be extracted for issue " + issue["externalId"] + ". History omitted!") + log.warning("JIRA Error: Changelog cannot be extracted for issue " + issue["externalId"] + ". 
History omitted!") changelog = None histories = list() @@ -479,7 +482,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: - log.warn("No author for history: " + str(change.id) + " created at " + str(change.created)) + log.warning("No author for history: " + str(change.id) + " created at " + str(change.created)) user = create_user("","","") history["author"] = merge_user_with_user_from_csv(user, persons) history["date"] = format_time(change.created) @@ -499,7 +502,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: - log.warn("No author for history: " + str(change.id) + " created at " + str(change.created)) + log.warning("No author for history: " + str(change.id) + " created at " + str(change.created)) user = create_user("","","") history["author"] = merge_user_with_user_from_csv(user, persons) history["date"] = format_time(change.created) @@ -591,10 +594,13 @@ def insert_user_data(issues, conf): user_buffer = dict() # create buffer for user ids (key: user string) user_id_buffer = dict() - # open database connection - dbm = DBManager(conf) - # open ID-service connection - idservice = idManager(dbm, conf) + + # connect to ID service + if conf["useCsv"]: + idservice = csvIdManager(conf) + else: + dbm = DBManager(conf) + idservice = dbIdManager(dbm, conf) def get_user_string(name, email): if not email or email is None: @@ -604,22 +610,21 @@ def get_user_string(name, email): return "{name} <{email}>".format(name=name, email=email) def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): + # fix encoding for name and e-mail address - if user["name"] is not None and user["name"] != "": - name = str(user["name"]).encode("utf-8") - else: - name = str(user["username"]).encode("utf-8") - mail = str(user["email"]).encode("utf-8") 
# empty + name = user["name"] if "name" in user else str(user["username"]) + mail = user["email"] + # construct string for ID service and send query user_string = get_user_string(name, mail) # check buffer to reduce amount of DB queries if user_string in buffer_db_ids: - log.devinfo("Returning person id for user '{}' from buffer.".format(user_string)) + log.info("Returning person id for user '{}' from buffer.".format(user_string)) return buffer_db_ids[user_string] # get person information from ID service - log.devinfo("Passing user '{}' to ID service.".format(user_string)) + log.info("Passing user '{}' to ID service.".format(user_string)) idx = idservice.getPersonID(user_string) # add user information to buffer @@ -632,11 +637,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # check whether user information is in buffer to reduce amount of DB queries if idx in buffer_db: - log.devinfo("Returning user '{}' from buffer.".format(idx)) + log.info("Returning user '{}' from buffer.".format(idx)) return buffer_db[idx] # get person information from ID service - log.devinfo("Passing user id '{}' to ID service.".format(idx)) + log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) user = dict() user["email"] = person["email1"] # column "email1" @@ -1021,7 +1026,7 @@ def find_first_existing(source_folder, filenames): log.error("Person files '{}' do not exist! 
Exiting early...".format(person_files)) sys.exit(-1) - log.devinfo("Loading person csv from file '{}'...".format(srcfile)) + log.info("Loading person csv from file '{}'...".format(srcfile)) with open(srcfile, "r") as f: person_data = csv.DictReader(f, delimiter=",", skipinitialspace=True) persons_by_username = {} diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index 92d8cb9..ef9ae41 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -28,18 +28,20 @@ import shutil import sys from os.path import abspath +from logging import getLogger -from codeface.cli import log -from codeface.configuration import Configuration from joblib import Parallel, delayed from whoosh import index # import create_in, open_dir, exists_in from whoosh.analysis import StandardAnalyzer from whoosh.fields import Schema, TEXT, ID from whoosh.qparser import QueryParser +from codeface_utils.configuration import Configuration from csv_writer import csv_writer +log = getLogger(__name__) + def __get_index(mbox, mbox_path, results_folder, schema, reindex): """Initialize the search index (and create it, if needed @@ -56,13 +58,13 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): index_path = os.path.join(results_folder, "mbox-index", os.path.basename(mbox_path)) # 1) if reindexing, remove the index folder if os.path.exists(index_path) and reindex: - log.devinfo("Removing index from path '{}'...".format(index_path)) + log.info("Removing index from path '{}'...".format(index_path)) shutil.rmtree(index_path) # 2) Check if we need to create the index for Whoosh full-text search - log.devinfo("Checking for index in results folder...") + log.info("Checking for index in results folder...") if (not os.path.exists(index_path)) or (not index.exists_in(index_path)): # 2.1) create index - log.devinfo("Creating index for text search in results folder.") + log.info("Creating index for text search in results folder.") os.makedirs(index_path) # 
create path index.create_in(index_path, schema) # initialize as index path ix = index.open_dir(index_path) # open as index path @@ -71,10 +73,10 @@ def __get_index(mbox, mbox_path, results_folder, schema, reindex): for message in mbox: writer.add_document(messageID=str(message['message-id']), content=__mbox_getbody(message)) writer.commit() - log.devinfo("Index created, parsing will begin now.") + log.info("Index created, parsing will begin now.") else: # 2.2) load index - log.devinfo("Index has already been created, parsing will begin right away.") + log.info("Index has already been created, parsing will begin right away.") ix = index.open_dir(index_path) return ix @@ -131,8 +133,8 @@ def __mbox_getbody(message): body = message.get_payload(decode=True) if body is None: - log.devinfo(message.get_content_type()) - log.devinfo( + log.info(message.get_content_type()) + log.info( "An image or some other content has been found that cannot be indexed. Message is given an empty body.") body = ' ' @@ -149,7 +151,7 @@ def __parse_execute(artifact, schema, my_index, include_filepath): :return: a match list of tuples (file name, artifact, message ID) """ - log.devinfo("Searching for artifact ({}, {})...".format(artifact[0], artifact[1])) + log.info("Searching for artifact ({}, {})...".format(artifact[0], artifact[1])) result = [] From 9df79c0b67742378795325effeaa511d4e34d25c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Sat, 30 Aug 2025 17:14:16 +0200 Subject: [PATCH 07/13] Increase consistency with python3 conventions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 10 +++--- .../author_postprocessing.py | 24 +++++++------- bot_processing/bot_processing.py | 8 ++--- codeface_utils/cluster/idManager.py | 33 ++++++++++--------- codeface_utils/configuration.py | 21 ++++++------ codeface_utils/dbmanager.py | 11 +++----
csv_writer/csv_writer.py | 7 ++-- issue_processing/issue_processing.py | 23 ++++++------- issue_processing/jira_issue_processing.py | 9 ++--- 9 files changed, 67 insertions(+), 79 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index fdddebc..bcd76b1 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -103,13 +103,13 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # Don't anonymize the deleted user as this one might be needed for filtering (but add it to the dictionary) if orig_author == "Deleted user" and orig_email == "ghost@github.com": - if not (orig_author, orig_email) in author_to_anonymized_author: + if (orig_author, orig_email) not in author_to_anonymized_author: author_to_anonymized_author[(orig_author, orig_email)] = (orig_author, orig_email) else: # check whether (name, e-mail) pair isn't already present in the dictionary - if not (orig_author, orig_email) in author_to_anonymized_author: + if (orig_author, orig_email) not in author_to_anonymized_author: # check if just the name (without e-mail address) isn't already present in the dictionary - if not orig_author in author_to_anonymized_author: + if orig_author not in author_to_anonymized_author: # if the author has an empty name, only anonymize their e-mail address if not author[1] == "": author[1] = ("developer" + str(i)) @@ -140,7 +140,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # Check for all files in the result directory of the project whether they need to be anonymized - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Anonymize authors lists if authors_list in filenames: @@ -169,7 +169,7 @@ def anonymize_authors(author_data, i, author_to_anonymized_author, name_only = F # anonymize authors author_data, i, author_to_anonymized_author = \ anonymize_authors(author_data, i, 
author_to_anonymized_author) - + author_data_gender, i_gender, author_to_anonymized_author_gender = \ anonymize_authors(author_data_gender, i_gender, author_to_anonymized_author_gender, name_only = True) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index f3b0ca9..4ec0600 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -67,7 +67,7 @@ def perform_data_backup(results_path, results_path_backup): log.info("Backup folder already exists. No backup is to be performed.") return - for filepath, dirnames, filenames in walk(results_path): + for filepath, _, filenames in walk(results_path): for filename in filenames: if filename.endswith(".list"): current_file = path.join(filepath, filename) @@ -119,7 +119,7 @@ def is_github_noreply_author(name, email): # Check for all files in the result directory of the project whether they need to be adjusted - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Remove author 'GitHub ' from authors list if authors_list in filenames: @@ -148,7 +148,7 @@ def is_github_noreply_author(name, email): if not is_github_noreply_author(email[0], email[1]): email_data_new.append(email) else: - log.warn("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) + log.warning("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) csv_writer.write_to_csv(f, email_data_new) @@ -198,19 +198,19 @@ def is_github_noreply_author(name, email): # ignore merge commits in the commit data, we consistently ignore them also if they are added # to a pull request. Hence, the corresponding "commit_added" event will be removed now (i.e., # not added to the new issue data any more). - log.warn("Commit %s is added in the GitHub issue data, but not part of the commit data. 
" + - "Remove the corresponding 'commit_added' event from the issue data...", commit_hash) + log.warning("Commit %s is added in the GitHub issue data, but not part of the commit data. " + + "Remove the corresponding 'commit_added' event from the issue data...", commit_hash) elif is_github_noreply_author(event[9], event[10]): # the event is authored by 'GitHub ', but is not a "commit_added" event, so we # neglect this event and remove it now (i.e., not add it to the new issue data any more). - log.warn("Event %s is authored by %s <%s>. Remove this event form the issue data...", - event[8], event[9], event[10]) + log.warning("Event %s is authored by %s <%s>. Remove this event form the issue data...", + event[8], event[9], event[10]) elif (is_github_noreply_author(event[12], event[13][1:-1]) and (event[8] == mentioned_event or event[8] == subscribed_event)): # the event references 'GitHub ', so we neglect this event and remove it now # (i.e., not add it to the new issue data any more). - log.warn("Event %s by %s <%s> references %s <%s>. Remove this event from the issue data...", - event[8], event[9], event[10], event[12], event[13]) + log.warning("Event %s by %s <%s> references %s <%s>. 
Remove this event from the issue data...", + event[8], event[9], event[10], event[12], event[13]) else: issue_data_new.append(event) @@ -229,7 +229,7 @@ def is_github_noreply_author(name, email): if not is_github_noreply_author(entry[0], entry[1]): bot_data_new.append(entry) else: - log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) + log.warning("Remove entry %s <%s> from bots list.", entry[0], entry[1]) csv_writer.write_to_csv(f, bot_data_new) @@ -285,7 +285,7 @@ def run_postprocessing(conf, resdir, backup_data): return # Check for all files in the result directory of the project whether they need to be adjusted - for filepath, dirnames, filenames in walk(data_path): + for filepath, _, filenames in walk(data_path): # (1) Adjust authors lists if authors_list in filenames: @@ -302,7 +302,7 @@ def run_postprocessing(conf, resdir, backup_data): for author in author_data: # keep author entry only if it should not be removed - if not author in author_data_to_remove: + if author not in author_data_to_remove: author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 14bdd56..00cf099 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -19,12 +19,8 @@ """ import argparse -import http.client import os import sys -import urllib.request, urllib.parse, urllib.error - -import operator from logging import getLogger from codeface_utils.configuration import Configuration @@ -200,7 +196,7 @@ def add_user_data(bot_data, user_data, known_bots_file): bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: - log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) + log.warning("User '{}' in bot data does not occur in GitHub user data. 
Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) @@ -227,7 +223,7 @@ def print_to_disk(bot_data, results_folder): user["user"]["email"], user["prediction"] ) - if not entry in lines: + if entry not in lines: lines.append(entry) # write to output file diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index b319511..52cd04b 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -18,21 +18,22 @@ from __future__ import absolute_import import re from email.utils import parseaddr -from logging import getLogger; log = getLogger(__name__) -import six.moves.http_client -import six.moves.urllib.request, six.moves.urllib.parse, six.moves.urllib.error +from logging import getLogger +import http.client as http_client +import urllib.parse as urlparse import json import string import random import time -from ..util import encode_as_utf8 -from six.moves import range from abc import ABC, abstractmethod import pandas from codeface_utils.cluster.PersonInfo import PersonInfo +from ..util import encode_as_utf8 +log = getLogger(__name__) + class idManager(ABC): def __init__(self): @@ -130,7 +131,7 @@ def _decompose_addr(self, addr): # print("Fixup for email required, but FAILED for {0}".format(addr)) name = addr rand_str = "".join(random.choice(string.ascii_lowercase + string.digits) - for i in range(10)) + for _ in range(10)) email = "could.not.resolve@" + rand_str email = email.lower() @@ -153,7 +154,7 @@ def __init__(self, dbm, conf): self._idMgrServer = conf["idServiceHostname"] self._idMgrPort = conf["idServicePort"] - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) # Create a project ID self._dbm = 
dbm @@ -172,9 +173,9 @@ def _query_user_id(self, name, email): """Query the ID database for a contributor ID""" name = encode_as_utf8(name) - params = six.moves.urllib.parse.urlencode({'projectID': self._projectID, - 'name': name, - 'email': email}) + params = urlparse.urlencode({'projectID': self._projectID, + 'name': name, + 'email': email}) try: self._conn.request("POST", "/post_user_id", params, self.headers) @@ -184,9 +185,9 @@ def _query_user_id(self, name, email): successful = False while (retryCount <= 10 and not successful): log.warning("Could not reach ID service. Try to reconnect " \ - "(attempt {}).".format(retryCount)); + "(attempt {}).".format(retryCount)) self._conn.close() - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) time.sleep(60) #self._conn.ping(True) try: @@ -222,7 +223,7 @@ def getPersonID(self, addr): """ (name, email) = self._decompose_addr(addr) - if not (name, email) in self._cache: + if (name, email) not in self._cache: self._cache[(name, email)] = self._query_user_id(name, email) ID = self._cache[(name, email)] @@ -241,14 +242,14 @@ def getPersonFromDB(self, person_id): res = self._conn.getresponse() except: self._conn.close() - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) retryCount = 0 successful = False while (retryCount <= 10 and not successful): log.warning("Could not reach ID service. 
Try to reconnect " \ - "(attempt {}).".format(retryCount)); + "(attempt {}).".format(retryCount)) self._conn.close() - self._conn = six.moves.http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) + self._conn = http_client.HTTPConnection(self._idMgrServer, self._idMgrPort) time.sleep(60) #self._conn.ping(True) try: diff --git a/codeface_utils/configuration.py b/codeface_utils/configuration.py index d43f8d4..9b437b6 100644 --- a/codeface_utils/configuration.py +++ b/codeface_utils/configuration.py @@ -20,16 +20,15 @@ ''' from __future__ import absolute_import -import yaml +from tempfile import NamedTemporaryFile from collections.abc import Mapping -from six.moves import range from logging import getLogger +import yaml from codeface_utils.linktype import LinkType log = getLogger(__name__) -from tempfile import NamedTemporaryFile class ConfigurationError(Exception): '''Raised if any part of the configuration is malformed''' @@ -67,19 +66,19 @@ def __init__(self): self._conf_file_loc = None @classmethod - def load(self, global_conffile, local_conffile=None): + def load(cls, global_conffile, local_conffile=None): ''' Load configuration from global/local files ''' c = Configuration() log.info("Loading global configuration file '{}'". format(global_conffile)) - self._global_conf = c._load(global_conffile) + cls._global_conf = c._load(global_conffile) c._conf.update(c._global_conf) if local_conffile: log.info("Loading project configuration file '{}'". 
format(local_conffile)) - self._project_conf = c._load(local_conffile) + cls._project_conf = c._load(local_conffile) c._conf.update(c._project_conf) else: log.info("Not loading project configuration file!") @@ -103,7 +102,7 @@ def _load(self, filename): def _initialize(self): '''Infer missing values in the configuration''' if "rcs" not in self: - self._conf["rcs"] = [None for x in range(len(self["revisions"]))] + self._conf["rcs"] = [None for _ in range(len(self["revisions"]))] if "mailinglists" not in self: self._conf["mailinglists"] = [] @@ -132,12 +131,12 @@ def _check_sanity(self): raise ConfigurationError('Invalid configuration key.') for key in self.GLOBAL_KEYS + self.PROJECT_KEYS: - if not key in self: + if key not in self: log.critical("Required key '{}' missing in configuration!" ''.format(key)) raise ConfigurationError('Missing configuration key.') - if not self['tagging'] in LinkType.get_all_link_types(): + if self['tagging'] not in LinkType.get_all_link_types(): log.critical('Unsupported tagging mechanism specified!') raise ConfigurationError('Unsupported tagging mechanism.') @@ -151,11 +150,11 @@ def _check_sanity(self): raise ConfigurationError('Malformed configuration.') if self["useCsv"]: - if not "csvFile" in self: + if "csvFile" not in self: log.critical("Malformed configuration: useCsv is true, but " "csvFile is not specified.") raise ConfigurationError('Malformed configuration.') - if not "csvSeparator" in self: + if "csvSeparator" not in self: self["csvSeparator"] = "," unknown_keys = [k for k in self if k not in self.ALL_KEYS] diff --git a/codeface_utils/dbmanager.py b/codeface_utils/dbmanager.py index 4f8895d..8170d12 100644 --- a/codeface_utils/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -21,11 +21,10 @@ from __future__ import print_function import MySQLdb as mdb import time -from datetime import datetime -from logging import getLogger; +from datetime import datetime, timezone +from logging import getLogger from contextlib import 
contextmanager -from six.moves import range -from six.moves import zip + # create logger log = getLogger(__name__) @@ -57,7 +56,7 @@ def __init__(self, conf): # self.doExec("SET GLOBAL max_allowed_packet=%s", (max_packet_size,)) def __del__(self): - if self.con != None: + if self.con is not None: self.con.close() def __openConnection(self, conf): @@ -478,4 +477,4 @@ def update_release_timeline(self, project, tagging, revs, rcs, def tstamp_to_sql(tstamp): """Convert a Unix timestamp into an SQL compatible DateTime string""" - return (datetime.utcfromtimestamp(tstamp).strftime("%Y-%m-%d %H:%M:%S")) + return datetime.fromtimestamp(tstamp, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S") diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index b41463a..fac9205 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -45,14 +45,13 @@ def write_to_csv(file_path, lines, append=False): :param append: Flag if lines shall be appended to file or overwrite file """ - open_mode = "a+b" if append else "wb" + open_mode = "a" if append else "w" - with open(file_path, open_mode) as csv_file: + with open(file_path, mode=open_mode, encoding="utf-8") as csv_file: wr = csv.writer(csv_file, delimiter=';', lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC) # encode in proper UTF-8 before writing to file for line in lines: - line_encoded = __encode(line) - wr.writerow(line_encoded) + wr.writerow(line) def read_from_csv(file_path, delimiter=";"): """ diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index adbee53..0274bd8 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -24,15 +24,12 @@ """ import argparse -import http.client import json import os import sys -import urllib.request, urllib.parse, urllib.error from datetime import datetime, timedelta from logging import getLogger -import operator from codeface_utils.cluster.idManager import dbIdManager, csvIdManager from 
codeface_utils.configuration import Configuration from codeface_utils.dbmanager import DBManager @@ -194,7 +191,7 @@ def lookup_user(user_dict, user): user["email"] is None or user["email"] == ""): # lookup user only if username is not None and not empty - if not user["username"] is None and not user["username"] == "": + if user["username"] is not None and not user["username"] == "": user = user_dict[user["username"]] return user @@ -213,8 +210,8 @@ def update_user_dict(user_dict, user): if user is None: user = create_deleted_user() - if not user["username"] in list(user_dict.keys()): - if not user["username"] is None and not user["username"] == "": + if user["username"] not in list(user_dict.keys()): + if user["username"] is not None and not user["username"] == "": user_dict[user["username"]] = user else: user_in_dict = user_dict[user["username"]] @@ -425,7 +422,7 @@ def merge_issue_events(issue_data): # add dismissal comments to the list of comments for event in issue["eventsList"]: - if (event["event"] == "review_dismissed" and not event["dismissalMessage"] is None + if (event["event"] == "review_dismissed" and event["dismissalMessage"] is not None and not event["dismissalMessage"] == ""): dismissalComment = dict() dismissalComment["event"] = "commented" @@ -503,7 +500,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # updates all the issues by the temporarily stored referenced_by events - for key, value in issue_data_to_update.items(): + for _, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] @@ -538,7 +535,7 @@ def reformat_events(issue_data): users = update_user_dict(users, event["user"]) # 3) add or update users which are ref_target of the current event - if not event["ref_target"] is None and not event["ref_target"] == "": + if event["ref_target"] is not None and not 
event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) # as the user dictionary is created, start re-formating the event information of all issues @@ -639,7 +636,7 @@ def reformat_events(issue_data): event["event_info_1"] = issue["state_new"] event["event_info_2"] = issue["resolution"] - elif event["event"] == "referenced" and not event["commit"] is None: + elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits # as they are handled as referenced commit events_to_remove.append(event) @@ -729,9 +726,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) user = dict() - user["email"] = person["email1"] # column "email1" - user["name"] = person["name"] # column "name" - user["id"] = person["id"] # column "id" + user["email"] = person.getEmail() # column "email1" + user["name"] = person.getName() # column "name" + user["id"] = person.getID() # column "id" # add user information to buffer buffer_db[idx] = user diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 5d763fa..032bb86 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -26,13 +26,11 @@ import argparse import os import sys -import time import csv import json from logging import getLogger from xml.dom.minidom import parse -from datetime import datetime from dateutil import parser as dateparser from codeface_utils.cluster.idManager import dbIdManager, csvIdManager @@ -47,7 +45,6 @@ import importlib importlib.reload(sys) -sys.setdefaultencoding("utf-8") log = getLogger(__name__) @@ -294,7 +291,7 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): resolved = issue_x.getElementsByTagName("resolved") issue["resolveDate"] = "" - if (len(resolved) > 0) and (not resolved[0] is None): + if (len(resolved) > 0) 
and (resolved[0] is not None): resolveDate = resolved[0].firstChild.data issue["resolveDate"] = format_time(resolveDate) @@ -1032,10 +1029,10 @@ def find_first_existing(source_folder, filenames): persons_by_username = {} persons_by_name = {} for row in person_data: - if not row["AuthorID"] in list(persons_by_username.keys()): + if row["AuthorID"] not in list(persons_by_username.keys()): author_id_utf8 = str(row["AuthorID"]).encode("utf-8") persons_by_username[author_id_utf8] = (row["AuthorName"], row["userEmail"]) - if not row["AuthorName"] in list(persons_by_name.keys()): + if row["AuthorName"] not in list(persons_by_name.keys()): author_name_utf8 = str(row["AuthorName"]).encode("utf-8") persons_by_name[author_name_utf8] = (row["AuthorName"], row["userEmail"]) From e9372fede49d86248701e1fd5536b0ae0a633838 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 18 Sep 2025 16:20:16 +0200 Subject: [PATCH 08/13] Update copyright headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- anonymization/anonymization.py | 1 + .../author_postprocessing.py | 1 + bot_processing/bot_processing.py | 1 + codeface_extraction/codeface_extraction.py | 1 + codeface_extraction/extractions.py | 1 + codeface_utils/cluster/PersonInfo.py | 9 +++-- codeface_utils/cluster/idManager.py | 12 +++++-- codeface_utils/configuration.py | 11 ++++-- codeface_utils/dbmanager.py | 1 + codeface_utils/linktype.py | 35 ++++++++++--------- codeface_utils/util.py | 35 +++++++++++-------- csv_writer/csv_writer.py | 1 + issue_processing/issue_processing.py | 1 + issue_processing/jira_issue_processing.py | 2 +- mbox_parsing/mbox_parsing.py | 1 + 15 files changed, 72 insertions(+), 41 deletions(-) diff --git a/anonymization/anonymization.py b/anonymization/anonymization.py index bcd76b1..251675b 100644 --- a/anonymization/anonymization.py +++ b/anonymization/anonymization.py @@ -15,6 +15,7 @@ # Copyright 
2015-2017 by Claus Hunsen # Copyright 2021 by Thomas Bock # Copyright 2022 by Christian Hechtl +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to anonymize authors and issue titles after the extraction from the Codeface database was performed. diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 4ec0600..53caeb2 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 00cf099..a4b56c5 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. diff --git a/codeface_extraction/codeface_extraction.py b/codeface_extraction/codeface_extraction.py index 62b36ef..05d42c5 100644 --- a/codeface_extraction/codeface_extraction.py +++ b/codeface_extraction/codeface_extraction.py @@ -15,6 +15,7 @@ # Copyright 2015-2017 by Claus Hunsen # Copyright 2016, 2018-2019 by Thomas Bock # Copyright 2018 by Barbara Eckl +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract developer--artifact relations from the Codeface database. 
diff --git a/codeface_extraction/extractions.py b/codeface_extraction/extractions.py index b294be2..9c636dd 100644 --- a/codeface_extraction/extractions.py +++ b/codeface_extraction/extractions.py @@ -17,6 +17,7 @@ # Copyright 2019, 2021 by Thomas Bock # Copyright 2018 by Barbara Eckl # Copyright 2018 by Tina Schuh +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file provides the class 'Extraction' and all of its subclasses. diff --git a/codeface_utils/cluster/PersonInfo.py b/codeface_utils/cluster/PersonInfo.py index 5884108..69fe1bb 100644 --- a/codeface_utils/cluster/PersonInfo.py +++ b/codeface_utils/cluster/PersonInfo.py @@ -1,6 +1,6 @@ -# This file is part of Codeface. Codeface is free software: you can -# redistribute it and/or modify it under the terms of the GNU General Public -# License as published by the Free Software Foundation, version 2. +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS @@ -14,6 +14,9 @@ # Copyright 2010, 2011 by Wolfgang Mauerer # Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer # All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/cluster/PersonInfo.py from __future__ import absolute_import diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 52cd04b..132ae28 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -1,6 +1,6 @@ -# This file is part of Codeface. 
Codeface is free software: you can -# redistribute it and/or modify it under the terms of the GNU General Public -# License as published by the Free Software Foundation, version 2. +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS @@ -13,7 +13,13 @@ # # Copyright 2010, 2011 by Wolfgang Mauerer # Copyright 2012, 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/cluster/idManager.py +# We inherit the 'idManager' and 'dbIdManager' classes from codeface. +# The 'csvManager' class is original. from __future__ import absolute_import import re diff --git a/codeface_utils/configuration.py b/codeface_utils/configuration.py index 9b437b6..78bdc6f 100644 --- a/codeface_utils/configuration.py +++ b/codeface_utils/configuration.py @@ -1,6 +1,6 @@ -# This file is part of Codeface. Codeface is free software: you can -# redistribute it and/or modify it under the terms of the GNU General Public -# License as published by the Free Software Foundation, version 2. +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. 
# # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS @@ -12,7 +12,12 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Copyright 2013 by Siemens AG, Johannes Ebke +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/configuration.py + ''' Configuration module for codeface diff --git a/codeface_utils/dbmanager.py b/codeface_utils/dbmanager.py index 8170d12..aecc172 100644 --- a/codeface_utils/dbmanager.py +++ b/codeface_utils/dbmanager.py @@ -13,6 +13,7 @@ # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. # Thin sql database wrapper diff --git a/codeface_utils/linktype.py b/codeface_utils/linktype.py index db08a80..c9b0dfb 100644 --- a/codeface_utils/linktype.py +++ b/codeface_utils/linktype.py @@ -1,19 +1,22 @@ -## This file is part of Codeface. Codeface is free software: you can -## redistribute it and/or modify it under the terms of the GNU General Public -## License as published by the Free Software Foundation, version 2. -## -## This program is distributed in the hope that it will be useful, but WITHOUT -## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -## details. -## -## You should have received a copy of the GNU General Public License -## along with this program; if not, write to the Free Software -## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -## -## Copyright 2013 by Siemens AG, Wolfgang Mauerer -## Copyright 2014 by Matthias Dittrich -## All Rights Reserved. 
+# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2014 by Matthias Dittrich +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/linktype.py __author__ = 'drag0on' diff --git a/codeface_utils/util.py b/codeface_utils/util.py index d859dcf..82ad09c 100644 --- a/codeface_utils/util.py +++ b/codeface_utils/util.py @@ -1,18 +1,23 @@ -## This file is part of Codeface. Codeface is free software: you can -## redistribute it and/or modify it under the terms of the GNU General Public -## License as published by the Free Software Foundation, version 2. -## -## This program is distributed in the hope that it will be useful, but WITHOUT -## ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -## FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -## details. -## -## You should have received a copy of the GNU General Public License -## along with this program; if not, write to the Free Software -## Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -## -## Copyright 2013 by Siemens AG, Wolfgang Mauerer -## All Rights Reserved. 
+# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +# details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Copyright 2013 by Siemens AG, Wolfgang Mauerer +# Copyright 2025 by Maximilian Löffler +# All Rights Reserved. +# +# The code in this file originates from: +# https://github.com/siemens/codeface/blob/master/codeface/util.py + ''' Utility functions for running external commands ''' diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index fac9205..f950694 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -15,6 +15,7 @@ # Copyright 2017 by Claus Hunsen # Copyright 2018 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file provides the needed functions for standardized CSV writing diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 0274bd8..3b53cab 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract Github issue data from json files. 
diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 032bb86..bfc2214 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -17,7 +17,7 @@ # Copyright 2018 by Barbara Eckl # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock -# Copyright 2023 by Maximilian Löffler +# Copyright 2023, 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. diff --git a/mbox_parsing/mbox_parsing.py b/mbox_parsing/mbox_parsing.py index ef9ae41..be337fa 100644 --- a/mbox_parsing/mbox_parsing.py +++ b/mbox_parsing/mbox_parsing.py @@ -15,6 +15,7 @@ # Copyright 2017 by Raphael Nömmer # Copyright 2017-2019 by Claus Hunsen # Copyright 2018-2019 by Thomas Bock +# Copyright 2025 by Maximilian Löffler # All Rights Reserved. """ This file is able to extract artifact occurrences in e-mail within mbox files. From eef2c8ba66f4d32c6319c3a0bcdcd3eae5babec5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 18 Sep 2025 18:02:51 +0200 Subject: [PATCH 09/13] Remove unused code artifacts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- codeface_utils/cluster/PersonInfo.py | 57 --- codeface_utils/cluster/idManager.py | 47 +-- codeface_utils/linktype.py | 2 - codeface_utils/util.py | 562 +-------------------------- csv_writer/csv_writer.py | 13 - 5 files changed, 16 insertions(+), 665 deletions(-) delete mode 100644 codeface_utils/cluster/PersonInfo.py diff --git a/codeface_utils/cluster/PersonInfo.py b/codeface_utils/cluster/PersonInfo.py deleted file mode 100644 index 69fe1bb..0000000 --- a/codeface_utils/cluster/PersonInfo.py +++ /dev/null @@ -1,57 +0,0 @@ -# This file is part of codeface-extraction, which is free software: you -# can redistribute it and/or modify it under the terms of the GNU General -# Public 
License as published by the Free Software Foundation, version 2. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS -# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more -# details. -# -# You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -# -# Copyright 2010, 2011 by Wolfgang Mauerer -# Copyright 2012, 2013, Siemens AG, Wolfgang Mauerer -# All Rights Reserved. -# -# The code in this file originates from: -# https://github.com/siemens/codeface/blob/master/codeface/cluster/PersonInfo.py - -from __future__ import absolute_import - - -class PersonInfo: - """ Information about a commiter, and his relation to other commiters""" - - def __init__(self, ID=None, name="", email=""): - self.ID = ID - self.name = name - self.email = email - - def __str__(self): - return self.name + " <" + self.email + ">" - - def setID(self, ID): - self.ID = ID - def getID(self): - return self.ID - - def setName(self, name): - self.name = name - def getName(self): - if self.name == "": - return self.email - return self.name - - def setEmail(self, email): - self.email = email - def getEmail(self): - return self.email - - -############################ Test cases ######################### -if __name__ == "__main__": - personInfo = PersonInfo("sepp") - -# TODO: Implement a couple of test cases diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 132ae28..5e64c90 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -34,7 +34,6 @@ from abc import ABC, abstractmethod import pandas -from codeface_utils.cluster.PersonInfo import PersonInfo from ..util import encode_as_utf8 @@ -46,13 +45,6 @@ def __init__(self): # Cache identical requests to 
the server self._cache = {} - # Map IDs to an instance of PersonInfo - self.persons = {} - - # Map a name, email address, or a combination of both to the numeric ID - # assigned to the developer - self.person_ids = {} - self.fixup_emailPattern = re.compile(r'([^<]+)\s+<([^>]+)>') self.commaNamePattern = re.compile(r'([^,\s]+),\s+(.+)') @@ -65,31 +57,15 @@ def getPersonFromDB(self, person_id): pass def getPersonID(self, addr): - """Obtain a unique ID from contributor identity credentials. - - The IDs are managed by a central csv file. - Managing multiple identities for the same person is also - handled there. - """ + """Obtain a unique ID from contributor identity credentials.""" (name, email) = self._decompose_addr(addr) if (name, email) not in self._cache: self._cache[(name, email)] = self._query_user_id(name, email) ID = self._cache[(name, email)] - # Construct a local instance of PersonInfo for the contributor - # if it is not yet available - if (ID not in self.persons): - self.persons[ID] = PersonInfo(ID, name, email) - return ID - def getPersons(self): - return self.persons - - def getPI(self, ID): - return self.persons[ID] - def _cleanName(self, name): # Remove or replace characters in names that are known # to cause parsing problems in later stages @@ -219,27 +195,6 @@ def _query_user_id(self, name, email): return (id) - def getPersonID(self, addr): - """Obtain a unique ID from contributor identity credentials. - - The IDs are managed by a central database accessed via REST. - Managing multiple identities for the same person is also - handled there. Safety against concurrent access is provided by - the database. 
- """ - - (name, email) = self._decompose_addr(addr) - if (name, email) not in self._cache: - self._cache[(name, email)] = self._query_user_id(name, email) - ID = self._cache[(name, email)] - - # Construct a local instance of PersonInfo for the contributor - # if it is not yet available - if ID not in self.persons: - self.persons[ID] = PersonInfo(ID, name, email) - - return ID - def getPersonFromDB(self, person_id): """Query the ID database for a contributor and all corresponding data""" diff --git a/codeface_utils/linktype.py b/codeface_utils/linktype.py index c9b0dfb..617d11f 100644 --- a/codeface_utils/linktype.py +++ b/codeface_utils/linktype.py @@ -18,8 +18,6 @@ # The code in this file originates from: # https://github.com/siemens/codeface/blob/master/codeface/linktype.py -__author__ = 'drag0on' - #enum-like class to distinguish between the various #methods used to link individuals diff --git a/codeface_utils/util.py b/codeface_utils/util.py index 82ad09c..1a1d673 100644 --- a/codeface_utils/util.py +++ b/codeface_utils/util.py @@ -18,184 +18,19 @@ # The code in this file originates from: # https://github.com/siemens/codeface/blob/master/codeface/util.py -''' -Utility functions for running external commands -''' - from __future__ import absolute_import -import logging; log = logging.getLogger(__name__) +import logging import os import os.path import re -import shutil -import signal import sys import traceback import unicodedata -from collections import OrderedDict, namedtuple -from glob import glob -from math import sqrt -from multiprocessing import Process, Queue, JoinableQueue, Lock -from pickle import dumps, PicklingError -from importlib.resources import files -from subprocess import Popen, PIPE -from tempfile import NamedTemporaryFile, mkdtemp -from time import sleep from threading import enumerate as threading_enumerate -from six.moves.queue import Empty -from datetime import timedelta, datetime from ftfy import fix_encoding -from six.moves import map 
-import six -from six.moves import range -from six.moves import zip - -# Represents a job submitted to the batch pool. -BatchJobTuple = namedtuple('BatchJobTuple', ['id', 'func', 'args', 'kwargs', - 'deps', 'startmsg', 'endmsg']) -class BatchJob(BatchJobTuple): - def __init__(self, *args, **kwargs): - super(BatchJob, self).__init__(*args, **kwargs) - self.done = False - self.submitted = False - -class BatchJobPool(object): - ''' - Implementation of a dependency-respecting batch pool - - This system uses a pool of N worker processes to run jobs. Since the - multiprocessing module is used, all functions, args and kwargs must be - pickleable. Specifically, this means that only functions defined at - top-level in a module can be used here. - - Jobs can be created using pool.add(function, args, kwargs, deps=deps)) - where deps can be a list of job handles previously returned by - pool.add. If multiprocessing is disabled, the functions are run - immediately and None is returned. - - Call pool.join() to start execution and wait until all jobs are complete. - If a work item raises an exception, the join() will terminate with - that exception, if pickleable, or a generic Exception if otherwise. - ''' - def __init__(self, n_cores): - self.n_cores = n_cores - self.next_id = 1 - self.jobs = OrderedDict() # Dictionary of jobs (ordered for repeatability) - # Initialize workers and their work and done queues - self.work_queue, self.done_queues, self.workers = Queue(), [], [] - if n_cores > 1: - # When n_cores is 1 we doen't use the process anyway. - # However the pycharm debugger goes crasy when we start the - # process, so as a workaround don't start anything when - # n_core is 1. 
- for i in range(n_cores): - dq = Queue() - w = Process(target=batchjob_worker_function, args=(self.work_queue, dq)) - self.done_queues.append(dq) - self.workers.append(w) - w.start() - - def _is_ready(self, job): - '''Returns true if the job is ready for submission''' - if job.done or job.submitted: - return False - return all(self.jobs[j].done for j in job.deps if j is not None) - - def _submit(self, job): - '''Submit the job if it is ready''' - if self._is_ready(job): - self.work_queue.put(job) - job.submitted = True - - def add(self, func, args, kwargs={}, deps=(), startmsg=None, endmsg=None): - ''' - Add a job that executes func(*args, **kwargs) and depends on the - jobs with the ids listed in deps. - This function returns a job ID which can be used as a dependency - in other calls to add. - If n_cores is 1; this call immediately executes the given function - and returns None - ''' - if self.n_cores == 1: - log.info(startmsg) - func(*args, **kwargs) - log.info(endmsg) - return None - job_id = self.next_id - self.next_id += 1 - j = BatchJob(job_id, func, args, kwargs, deps, startmsg, endmsg) - self.jobs[job_id] = j - return job_id - - def join(self): - ''' - Submit jobs and wait for all jobs to finish. 
- ''' - try: - while not all(j.done for j in self.jobs.values()): - # Put jobs that are ready onto the work queue - for j in self.jobs.values(): - self._submit(j) - # Wait for a result from the done_queues - for dq in self.done_queues: - try: - res = dq.get(block=False) - except Empty: - continue - if res is None: - log.fatal("Uncaught exception in worker thread!") - raise Exception("Failure in Batch Pool") - if isinstance(res, Exception): - log.fatal("Uncaught exception in worker thread:") - raise res - log.debug("Job {} has finished!".format(res)) - self.jobs[res].done = True - # Check if workers died - for w in self.workers: - if not w.is_alive(): - w.join() - raise Exception("A Worker died unexpectedly!") - sleep(0.01) - finally: - # Terminate and join the workers - # Wait 100ms to allow backtraces to be logged - sleep(0.1) - log.info("Terminating workers...") - for w in self.workers: - w.terminate() - log.info("Workers terminated.") - -def batchjob_worker_function(work_queue, done_queue): - ''' - Worker function executed in a separate process. - This function pulls work items off the work queue; terminates if there - is no item for 0.5s; otherwise executes the work item. 
Any exception - is reraised after putting a None onto the done_queue (triggering an - exception in the main process) - ''' - # Silently quit on CTRL+C - signal.signal(signal.SIGINT, handle_sigint_silent) - while True: - try: - job = work_queue.get(block=True) - except ValueError as ve: - # This happens when the main loop stops before we do - return - log.debug("Starting job id {}".format(job.id)) - try: - if job.startmsg: - log.info(job.startmsg) - job.func(*job.args, **job.kwargs) - if job.endmsg: - log.info(job.endmsg) - log.debug("Finished work id {}".format(job.id)) - done_queue.put(job.id) - except Exception as e: - log.debug("Failed work id {}".format(job.id)) - done_queue.put(Exception(e.__class__.__name__ + ": " + - str(e) + "\n" + traceback.format_exc())) +log = logging.getLogger(__name__) # Function to dump the stacks of all threads def get_stack_dump(): @@ -210,228 +45,6 @@ def get_stack_dump(): code.append(" %s" % (line.strip())) return code -# Signal handler that dumps all stacks and terminates -# Lock l dis-interleaves the stack traces of processes -l = Lock() -def handle_sigint(signal, frame): - with l: - log.fatal("CTRL-C pressed!") - for c in get_stack_dump(): - log.info(c) - # This call raises a SystemExit exception in the - # stack frame that was interrupted by the signal - # For the main thread, this is what we want. - sys.exit(-1) - -# Signal handler that dumps all stacks and terminates silently -# Also uses the Lock l to dis-interleave the stack traces -def handle_sigint_silent(signal, frame): - with l: - for c in get_stack_dump(): - log.info(c) - logging.shutdown() - # Since we want to terminate worker threads with prejudice, - # we use os._exit, which directly terminates the process. - # otherwise the worker try/catch will also catch the SystemExit - os._exit(-1) - -def handle_sigterm(signal, frame): - # Since we want to terminate worker threads with prejudice, - # we use os._exit, which directly terminates the process. 
- # otherwise the worker try/catch will also catch the SystemExit - logging.shutdown() - os._exit(-1) - -def handle_sigusr1(signal, frame): - for c in get_stack_dump(): - log.info(c) - -# Dump all the stacks in case of CTRL-C -signal.signal(signal.SIGINT, handle_sigint) -# Also dump on sigterm -signal.signal(signal.SIGTERM, handle_sigterm) -# Also dump on sigusr1, but do not terminate -signal.signal(signal.SIGUSR1, handle_sigusr1) - -def execute_command(cmd, ignore_errors=False, direct_io=False, cwd=None, silent_errors=False): - ''' - Execute the command `cmd` specified as a list of ['program', 'arg', ...] - If ignore_errors is true, a non-zero exit code will be ignored (and a warning - messages will be issued), otherwise an exception is raised. If silent_errors is True, - no messages will be emitted even in case of an error (but exceptions will still be raised). - If direct_io is True, do not capture the stdin and stdout of the command. - Returns the stdout of the command. - ''' - jcmd = " ".join(cmd) - log.debug("Running command: {}".format(jcmd)) - try: - if direct_io: - pipe = Popen(cmd, cwd=cwd) - else: - pipe = Popen(cmd, stdout=PIPE, stderr=PIPE, cwd=cwd) - stdout, stderr = pipe.communicate() - except OSError: - log.error("Error executing command {}!".format(jcmd)) - raise - - if pipe.returncode != 0: - if ignore_errors: - if not(silent_errors): - log.warning("Command '{}' failed with exit code {}. Ignored.". - format(jcmd, pipe.returncode)) - else: - if not(direct_io) and not(silent_errors): - log.info("Command '{}' stdout:".format(jcmd)) - for line in stdout.splitlines(): - log.info(line) - log.info("Command '{}' stderr:".format(jcmd)) - for line in stderr.splitlines(): - log.info(line) - msg = "Command '{}' failed with exit code {}. 
\n" \ - "(stdout: {}\nstderr: {})"\ - .format(jcmd, pipe.returncode, stdout, stderr) - if not(silent_errors): - log.error(msg) - raise Exception(msg) - return stdout - -def _convert_dot_file(dotfile): - ''' - Convert duplicate edges in the given dot file into edges with - a larger pen width. - ''' - res = [] - edges = {} - edge_spec = re.compile(r"\s+(\d+) -> (\d+);") - - file = open(dotfile, "r") - lines = [line.strip("\n") for line in file] - # Modify the header (copyright line + digraph) - lines[0] = "digraph {" - lines[1] = "node[fontsize=30, shape=\"box\"];" - - lines[len(lines)-1] = "" # Skip closing brace - - for line in lines: - m = re.match(edge_spec, line) - if m: - a, b = m.group(1), m.group(2) - edges[(a,b)] = edges.get((a,b), 0) + 1 - else: - res.append(line + "\n") - - # sort the edges for reproducibility - for ((a, b), count) in sorted(edges.items()): - res.append("{0} -> {1} [weight={2} penwidth={3}];\n". - format(a,b,count, sqrt(float(count)))) - - res.append("overlap=prism;\n") - res.append("splines=true;\n") - res.append("}\n") - return res - -def layout_graph(filename): - out = NamedTemporaryFile(mode="w", delete=False) - out.writelines(_convert_dot_file(filename)) - out.close() # flushes the cache - cmd = [] - cmd.append("dot") - cmd.append("-Kfdp") - cmd.append("-Tpdf") - cmd.append("-Gcharset=utf-8") - cmd.append("-o{0}.pdf".format(os.path.splitext(filename)[0])) - cmd.append(out.name) - execute_command(cmd, ignore_errors=True) - # Manually remove the temporary file - os.unlink(out.name) - -def generate_report(start_rev, end_rev, resdir): - log.info(" -> Generating report") - report_base = "report-{0}_{1}".format(start_rev, end_rev) - - # Run perl script to generate report LaTeX file - cmd = [] - cmd.append(files(__package__).joinpath("perl/create_report.pl")) - cmd.append(resdir) - cmd.append("{0}--{1}".format(start_rev, end_rev)) - with open(os.path.join(resdir, report_base + ".tex"), 'w') as f: - f.write(execute_command(cmd)) - - # Compile 
report with lualatex - cmd = [] - cmd.append("lualatex") - cmd.append("-interaction=nonstopmode") - cmd.append(os.path.join(resdir, report_base + ".tex")) - - # We run latex in a temporary directory so that it's easy to - # get rid of the log files etc. created during the run that are - # not relevant for the final result - orig_wd = os.getcwd() - tmpdir = mkdtemp() - - os.chdir(tmpdir) - execute_command(cmd, ignore_errors=True) - try: - shutil.copy(report_base + ".pdf", resdir) - except IOError: - log.warning("Could not copy report PDF (missing input data?)") - - os.chdir(orig_wd) - shutil.rmtree(tmpdir) - -def generate_reports(start_rev, end_rev, range_resdir): - files = glob(os.path.join(range_resdir, "*.dot")) - log.info(" -> Generating Reports...") - for file in files: - layout_graph(file) - generate_report(start_rev, end_rev, range_resdir) - -def check4ctags(): - # check if the appropriate ctags is installed on the system - prog_name = 'Universal Ctags' - prog_version = 'Universal Ctags 5.9.0, Copyright (C) 2015 Universal Ctags Team' - cmd = "ctags-universal --version".split() - - res = execute_command(cmd, None) - - if not(res.startswith(prog_name)): - log.error("program '{0}' does not exist".format(prog_name)) - raise Exception("ctags-universal not found") - - if not(res.startswith(prog_version)): - # TODO: change this to use standard mechanism for error logging - log.error("Ctags version '{0}' not found".format(prog_version)) - raise Exception("Incompatible ctags-universal version") - - -def check4cppstats(): - """ - check if the appropriate cppstats is installed on the system. - """ - # We can not check the version directly as there is no version switch - # on cppstats We just check if the first line is OK. - line = "cppstats v0.9." 
- cmd = "/usr/bin/env cppstats --version".split() - res = execute_command(cmd) - if not (res.startswith(line)): - error_message = "expected the first line to start with '{0}' but "\ - "got '{1}'".format(line, res[0]) - log.error("program cppstats does not exist, or it is not working " - "as expected ({0}" - .format(error_message)) - raise Exception("no working cppstats found ({0})" - .format(error_message)) - - -def gen_prefix(i, num_ranges, start_rev, end_rev): - if (len(start_rev) == 40): - # When revisions are given by commit hashes, shorten them since - # they don't carry any meaning - start_rev = start_rev[0:6] - end_rev = end_rev[0:6] - return(" -> Revision range {0}/{1} ({2}..{3}): ".format(i, num_ranges, - start_rev, end_rev)) - def gen_range_path(base_path, i, start_rev, end_rev): if (len(start_rev) == 40): # Same logic as above, but construct a file system path @@ -440,127 +53,6 @@ def gen_range_path(base_path, i, start_rev, end_rev): return(os.path.join(base_path, "{0}--{1}-{2}". format(str(i).zfill(3), start_rev, end_rev))) - -def parse_iso_git_date(date_string): - # from http://stackoverflow.com/questions/526406/python-time-to-age-part-2-timezones - try: - offset = int(date_string[-5:]) - except: - log.error("could not extract timezone info from \"{0}\"" - .format(date_string)) - raise - minutes = (offset if offset > 0 else -offset) % 100 - delta = timedelta(hours=offset / 100, - minutes=minutes if offset > 0 else -minutes) - # In future python versions we can use "%Y-%m-%d %H:%M:%S %z" - # this way we don't need the above workaround, currently %z isn't - # working as documented - fmt = "%Y-%m-%d %H:%M:%S" - parsed_date = datetime.strptime(date_string[:-6], fmt) - parsed_date -= delta - return parsed_date - -# Determine settings for the size and amount of analysis windows. 
If nothing -# specific is provided, use default settings -def get_analysis_windows(conf): - window_size_months = 3 - num_window = -1 - - if "windowSize" in list(conf.keys()): - window_size_months = conf["windowSize"] - if "numWindows" in list(conf.keys()): - num_window = conf["numWindows"] - - return window_size_months, num_window - -def generate_analysis_windows(repo, window_size_months): - """ - Generates a list of revisions (commit hash) in increments of the window_size - parameter. The window_size parameter specifies the number of months between - revisions. This function is useful when the git repository has no tags - referencing releases. - """ - cmd_date = 'git --git-dir={0} show --format=%ad --date=iso8601'\ - .format(repo).split() - latest_date_result = execute_command(cmd_date).splitlines()[0] - latest_commit = parse_iso_git_date(latest_date_result) - - cmd_root_commit_dates = 'git --git-dir={0} log --max-parents=0 --format=%ad --date=iso8601'\ - .format(repo).split() - root_commit_dates_result = execute_command(cmd_root_commit_dates).splitlines() - earliest_root_commit_date = min([parse_iso_git_date(root_commit) for root_commit in root_commit_dates_result]) - - print_fmt = "%Y-%m-%dT%H:%M:%S+0000" - month = timedelta(days=30) - - def get_before_arg(num_months): - date = latest_commit - num_months * month - - # Due to a bug in git, broken author information in commit objects can lead to a timestamp of 0 when using the - # --before option although the dates themselves are not broken and can be parsed without problems. - # For more details, see the whole thread conversation on the git mailing list here: - # https://lore.kernel.org/git/7728e059-d58d-cce7-c011-fbc16eb22fb9@cs.uni-saarland.de/ - # To avoid running into an infinite while loop below (due to timestamps being 0), check if the date is earlier - # than the date of the earliest root commit and break if this is the case. 
- if date < earliest_root_commit_date: - raise ValueError("The before-arg date is earlier than the earliest commit in the repository.") - - return '--before=' + date.strftime(print_fmt) - - revs = [] - start = window_size_months # Window size time ago - end = 0 # Present time - cmd_base = 'git --git-dir={0} log --no-merges --format=%H,%ct,%ci'\ - .format(repo).split() - cmd_base_max1 = cmd_base + ['--max-count=1'] - cmd = cmd_base_max1 + [get_before_arg(end)] - rev_end = execute_command(cmd).splitlines() - revs.extend(rev_end) - - while start != end: - - try: - cmd = cmd_base_max1 + [get_before_arg(start)] - rev_start = execute_command(cmd).splitlines() - except ValueError as ve: - rev_start = [] - log.info("rev_start would be earlier than earliest root commit. Start at initial commit instead...") - - if len(rev_start) == 0: - start = end - #cmd = cmd_base + ['--reverse'] - #rev_start = [execute_command(cmd).splitlines()[0]] - cmd = cmd_base + ['--max-parents=0'] - rev_start = [execute_command(cmd).splitlines()[-1]] - else: - end = start - start = end + window_size_months - - # Check if any commits occurred since the last analysis window - if rev_start[0] != revs[0]: - revs = rev_start + revs - # else: no commit happened since last window, don't add duplicate - # revisions - # End while - - # Check that commit dates are monotonic, in some cases the earliest - # first commit does not carry the earliest commit date - revs = [rev.split(",") for rev in revs] - rev_len = len(revs) - if int(revs[0][1]) > int(revs[1][1]): - del revs[0] - - # Extract hash values and dates intro seperate lists - revs_hash = [rev[0] for rev in revs] - revs_date = [rev[2].split(" ")[0] for rev in revs] - - # We cannot detect release canndidate tags in this analysis mode, - # so provide a list with None entries - rcs = [None for x in range(len(revs))] - - return revs_hash, rcs, revs_date - - def encode_as_utf8(string): """ Encode the given string properly in UTF-8, @@ -573,27 +65,28 @@ def 
encode_as_utf8(string): :return: the UTF-8 encoded string of type str """ - try: - string = string.decode("utf-8") - except: - # if we have a string, we transform it to unicode - if isinstance(string, str): - string = six.text_type(string, "unicode-escape", errors="replace") - - ## maybe not a string/unicode at all, return rightaway - if not isinstance(string, six.text_type): + # Normalize to str first + if isinstance(string, bytes): + try: + text = string.decode("utf-8") + except UnicodeDecodeError: + text = string.decode("utf-8", errors="replace") + elif isinstance(string, str): + text = string + else: + # not string-like, return as-is return string # convert to real unicode-utf8 encoded string, fix_text ensures proper encoding - new_string = fix_encoding(string) + new_string = fix_encoding(text) # remove unicode characters from "Specials" block # see: https://www.compart.com/en/unicode/block/U+FFF0 - new_string = re.sub(r"\\ufff.", " ", new_string.encode("unicode-escape")) + new_string = re.sub(r"\ufff.", " ", new_string) # remove all kinds of control characters and emojis # see: https://www.fileformat.info/info/unicode/category/index.htm - new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string.decode("unicode-escape")) + new_string = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_string) new_string = new_string.encode("utf-8") @@ -610,28 +103,3 @@ def encode_as_utf8(string): return str(new_string) - -def encode_items_as_utf8(items): - """ - Encode the given list/tuple/dict of strings properly in UTF-8, - independent from its internal representation (str or unicode). - - This function uses encode_as_utf8(string) internally. 
- - :param string: any string - :return: the UTF-8 encoded string of type str - """ - - # unpack values if we have a dictionary - items_unpacked = items - if isinstance(items, dict): - items_unpacked = list(items.values()) - - # encode each item as UTF-8 properly - items_enc = list(map(encode_as_utf8, items_unpacked)) - - # add key for dict again - if isinstance(items, dict): - items_enc = dict(zip(list(items.keys()), items_enc)) - - return items_enc diff --git a/csv_writer/csv_writer.py b/csv_writer/csv_writer.py index f950694..ca453be 100644 --- a/csv_writer/csv_writer.py +++ b/csv_writer/csv_writer.py @@ -24,19 +24,6 @@ import csv -def __encode(line): - """Encode the given line (a tuple of columns) properly in UTF-8.""" - - lineres = () # re-encode column if it is unicode - for column in line: - if type(column) is str: - lineres += (column.encode("utf-8"),) - else: - lineres += (column,) - - return lineres - - def write_to_csv(file_path, lines, append=False): """ Write the given lines to the file with the given file path. 
From f6bc6a2c02159480b2a492a8c54e8e667eda2882 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Thu, 18 Sep 2025 18:50:21 +0200 Subject: [PATCH 10/13] Ensure string represenation of all user data in 'get_id_and_update_user' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- issue_processing/issue_processing.py | 7 ++++--- issue_processing/jira_issue_processing.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 3b53cab..1b5a823 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -688,9 +688,10 @@ def get_user_string(name, email): def get_id_and_update_user(user, buffer_db_ids=user_id_buffer, buffer_usernames=username_id_buffer): - # fix encoding for name and e-mail address - name = user["name"] if "name" in user else str(user["username"]) - mail = user["email"] # empty + # ensure string representation for name and e-mail address + username = str(user["username"]) + name = str(user["name"]) if "name" in user else username + mail = str(user["email"]) # construct string for ID service and send query user_string = get_user_string(name, mail) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index bfc2214..97f1711 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -608,9 +608,9 @@ def get_user_string(name, email): def get_id_and_update_user(user, buffer_db_ids=user_id_buffer): - # fix encoding for name and e-mail address - name = user["name"] if "name" in user else str(user["username"]) - mail = user["email"] + # ensure string representation for name and e-mail address + name = str(user["name"]) if "name" in user else str(user["username"]) + mail = str(user["email"]) # may be empty # construct string for ID service and 
send query user_string = get_user_string(name, mail) From 41822da520c61b4a0775d1427dbaa04c368bcd9b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maximilian=20L=C3=B6ffler?= Date: Tue, 30 Sep 2025 13:46:56 +0200 Subject: [PATCH 11/13] Use the same return format in all implementations of 'getPersonFromDB' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Maximilian Löffler --- codeface_utils/cluster/idManager.py | 15 +++++++-------- issue_processing/issue_processing.py | 9 +++++---- issue_processing/jira_issue_processing.py | 9 +++++---- 3 files changed, 17 insertions(+), 16 deletions(-) diff --git a/codeface_utils/cluster/idManager.py b/codeface_utils/cluster/idManager.py index 5e64c90..43a4be5 100644 --- a/codeface_utils/cluster/idManager.py +++ b/codeface_utils/cluster/idManager.py @@ -296,11 +296,10 @@ def _query_user_id(self, name, email): raise Exception("Constructed author list is in invalid format. Duplicate entries found") def getPersonFromDB(self, person_id): - """Get a PersonInfo instance from the database by ID.""" - if person_id not in self.persons: - rows = self.df[self.df['ID'] == person_id] - if len(rows) == 1: - name = rows['name'].values[0] - email = rows['email'].values[0] - self.persons[person_id] = PersonInfo(person_id, name, email) - return self.persons.get(person_id, None) + rows = self.df[self.df['ID'] == person_id] + if len(rows) == 1: + return { + 'name': rows['name'].values[0], + 'email1': rows['email'].values[0], + 'id': person_id + } diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1b5a823..013fd8c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -727,10 +727,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # get person information from ID service log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) - user = dict() - user["email"] = 
person.getEmail() # column "email1" - user["name"] = person.getName() # column "name" - user["id"] = person.getID() # column "id" + user = { + "name": person["name"], + "email": person["email1"], + "id": person["id"] + } # add user information to buffer buffer_db[idx] = user diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 97f1711..4974707 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -640,10 +640,11 @@ def get_user_from_id(idx, buffer_db=user_buffer): # get person information from ID service log.info("Passing user id '{}' to ID service.".format(idx)) person = idservice.getPersonFromDB(idx) - user = dict() - user["email"] = person["email1"] # column "email1" - user["name"] = person["name"] # column "name" - user["id"] = person["id"] # column "id" + user = { + "name": person["name"], + "email": person["email1"], + "id": person["id"] + } # add user information to buffer buffer_db[idx] = user From 9e33e834d63ebe729b1d2e0ce252d5ac5219281d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 14:46:36 +0100 Subject: [PATCH 12/13] Move changes from PR53 to python 3 Moved all prior changes. Needs testing. Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 164 +++++++++++++++++++++++---- 1 file changed, 145 insertions(+), 19 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 013fd8c..1e65d09 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -19,6 +19,7 @@ # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock # Copyright 2025 by Maximilian Löffler +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. 
@@ -42,7 +43,10 @@ log = getLogger(__name__) # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} + +# Copilot username to be assigned in specific copilot events +copilot_username = "Copilot" # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -75,13 +79,14 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -239,7 +244,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] @@ -264,6 +272,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet if issue["closed_at"] is None: issue["closed_at"] = "" @@ -280,10 +292,10 @@ def 
reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. @@ -294,6 +306,7 @@ def merge_issue_events(issue_data): log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -362,6 +375,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -477,6 +491,12 @@ def merge_issue_events(issue_data): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_username + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: @@ -489,6 +509,32 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in list(connected_events.keys()) and 
connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in list(connected_events.keys()) \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in list(connected_events.keys()) \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + # if there is no connected event yet at this timestamp, create a new entry for this event + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = event["user"] + connected_events[event["created_at"]] = connected_info + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -500,16 +546,53 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the 
external_connected_event dict later + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) + # updates all the issues by the temporarily stored referenced_by events for _, value in issue_data_to_update.items(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data - - -def reformat_events(issue_data): + return filtered_connected_events + +def filter_connected_events(key, value, external_connected_events): + num_issues = len(value["issues"]) + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurances.values(): + for sub_key, sub_value in occurances.items(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == (num_issues + 1)/2: + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # 
no other variants can be easily matched + return False + +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. @@ -539,6 +622,35 @@ def reformat_events(issue_data): if event["ref_target"] is not None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + # and we only have 2 issues in the list, connect to the other issue + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + # and we have more than two issues, count each issue's occurences + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurances, key = 
occurances.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -556,13 +668,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": @@ -570,7 +685,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList @@ -635,7 +750,11 @@ def reformat_events(issue_data): # "state_new" and "resolution" of the issue give the information about the state and the resolution of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + # if event is a review comment, it can contain suggestions + if "contains_suggestion" in event: + event["event_info_2"] = event["contains_suggestion"] + else: + event["event_info_2"] = False elif event["event"] == "referenced" and event["commit"] is not None: # remove "referenced" events originating from commits @@ -649,7 +768,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -748,6 +867,9 @@ def 
get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -761,6 +883,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) @@ -781,7 +907,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): @@ -808,7 +934,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps([issue["subIssues"]]), # components event["event"], event["user"]["name"], event["user"]["email"], From c4d82bcf86f29125449a872ee73f74f106d97c33 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:37:12 +0100 Subject: [PATCH 13/13] Add changes from PR53 to author postprocessing All older changes transcribed, needs etsting Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 24 +++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 53caeb2..cee17c5 100644 --- 
a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -15,6 +15,7 @@ # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock # Copyright 2025 by Maximilian Löffler +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -51,6 +52,16 @@ log = getLogger(__name__) +## +# GLOBAL VARIABLES +## + +# global variable containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + + ## # RUN POSTPROCESSING ## @@ -79,7 +90,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -90,7 +101,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. + by this author. This method also unifies all known copilot users into a single user if desired. 
:param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -98,6 +109,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ github_user = "GitHub" github_email = "noreply@github.com" @@ -179,7 +191,7 @@ def is_github_noreply_author(name, email): commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] for event in issue_data: @@ -187,12 +199,16 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. As we