diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 13b1e38..7aa9526 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -49,7 +50,13 @@ from csv_writer import csv_writer +from github_user_utils.github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ + is_github_noreply_author, github_user, github_email, \ + commit_added_event, mentioned_event, subscribed_event, \ + assigned_event, unassigned_event, review_requested_event, \ + review_request_removed_event, generate_botname_variants, quot_m +known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## # RUN POSTPROCESSING ## @@ -78,7 +85,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -89,7 +96,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. 
+ by this author. This method also unifies all known copilot users into a single user if desired. :param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -97,26 +104,8 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ - github_user = "GitHub" - github_email = "noreply@github.com" - commit_added_event = "commit_added" - mentioned_event = "mentioned" - subscribed_event = "subscribed" - - """ - Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". - There are two options in Codeface how this can happen: - (1) Username is "GitHub" and e-mail address is "noreply@github.com" - (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" - - :param name: the name of the author to be checked - :param email: the email address of the author to be checked - :return: whether the given (name, email) pair belongs to the "GitHub " author - """ - def is_github_noreply_author(name, email): - return (name == github_user and (email == github_email or email == (github_user + "." 
+ github_email))) - # Check for all files in the result directory of the project whether they need to be adjusted for filepath, dirnames, filenames in walk(data_path): @@ -125,20 +114,32 @@ def is_github_noreply_author(name, email): if authors_list in filenames: f = path.join(filepath, authors_list) log.info("Remove author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) author_data = csv_writer.read_from_csv(f) author_data_new = [] - + copilot_user_added = False for author in author_data: # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): - author_data_new.append(author) + # unify copilot author if desired + if unify_copilot_users and author[1] in known_copilot_users_extended: + if not copilot_user_added: + author[1] = copilot_unified_name + author[2] = copilot_unified_email + copilot_user_added = True + author_data_new.append(author) + else: + author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) # (2) Remove e-mails from author 'GitHub ' from all emails.list files if emails_list in filenames: f = path.join(filepath, emails_list) log.info("Remove emails from author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) email_data = csv_writer.read_from_csv(f) email_data_new = [] @@ -146,6 +147,10 @@ def is_github_noreply_author(name, email): for email in email_data: # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): + # unify copilot users if desired + if unify_copilot_users and email[0] in known_copilot_users_extended: + email[0] = copilot_unified_name + email[1] = copilot_unified_email email_data_new.append(email) else: log.warn("Remove email %s as it was sent by %s 
<%s>.", email[2], email[0], email[1]) @@ -156,6 +161,8 @@ def is_github_noreply_author(name, email): if commits_list in filenames: f = path.join(filepath, commits_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) commit_data = csv_writer.read_from_csv(f) for commit in commit_data: @@ -164,6 +171,13 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(commit[5], commit[6]): commit[5] = commit[2] commit[6] = commit[3] + # unify copilot author if desired + if unify_copilot_users and commit[5] in known_copilot_users_extended: + commit[5] = copilot_unified_name + commit[6] = copilot_unified_email + if unify_copilot_users and commit[2] in known_copilot_users_extended: + commit[2] = copilot_unified_name + commit[3] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -172,26 +186,45 @@ def is_github_noreply_author(name, email): if issues_github_list in filenames: f = path.join(filepath, issues_github_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) issue_data = csv_writer.read_from_csv(f) # read commit data commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] - for event in issue_data: + # unify events to use a single copilot user for all events triggered by a known copilot user + if unify_copilot_users and event[9] in known_copilot_users_extended: + event[9] = copilot_unified_name + event[10] = copilot_unified_email + if unify_copilot_users and event[8] == commit_added_event and 
event[13][1:-1] in known_copilot_users_extended: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = quot_m + copilot_unified_name + quot_m + elif unify_copilot_users and event[8] in (mentioned_event, subscribed_event, assigned_event, unassigned_event, + review_requested_event, review_request_removed_event) \ + and event[12] in known_copilot_users_extended: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = copilot_unified_name + event[13] = quot_m + copilot_unified_email + quot_m # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + # extract author name from event info 2 while cutting excess '"' + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. 
As we @@ -220,6 +253,9 @@ def is_github_noreply_author(name, email): if bots_list in filenames: f = path.join(filepath, bots_list) log.info("Remove author %s <%s> from %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) + copilot_user_added = False bot_data = csv_writer.read_from_csv(f) bot_data_new = [] @@ -227,7 +263,15 @@ def is_github_noreply_author(name, email): for entry in bot_data: # keep bot entry only if it should not be removed if not is_github_noreply_author(entry[0], entry[1]): - bot_data_new.append(entry) + # unify copilot users if desired + if unify_copilot_users and entry[0] in known_copilot_users_extended: + if not copilot_user_added: + entry[0] = copilot_unified_name + entry[1] = copilot_unified_email + copilot_user_added = True + bot_data_new.append(entry) + else: + bot_data_new.append(entry) else: log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) @@ -264,9 +308,6 @@ def run_postprocessing(conf, resdir, backup_data): bugs_jira_list = "bugs-jira.list" bots_list = "bots.list" - # When looking at elements originating from json lists, we need to consider quotation marks around the string - quot_m = "\"" - data_path = path.join(resdir, conf["project"], conf["tagging"]) # Correctly replace author 'GitHub ' in the commit data and in "commit_added" events of the @@ -356,6 +397,9 @@ def run_postprocessing(conf, resdir, backup_data): if person[4] == issue_event[12] and (quot_m + person[5] + quot_m) == issue_event[13]: issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m + # replace name in event info 2 if necessary + if quot_m + person[4] + quot_m == issue_event[13]: + issue_event[13] = quot_m + person[1] + quot_m csv_writer.write_to_csv(f, issue_data) @@ -422,8 +466,12 @@ def run_postprocessing(conf, resdir, backup_data): # the bot is already in the list, check if there are different predictions 
stored_bot = bot_names_and_emails[(bot[0], bot[1])] if stored_bot[2] != bot[2]: + # if either of the predictions is agent, keep agent + if (stored_bot[2] == "Agent" or bot[2] == "Agent"): + stored_bot[2] = "Agent" + bot_names_and_emails[(bot[0], bot[1])] = stored_bot # if either of the predictions is bot, keep bot - if (stored_bot[2] == "Bot" or bot[2] == "Bot"): + elif (stored_bot[2] == "Bot" or bot[2] == "Bot"): stored_bot[2] = "Bot" bot_names_and_emails[(bot[0], bot[1])] = stored_bot # otherwise, if either of the predictions is human, keep human diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..d5800b3 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. @@ -29,6 +30,7 @@ from codeface.configuration import Configuration from csv_writer import csv_writer +from github_user_utils.github_user_utils import known_copilot_users, generate_botname_variants def run(): # get all needed paths and arguments for the method call. 
@@ -52,6 +54,7 @@ def run(): # (the known bots file is the file in which known bots have been added manually and project independent) __confdir = os.path.join(args.resdir, os.path.dirname(args.config)) __known_bots_file = os.path.abspath(os.path.join(__confdir, "known_github_bots.list")) + __known_agents_file = os.path.abspath(os.path.join(__confdir, "known_github_agents.list")) # run processing of bot data: # 1) load bot data @@ -59,7 +62,7 @@ def run(): # 2) load user data users = load_user_data(os.path.join(__resdir, "usernames.list")) # 3) update bot data with user data and additionally add known bots if they occur in the project - bots = add_user_data(bots, users, __known_bots_file) + bots = add_user_data(bots, users, __known_bots_file, __known_agents_file) # 4) dump result to disk print_to_disk(bots, __resdir) @@ -79,7 +82,7 @@ def load_bot_data(bot_file, header = True): # check if file exists and exit early if not if not os.path.exists(bot_file): - log.error("Bot file '{}' does not exist! Exiting early...".format(bot_file)) + log.error("Bot/Agent file '{}' does not exist (can be empty)! Exiting early...".format(bot_file)) sys.exit(-1) bot_data = csv_writer.read_from_csv(bot_file, delimiter=',') @@ -111,12 +114,13 @@ def load_user_data(user_data_file): return user_data -def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_reduced): +def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_data, bot_data_reduced): """ Check whether there are known bots occurring in the project. If so, add them to the bots list or update the bots list accordingly. 
:param known_bots_file: the file path to the list of known bot data + :param known_agents_file: the file path to the list of known agent data :param bot_data: the bot data originating from the bot prediction :param user_data: a dictionary from the issue data which maps GitHub usernames to authors :param bot_data_reduced: the bot data after mapping GitHub user names to authors @@ -126,6 +130,7 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red # Read the list of known bots known_bots = load_bot_data(known_bots_file, header = False) + known_agents = load_bot_data(known_agents_file, header = False) # Get the GitHub usernames of the bots predicted to be a bot predicted_bots = [bot[0] if len(bot) > 0 else "" for bot in bot_data] @@ -133,30 +138,62 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red for bot in known_bots: # (1) check if a known bot occurs in the GitHub issue data but has not been predicted - if bot[0] not in predicted_bots and bot[0] in user_data: + bot_variation_predicted_bots = containing_bot_variation(bot[0], predicted_bots) + bot_variation_user_data = containing_bot_variation(bot[0], user_data) + if bot_variation_predicted_bots is None and bot_variation_user_data is not None: # add the known bot as a bot to the bots list additional_bot = dict() - additional_bot["user"] = user_data[bot[0]] + additional_bot["user"] = user_data[bot_variation_user_data] additional_bot["prediction"] = "Bot" bot_data_reduced.append(additional_bot) log.info("Add known bot '{}' to bot data.".format(additional_bot["user"])) # (2) handle known bots that are already present in the bots list - elif bot[0] in predicted_bots and bot[0] in user_data: + elif bot_variation_predicted_bots is not None and bot_variation_user_data is not None: # make sure that this bot has also been predicited to be bot for predicted_bot in bot_data_reduced: - if predicted_bot["user"] == user_data[bot[0]]: + if predicted_bot["user"] == 
user_data[bot_variation_user_data]: predicted_bot["prediction"] = "Bot" - log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) + log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot_variation_user_data])) + break + + # get list of known agents and combine it with the list of known copilot users + copilot_users_variants = generate_botname_variants(known_copilot_users) + # get list of known agent names + known_agents_names = [agent[0] for agent in known_agents] + for copilot_user in copilot_users_variants: + if copilot_user not in known_agents_names: + known_agents.append([copilot_user]) + + for agent in known_agents: + + # (1) check if a known agent occurs in the GitHub issue data but has not been predicted + if agent[0] not in predicted_bots and agent[0] in user_data: + + # add the known agent as a bot to the bots list + additional_agent = dict() + additional_agent["user"] = user_data[agent[0]] + additional_agent["prediction"] = "Agent" + bot_data_reduced.append(additional_agent) + log.info("Add known agent '{}' to bot data.".format(additional_agent["user"])) + + # (2) handle known agents that are already present in the bots list + elif agent[0] in predicted_bots and agent[0] in user_data: + + # make sure that this bot has also been predicted to be an agent + for predicted_bot in bot_data_reduced: + if predicted_bot["user"] == user_data[agent[0]]: + predicted_bot["prediction"] = "Agent" + log.info("Mark user '{}' as agent in the bot data.".format(user_data[agent[0]])) + break # return the updated bot data return bot_data_reduced -def add_user_data(bot_data, user_data, known_bots_file): +def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): """ Add user data to bot data, i.e., replace username by name and e-mail. 
In addition, check in the global bots list whether there are authors in the projects which are @@ -192,19 +229,41 @@ def add_user_data(bot_data, user_data, known_bots_file): continue # get user information if available - if user[0] in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0]] + bot_variation = containing_bot_variation(user[0], user_buffer.keys()) + if bot_variation is not None: + bot_reduced["user"] = user_buffer[bot_variation] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly - bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) + bot_data_reduced = check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_buffer, bot_data_reduced) return bot_data_reduced +def containing_bot_variation(botname, name_list): + """ + Helper function to return the variation of a given bot name that occurs in a list of names. + + :param botname: the bot name for which the variation should be returned + :param name_list: the list of names to be checked for containing the bot name or a variation of it + :return: the variation of the given bot name that occurs in the given list of names, or None if no such variation exists + """ + + if botname in name_list: + return botname + elif botname + "bot" in name_list: + return botname + "bot" + elif botname + "[bot]" in name_list: + return botname + "[bot]" + elif botname.replace("[", "").replace("]", "") in name_list: + return botname.replace("[", "").replace("]", "") + else: + return None + + def print_to_disk(bot_data, results_folder): """ Print bot data to file "bots.list" in result folder. 
diff --git a/github_user_utils/__init__.py b/github_user_utils/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/github_user_utils/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py new file mode 100644 index 0000000..20fa8d8 --- /dev/null +++ b/github_user_utils/github_user_utils.py @@ -0,0 +1,78 @@ +# coding=utf-8 +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# Copyright 2026 by Leo Sendelbach +# All Rights Reserved. +""" +This file serves as a collection of global variables and utility functions, which are used throughout the +issue data extraction and post-processing, in particular for the processing of GitHub and Copilot user data. 
+""" + +## +# GLOBAL VARIABLES +## + +# global variables containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agent[bot]"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + +## global variables for the GitHub author +github_user = "GitHub" +github_email = "noreply@github.com" +commit_added_event = "commit_added" +mentioned_event = "mentioned" +subscribed_event = "subscribed" +assigned_event = "assigned" +unassigned_event = "unassigned" +review_requested_event = "review_requested" +review_request_removed_event = "review_request_removed" + +# When looking at elements originating from json lists, we need to consider quotation marks around the string +quot_m = "\"" + +## +# UTILITY FUNCTIONS +## + +def is_github_noreply_author(name, email): + """ + Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". + There are two options in Codeface how this can happen: + (1) Username is "GitHub" and e-mail address is "noreply@github.com" + (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" + + :param name: the name of the author to be checked + :param email: the email address of the author to be checked + :return: whether the given (name, email) pair belongs to the "GitHub " author + """ + + return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) + +def generate_botname_variants(botnames): + """ + Helper function to generate variants of bot names, which are used in the list of + known bots and agents as well as during author postprocessing. 
+ + :param botnames: the list of bot names for which variants should be generated + :return: a set of bot name variants + """ + + botname_variants = set() + for botname in botnames: + botname_variants.add(botname) + botname = botname.replace("[", "").replace("]", "") + botname_variants.add(botname) + + return botname_variants diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..4dc8c63 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. @@ -39,9 +40,10 @@ from dateutil import parser as dateparser from csv_writer import csv_writer +from github_user_utils.github_user_utils import copilot_unified_name # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -52,6 +54,7 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -74,13 +77,14 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -229,7 +233,6 @@ def reformat_issues(issue_data): Re-arrange issue data structure. :param issue_data: the issue data to re-arrange - :return: the re-arranged issue data """ log.devinfo("Re-arranging Github issues...") @@ -238,7 +241,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] @@ -255,7 +261,7 @@ def reformat_issues(issue_data): if issue["relatedCommits"] is None: issue["relatedCommits"] = [] - # if an issue has no reviewsList, an empty Listgets created + # if an issue has no reviewsList, an empty List gets created if issue["reviewsList"] is None: issue["reviewsList"] = [] @@ -263,6 +269,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet 
if issue["closed_at"] is None: issue["closed_at"] = "" @@ -279,20 +289,22 @@ def reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. :param issue_data: the issue data from which the events shall be merged - :return: the issue data with merged eventsList + :param external_connected_events: a dict to store connected events to external issues + :return: a filtered dict of connected events for future reconstruction """ log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -361,6 +373,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -476,6 +489,12 @@ def merge_issue_events(issue_data): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_unified_name + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: @@ -488,6 +507,36 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in 
event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + # if there is no connected event yet at this timestamp, create a new entry for this event + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = event["user"] + connected_events[event["created_at"]] = connected_info + + # if event is a locked event, save the lock reason in event_info_1 + if event["event"] == "locked": + event["event_info_1"] = event["lock_reason"] + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + 
issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -499,21 +548,62 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the external_connected_event dict later + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) + # updates all the issues by the temporarily stored referenced_by events for key, value in issue_data_to_update.iteritems(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data - - -def reformat_events(issue_data): + # return the filtered_connected_events dict for later reconstruction + return filtered_connected_events + + +def filter_connected_events(key, value, external_connected_events): + num_issues = len(value["issues"]) + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurrences.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 
and (num_issues + 1)/2 in occurrences.values(): + for sub_key, sub_value in occurrences.iteritems(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == (num_issues + 1)/2: + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # no other variants can be easily matched + return False + + +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. :param issue_data: the data of all issues that shall be re-formatted - :return: the issue data with updated event information + :param filtered_connected_events: the dict of connected events which can be reconstructed + :param external_connected_events: the dict of connected events to external issues """ log.info("Update event information ...") @@ -538,6 +628,35 @@ def reformat_events(issue_data): if not event["ref_target"] is None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + # and we only have 2 issues 
in the list, connect to the other issue + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + # and we have more than two issues, count each issue's occurrences + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurrences[issue["number"]] == max(occurrences.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurrences, key=occurrences.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -555,13 +674,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": @@ -569,7 +691,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList @@ 
-631,10 +753,14 @@ def reformat_events(issue_data): issue["eventsList"].append(resolution_event) elif event["event"] == "commented": - # "state_new" and "resolution" of the issue give the information about the state and the resolution of + # "state_new" of the issue gives the information about the state of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + # if event is a review comment, it can contain suggestions + if "contains_suggestion" in event: + event["event_info_2"] = str(event["contains_suggestion"]) + else: + event["event_info_2"] = str(False) elif event["event"] == "referenced" and not event["commit"] is None: # remove "referenced" events originating from commits @@ -648,7 +774,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -659,7 +785,6 @@ def insert_user_data(issues, conf, resdir): :param issues: the issues to retrieve user data from :param conf: the project configuration :param resdir: the directory in which the username-to-user-list should be dumped - :return: the updated issue data """ log.info("Syncing users with ID service...") @@ -745,6 +870,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -758,6 +886,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if 
event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) @@ -778,7 +910,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): @@ -805,7 +937,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps(issue["subIssues"]), # components event["event"], event["user"]["name"], event["user"]["email"], diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index d9748ae..3b12a93 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock # Copyright 2023 by Maximilian Löffler +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. 
@@ -125,7 +126,7 @@ def run(): referenced_issue["history"].append(referenced_by) # 5) update user data with Codeface database - processed_issues = insert_user_data(processed_issues, __conf) + insert_user_data(processed_issues, __conf) # 6) dump result to disk print_to_disk(processed_issues, __resdir) # # 7) export for Gephi @@ -300,9 +301,12 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): link = issue_x.getElementsByTagName("link")[0] issue["url"] = link.firstChild.data - type = issue_x.getElementsByTagName("type")[0] - issue["type"] = type.firstChild.data - issue["type_list"] = ["issue", str(type.firstChild.data.lower())] + type = issue_x.getElementsByTagName("type")[0].firstChild.data + # rename 'new feature' type to 'feature' to be in line with the github original issue type + if type == "New Feature": + type = "Feature" + issue["type"] = type + issue["type_list"] = ["issue", str(type.lower())] status = issue_x.getElementsByTagName("status")[0] issue["state"] = status.firstChild.data @@ -460,21 +464,19 @@ def load_issues_via_api(issues, persons, url, referenced_bys): for change in changelog.histories: # default values for state and resolution - old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved" + new_state, old_resolution, new_resolution = "open", "unresolved", "unresolved" # all changes in the issue changelog are checked if they contain a useful information for item in change.items: # state_updated event gets created and added to the issue history if item.field == "status": - if item.fromString is not None: - old_state = item.fromString.lower() if item.toString is not None: new_state = item.toString.lower() history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state - history["event_info_2"] = old_state + history["event_info_2"] = "" if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: @@ -686,7 +688,7 @@ def 
get_user_from_id(idx, buffer_db=user_buffer): event["event_info_2"] = assigned_user["email"] log.debug("number of issues after insert_user_data: '{}'".format(len(issues))) - return issues + return def print_to_disk(issues, results_folder):