From 65f0a8770339427d2fbddbd31ba1dde2110c003d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Mon, 25 Aug 2025 13:35:24 +0200 Subject: [PATCH 01/26] Add commit author of 'commit_added' events to event info This allows for reconstruction of correct commit author if user is github Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 8 ++++++-- issue_processing/issue_processing.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 13b1e38..994fcd2 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -178,7 +178,7 @@ def is_github_noreply_author(name, email): commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] for event in issue_data: @@ -186,12 +186,16 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. 
As we diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..410769c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -361,6 +361,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -745,6 +746,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -758,6 +762,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) From 2e67f0df16b40af75c688b12744e0103c40dff57 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 26 Aug 2025 10:56:56 +0200 Subject: [PATCH 02/26] Update Copyright headers also added one comment for clarity Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 2 ++ issue_processing/issue_processing.py | 1 + 2 files changed, 3 insertions(+) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 994fcd2..42ca247 
100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -186,6 +187,7 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] + # extract author name from event info 2 while cutting excess '"' name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 410769c..72cf331 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. From c4f4af51251da5ec1bab4c4d233ec454c70b079a Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Thu, 25 Sep 2025 14:37:37 +0200 Subject: [PATCH 03/26] Add connected events reconstruction also save merge commits reconstruction of connected events is done by first saving all connected events that occured at the same time. 
Then, it is possible to match connected events iff: - half of the involved issues are equal, meaning that one issue is connected to multiple others - half rounded up of the involved isses are equal, meaning that we have one external connected event and then the previous case with the remaining issues Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 72cf331..7e13f9b 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -31,6 +31,7 @@ import sys import urllib from datetime import datetime, timedelta +import math import operator from codeface.cli import log @@ -53,6 +54,9 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" +filtered_connected_events = dict() +external_connected_events = dict() + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -294,6 +298,7 @@ def merge_issue_events(issue_data): log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -490,6 +495,28 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = issue["user"] + connected_events[event["created_at"]] = connected_info + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + 
issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -501,6 +528,10 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + global filtered_connected_events + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1]), connected_events.iteritems())) + # updates all the issues by the temporarily stored referenced_by events for key, value in issue_data_to_update.iteritems(): for issue in issue_data: @@ -510,6 +541,41 @@ def merge_issue_events(issue_data): return issue_data +def filter_connected_events(key, value): + num_issues = len(value["issues"]) + global external_connected_events + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 and math.ceil(num_issues/2) in occurances.values(): + for sub_key, sub_value in occurances.iteritems(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == math.ceil(num_issues/2): + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + 
external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # no other variants can be easily matched + return False + + def reformat_events(issue_data): """ Re-format event information dependent on the event type. @@ -540,6 +606,37 @@ def reformat_events(issue_data): if not event["ref_target"] is None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + external = False + # check if event is external + for key, value in external_connected_events.iteritems(): + if issue["number"] in value["issues"]: + if key == event["created_at"]: + external = True + event["event_info_1"] = "external" + value["issues"].remove(issue["number"]) + # if so, skip the next checks + if external: + continue + # otherwise, it must be internal + for key, value in filtered_connected_events.iteritems(): + if issue["number"] in value["issues"]: + if key == event["created_at"]: + if len(value["issues"]) == 2: + # if only 2 events occured at this timestamp, matching the issues is trivial + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + # otherwise, if current issue is the centerpiece of all connected events, use previous copy to match issues + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # if current issue is not the centerpiece, connect it to the centerpiece + event["event_info_1"] = max(occurances, key = occurances.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in 
issue_data: From e77b009f277f48fa63640cf195dca1b2d4273330 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 14:43:33 +0200 Subject: [PATCH 04/26] Remove unnecessary returns of issue data since data is modified in-place, return of input data is not needed Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 7e13f9b..52e262d 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -79,13 +79,13 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + merge_issue_events(issues) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -284,7 +284,7 @@ def reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return def merge_issue_events(issue_data): @@ -538,7 +538,7 @@ def merge_issue_events(issue_data): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data + return def filter_connected_events(key, value): @@ -747,7 +747,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -884,7 +884,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") 
csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): From c28b1385504a63a8cc4e5d64976944d890cb7a34 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 14:47:19 +0200 Subject: [PATCH 05/26] Add reasons to reopen/closed events ALso add commit hash if closed by commit Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 52e262d..e5a40e7 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -654,13 +654,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": From 488626e9b55313339502d01e13cb7a5b2def0126 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:01:48 +0200 Subject: [PATCH 06/26] Add GitHub issue types also rename 'new feature' to 'feature' Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 7 +++++-- issue_processing/jira_issue_processing.py | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index e5a40e7..adf8b7c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -43,7 +43,7 @@ 
from csv_writer import csv_writer # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -243,7 +243,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index d9748ae..6f60beb 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -300,9 +300,12 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): link = issue_x.getElementsByTagName("link")[0] issue["url"] = link.firstChild.data - type = issue_x.getElementsByTagName("type")[0] - issue["type"] = type.firstChild.data - issue["type_list"] = ["issue", str(type.firstChild.data.lower())] + type = issue_x.getElementsByTagName("type")[0].firstChild.data + # rename 'new feature' type to 'feature' to be in line with the github original issue type + if type == "New Feature": + type = "Feature" + issue["type"] = type + issue["type_list"] = ["issue", str(type.lower())] status = issue_x.getElementsByTagName("status")[0] issue["state"] = status.firstChild.data From 2894b0d95c8f7063444ef8bf3c5b14569714416f Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:26:55 +0200 Subject: [PATCH 07/26] Simplify loops for reconstruction of connections also remove duplicates from type list Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 67 
+++++++++++----------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index adf8b7c..dce85f1 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -54,9 +54,6 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" -filtered_connected_events = dict() -external_connected_events = dict() - def run(): # get all needed paths and arguments for the method call. parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -81,9 +78,10 @@ def run(): # 2) re-format the issues reformat_issues(issues) # 3) merges all issue events into one list - merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) # 6) dump result to disk @@ -290,7 +288,7 @@ def reformat_issues(issue_data): return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. 
@@ -532,8 +530,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # filter out connected events which cannot be perfectly matched - global filtered_connected_events - filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1]), connected_events.iteritems())) + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) # updates all the issues by the temporarily stored referenced_by events for key, value in issue_data_to_update.iteritems(): @@ -541,12 +538,11 @@ def merge_issue_events(issue_data): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return + return filtered_connected_events -def filter_connected_events(key, value): +def filter_connected_events(key, value, external_connected_events): num_issues = len(value["issues"]) - global external_connected_events # if only a single connected event exists at this time, it has to be connecting to an external issue if num_issues == 1: external_connected_events[key] = value @@ -579,7 +575,7 @@ def filter_connected_events(key, value): return False -def reformat_events(issue_data): +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. 
@@ -611,34 +607,23 @@ def reformat_events(issue_data): # reconstruction of connections if event["event"] == "connected": - external = False - # check if event is external - for key, value in external_connected_events.iteritems(): - if issue["number"] in value["issues"]: - if key == event["created_at"]: - external = True - event["event_info_1"] = "external" - value["issues"].remove(issue["number"]) - # if so, skip the next checks - if external: - continue - # otherwise, it must be internal - for key, value in filtered_connected_events.iteritems(): - if issue["number"] in value["issues"]: - if key == event["created_at"]: - if len(value["issues"]) == 2: - # if only 2 events occured at this timestamp, matching the issues is trivial - event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] - else: - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurances[issue["number"]] == max(occurances.values()): - # otherwise, if current issue is the centerpiece of all connected events, use previous copy to match issues - number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) - value["multi_issues_copy"].remove(number) - event["event_info_1"] = number - else: - # if current issue is not the centerpiece, connect it to the centerpiece - event["event_info_1"] = max(occurances, key = occurances.get) + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else 
value["issues"][1] + else: + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + event["event_info_1"] = max(occurances, key = occurances.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -674,7 +659,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList From 7ae507972788a998f7f51dd37b224e09841641ff Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:28:23 +0200 Subject: [PATCH 08/26] Add subissues to results csv using empty line reserved for jira components Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index dce85f1..1fd3d24 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -902,7 +902,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps([issue["subIssues"]]), # components event["event"], event["user"]["name"], event["user"]["email"], From f44a8b729dbd1ee1a6d80555218fa4d5fa97cd23 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 21 Oct 2025 14:10:18 +0200 Subject: [PATCH 09/26] Remove unneccesary return value also added copyright header Signed-off-by: Leo Sendelbach --- issue_processing/jira_issue_processing.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 6f60beb..9d384c3 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock # Copyright 2023 by Maximilian Löffler +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. @@ -125,7 +126,7 @@ def run(): referenced_issue["history"].append(referenced_by) # 5) update user data with Codeface database - processed_issues = insert_user_data(processed_issues, __conf) + insert_user_data(processed_issues, __conf) # 6) dump result to disk print_to_disk(processed_issues, __resdir) # # 7) export for Gephi @@ -689,7 +690,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): event["event_info_2"] = assigned_user["email"] log.debug("number of issues after insert_user_data: '{}'".format(len(issues))) - return issues + return def print_to_disk(issues, results_folder): From e46af3fed9155591b78fab19f30db420cf4900f0 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 21 Oct 2025 14:11:29 +0200 Subject: [PATCH 10/26] Add comments also minor fixes and removal of math.ceil Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 36 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1fd3d24..3b7ca64 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -77,10 +77,15 @@ def run(): issues = load(__srcdir) # 2) re-format the issues reformat_issues(issues) - # 3) merges all issue events into one list + # create an empty dict for external connected events, meaning connected + # events that connect to an issue in another repository external_connected_events = dict() + # 3) 
merges all issue events into one list + # this step returns a dict containing all connected events that can be matched to the correct issues later filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues + # this step also reconstructs the connections previously stored + # in 'external_connected_events' and 'filtered_connected_events' reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) @@ -503,16 +508,20 @@ def merge_issue_events(issue_data, external_connected_events): # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction if event["event"] == "connected": if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list connected_events[event["created_at"]]["issues"].append(issue["number"]) elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) event["created_at"] = 
subtract_seconds_from_time(event["created_at"], -1) else: + # if there is no connected event yet at this timestamp, create a new entry for this event connected_info = dict() connected_info["issues"] = [issue["number"]] connected_info["user"] = issue["user"] @@ -550,19 +559,19 @@ def filter_connected_events(key, value, external_connected_events): # if 2 connected events exist, matching them is trivial if num_issues == 2: return True - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + occurences = {x: value["issues"].count(x) for x in set(value["issues"])} # otherwise, if it is an even number, check if it can be easily matched, # meaning that exactly half the events occur in the same issue - if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + if num_issues % 2 == 0 and num_issues/2 in occurences.values(): # duplicate issue list for matching the issues later value["multi_issues_copy"] = list(value["issues"]) return True # if it is an odd number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue - if num_issues % 2 == 1 and math.ceil(num_issues/2) in occurances.values(): - for sub_key, sub_value in occurances.iteritems(): + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values(): + for sub_key, sub_value in occurences.iteritems(): # then, assign one of them as an external connected event and proceed as in previous case - if sub_value == math.ceil(num_issues/2): + if sub_value == (num_issues + 1)/2: new_entry = dict() new_entry["user"] = value["user"] new_entry["issues"] = [sub_key] @@ -609,21 +618,30 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev if event["event"] == "connected": if event["created_at"] in external_connected_events \ and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list 
event["event_info_1"] = "external" external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) elif event["created_at"] in filtered_connected_events \ and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event value = filtered_connected_events[event["created_at"]] if len(value["issues"]) == 2: + # and we only have 2 issues in the list, connect to the other issue event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] else: - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurances[issue["number"]] == max(occurances.values()): + # and we have more than two issues, count each issue's occurences + occurences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurences[issue["number"]] == max(occurences.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) value["multi_issues_copy"].remove(number) event["event_info_1"] = number else: - event["event_info_1"] = max(occurances, key = occurances.get) + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurences, key=occurences.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: From cfaba7169dd3cffb6914d5bc0b5679dcd701c58e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 16:28:42 +0100 Subject: [PATCH 11/26] Add new json field for suggestions to result comments now each have a boolean field that describes whether the comment contains a suggestion or not Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 3b7ca64..c6648c2 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -742,7 +742,10 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev # "state_new" and "resolution" of the issue give the information about the state and the resolution of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + if "contains_suggestion" in event: + event["event_info_2"] = event["contains_suggestion"] + else: + event["event_info_2"] = False elif event["event"] == "referenced" and not event["commit"] is None: # remove "referenced" events originating from commits From fae4c47f98ca8987bd20f202e4cebeb4c1c2dd42 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 16:36:11 +0100 Subject: [PATCH 12/26] Improve documentation dicts for reconstructing connected events are now better explained and the comments do not disruot the workflow in the run function anymore Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index c6648c2..97298cb 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -77,15 +77,10 @@ def run(): issues = load(__srcdir) # 2) re-format the issues reformat_issues(issues) - # create an empty dict for external connected events, meaning connected - # events that connect to an issue in another repository - external_connected_events = dict() # 3) merges all issue events into one list - # this step returns a dict containing all connected events that can be matched to the correct issues later + external_connected_events = dict() 
filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - # this step also reconstructs the connections previously stored - # in 'external_connected_events' and 'filtered_connected_events' reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) @@ -539,6 +534,8 @@ def merge_issue_events(issue_data, external_connected_events): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the external_connected_event dict later filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) # updates all the issues by the temporarily stored referenced_by events @@ -547,6 +544,7 @@ def merge_issue_events(issue_data, external_connected_events): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] + # return the filtered_connected_events dict for later reconstruction return filtered_connected_events From 89f0f0160d78635e824ad09f13904c971f0cbb50 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 4 Nov 2025 12:46:51 +0100 Subject: [PATCH 13/26] Incorporate requested changes includes: - updated comments - spelling mistake - fix for potential crash if script is used on old data Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 97298cb..0ff891f 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -261,7 +261,7 @@ def 
reformat_issues(issue_data): if issue["relatedCommits"] is None: issue["relatedCommits"] = [] - # if an issue has no reviewsList, an empty Listgets created + # if an issue has no reviewsList, an empty List gets created if issue["reviewsList"] is None: issue["reviewsList"] = [] @@ -269,6 +269,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet if issue["closed_at"] is None: issue["closed_at"] = "" @@ -737,9 +741,10 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev issue["eventsList"].append(resolution_event) elif event["event"] == "commented": - # "state_new" and "resolution" of the issue give the information about the state and the resolution of + # "state_new" of the issue gives the information about the state of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] + # if event is a review comment, it can contain suggestions if "contains_suggestion" in event: event["event_info_2"] = event["contains_suggestion"] else: From 73d5f64ce60dc2056e5469557cf97e62825e3cde Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 13:41:27 +0100 Subject: [PATCH 14/26] Add copilot user unification to author postprocessing author postprocessing now also contains a list of known copilot use names that can be extended to unify more different copilot users Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 42ca247..d46074a 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py 
@@ -14,7 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -51,6 +51,15 @@ from csv_writer import csv_writer +## +# GLOBAL VARIABLES +## + +# global variable containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + ## # RUN POSTPROCESSING ## @@ -79,7 +88,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. 
The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -183,6 +192,11 @@ def is_github_noreply_author(name, email): issue_data_new = [] for event in issue_data: + # unify events to use a single copilot user for all events triggered by a known copilot user + if unify_copilot_users and event[9] in known_copilot_users: + event[9] = copilot_unified_name + event[10] = copilot_unified_email + # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 From befbee3e09b2190565debe3393a6c45f4cc3a7e1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 13:48:43 +0100 Subject: [PATCH 15/26] Assign copilot user data in case of specific events the events 'copilot_work_started' and 'copilot_work_finished' now always have the standard copilot user data Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 0ff891f..4144db1 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,7 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. @@ -54,6 +54,9 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" +# Copilot username to be assigned in specific copilot events +copilot_username = "Copilot" + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -488,6 +491,12 @@ def merge_issue_events(issue_data, external_connected_events): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_username + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: From 23f0dd6e16d4331ca5dc741503462ce5f58661e8 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:27:23 +0100 Subject: [PATCH 16/26] Add documentation for new copilot user unification Method doc updated to reflect new functionality Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index d46074a..42d602c 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -99,7 +99,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. + by this author. This method also unifies all known copilot users into a single user if desired. 
:param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -107,6 +107,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ github_user = "GitHub" github_email = "noreply@github.com" From eb53c790009af5a71c7d9504ecad0b737f669a22 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:48:39 +0100 Subject: [PATCH 17/26] Fix connected event assignment previously, the creator of the issues was falsely matched to the connected event instead of the user triggering the event Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 4144db1..dc05682 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -532,7 +532,7 @@ def merge_issue_events(issue_data, external_connected_events): # if there is no connected event yet at this timestamp, create a new entry for this event connected_info = dict() connected_info["issues"] = [issue["number"]] - connected_info["user"] = issue["user"] + connected_info["user"] = event["user"] connected_events[event["created_at"]] = connected_info # merge events, relatedCommits, relatedIssues and comment lists From 9e6ccceb3caa91966ef0a9f56a696a9fe7a609e6 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:06:28 +0100 Subject: [PATCH 18/26] Unify copilot users in all files unification now done on all files, which should prevent any issues arising from unknown authors during anonymization also move all global variables to a new utils file 
Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 69 +++++++++++-------- github_user_utils/github_user_utils.py | 54 +++++++++++++++ issue_processing/issue_processing.py | 5 +- 3 files changed, 95 insertions(+), 33 deletions(-) create mode 100644 github_user_utils/github_user_utils.py diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 42d602c..c4a9e24 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -50,15 +50,10 @@ from csv_writer import csv_writer +from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ + is_github_noreply_author, github_user, github_email, \ + commit_added_event, mentioned_event, subscribed_event -## -# GLOBAL VARIABLES -## - -# global variable containing all known copilot users and the name and mail adress copilot users will be assigned -known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} -copilot_unified_name = "Copilot" -copilot_unified_email = "copilot@example.com" ## # RUN POSTPROCESSING @@ -109,25 +104,6 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param bots_list: file name of the corresponding bot data :param unify_copilot_users: whether to unify known copilot users into a single user """ - github_user = "GitHub" - github_email = "noreply@github.com" - commit_added_event = "commit_added" - mentioned_event = "mentioned" - subscribed_event = "subscribed" - - """ - Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". 
- There are two options in Codeface how this can happen: - (1) Username is "GitHub" and e-mail address is "noreply@github.com" - (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" - - :param name: the name of the author to be checked - :param email: the email address of the author to be checked - :return: whether the given (name, email) pair belongs to the "GitHub " author - """ - def is_github_noreply_author(name, email): - return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) - # Check for all files in the result directory of the project whether they need to be adjusted for filepath, dirnames, filenames in walk(data_path): @@ -136,20 +112,32 @@ def is_github_noreply_author(name, email): if authors_list in filenames: f = path.join(filepath, authors_list) log.info("Remove author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) author_data = csv_writer.read_from_csv(f) author_data_new = [] - + copilot_user_added = False for author in author_data: # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): - author_data_new.append(author) + # unify copilot author if desired + if unify_copilot_users and author[1] in known_copilot_users: + if not copilot_user_added: + author[1] = copilot_unified_name + author[2] = copilot_unified_email + copilot_user_added = True + author_data_new.append(author) + else: + author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) # (2) Remove e-mails from author 'GitHub ' from all emails.list files if emails_list in filenames: f = path.join(filepath, emails_list) log.info("Remove emails from author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s 
...", copilot_unified_name, copilot_unified_email, f) email_data = csv_writer.read_from_csv(f) email_data_new = [] @@ -157,6 +145,10 @@ def is_github_noreply_author(name, email): for email in email_data: # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): + # unify copilot users if desired + if unify_copilot_users and email[0] in known_copilot_users: + email[0] = copilot_unified_name + email[1] = copilot_unified_email email_data_new.append(email) else: log.warn("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) @@ -167,6 +159,8 @@ def is_github_noreply_author(name, email): if commits_list in filenames: f = path.join(filepath, commits_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) commit_data = csv_writer.read_from_csv(f) for commit in commit_data: @@ -175,6 +169,10 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(commit[5], commit[6]): commit[5] = commit[2] commit[6] = commit[3] + # unify copilot author if desired + if unify_copilot_users and commit[5] in known_copilot_users: + commit[5] = copilot_unified_name + commit[6] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -183,6 +181,8 @@ def is_github_noreply_author(name, email): if issues_github_list in filenames: f = path.join(filepath, issues_github_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) issue_data = csv_writer.read_from_csv(f) # read commit data @@ -197,7 +197,13 @@ def is_github_noreply_author(name, email): if unify_copilot_users and event[9] in known_copilot_users: event[9] = copilot_unified_name event[10] = copilot_unified_email - + if event[8] == 
commit_added_event and event[13][-1:1] in known_copilot_users: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = '"' + copilot_unified_name + '"' + elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = '"' + copilot_unified_name + '"' + event[13] = '"' + copilot_unified_email + '"' # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 @@ -377,6 +383,9 @@ def run_postprocessing(conf, resdir, backup_data): if person[4] == issue_event[12] and (quot_m + person[5] + quot_m) == issue_event[13]: issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m + # replace name in event info 2 if necessary + if person[4] == issue_event[13]: + issue_event[13] = person[1] csv_writer.write_to_csv(f, issue_data) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py new file mode 100644 index 0000000..20a3aa3 --- /dev/null +++ b/github_user_utils/github_user_utils.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# Copyright 2026 by Leo Sendelbach +# All Rights Reserved. +""" +This file serves as a collection of global variables and utility functions, which are used throughout the +issue data extraction and post-processing, in particular for the processing of GitHub and Copilot user data. +""" + +## +# GLOBAL VARIABLES +## + +# global variables containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + +## global variables for the GitHub author +github_user = "GitHub" +github_email = "noreply@github.com" +commit_added_event = "commit_added" +mentioned_event = "mentioned" +subscribed_event = "subscribed" + +## +# UTILITY FUNCTIONS +## + +def is_github_noreply_author(name, email): + """ + Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". + There are two options in Codeface how this can happen: + (1) Username is "GitHub" and e-mail address is "noreply@github.com" + (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" + + :param name: the name of the author to be checked + :param email: the email address of the author to be checked + :return: whether the given (name, email) pair belongs to the "GitHub " author + """ + + return (name == github_user and (email == github_email or email == (github_user + "." 
+ github_email))) \ No newline at end of file diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index dc05682..25669cd 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -41,6 +41,7 @@ from dateutil import parser as dateparser from csv_writer import csv_writer +from github_user_utils import copilot_unified_name # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} @@ -54,8 +55,6 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" -# Copilot username to be assigned in specific copilot events -copilot_username = "Copilot" def run(): # get all needed paths and arguments for the method call. @@ -494,7 +493,7 @@ def merge_issue_events(issue_data, external_connected_events): # if event is a specific copilot event, assign the copilot user data if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": event["user"]["name"] = None - event["user"]["username"] = copilot_username + event["user"]["username"] = copilot_unified_name event["user"]["email"] = "" # if event dismisses a review, we can determine the original state of the corresponding review From a3558a6e8a8d208dcb39135176c02c80326034b4 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:14:11 +0100 Subject: [PATCH 19/26] Add support for 'known agents' Known agentsc such as 'copilot' or 'claude' can now be read, similar to known bots. They will be flagged as agents during bot processing. 
Signed-off-by: Leo Sendelbach --- bot_processing/bot_processing.py | 34 ++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..ba3fa61 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. @@ -52,6 +53,7 @@ def run(): # (the known bots file is the file in which known bots have been added manually and project independent) __confdir = os.path.join(args.resdir, os.path.dirname(args.config)) __known_bots_file = os.path.abspath(os.path.join(__confdir, "known_github_bots.list")) + __known_agents_file = os.path.abspath(os.path.join(__confdir, "known_github_agents.list")) # run processing of bot data: # 1) load bot data @@ -59,7 +61,7 @@ def run(): # 2) load user data users = load_user_data(os.path.join(__resdir, "usernames.list")) # 3) update bot data with user data and additionally add known bots if they occur in the project - bots = add_user_data(bots, users, __known_bots_file) + bots = add_user_data(bots, users, __known_bots_file, __known_agents_file) # 4) dump result to disk print_to_disk(bots, __resdir) @@ -111,12 +113,13 @@ def load_user_data(user_data_file): return user_data -def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_reduced): +def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_data, bot_data_reduced): """ Check whether there are known bots occurring in the project. If so, add them to the bots list or update the bots list accordingly. 
:param known_bots_file: the file path to the list of known bot data + :param known_agents_file: the file path to the list of known agent data :param bot_data: the bot data originating from the bot prediction :param user_data: a dictionary from the issue data which maps GitHub usernames to authors :param bot_data_reduced: the bot data after mapping GitHub user names to authors @@ -126,6 +129,7 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red # Read the list of known bots known_bots = load_bot_data(known_bots_file, header = False) + known_agents = load_bot_data(known_agents_file, header = False) # Get the GitHub usernames of the bots predicted to be a bot predicted_bots = [bot[0] if len(bot) > 0 else "" for bot in bot_data] @@ -152,11 +156,33 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) break + for agent in known_agents: + + # (1) check if a known agent occurs in the GitHub issue data but has not been predicted + if agent[0] not in predicted_bots and agent[0] in user_data: + + # add the known agent as a bot to the bots list + additional_agent = dict() + additional_agent["user"] = user_data[agent[0]] + additional_agent["prediction"] = "Agent" + bot_data_reduced.append(additional_agent) + log.info("Add known agent '{}' to bot data.".format(additional_agent["user"])) + + # (2) handle known agents that are already present in the bots list + elif agent[0] in predicted_bots and agent[0] in user_data: + + # make sure that this bot has also been predicited to be an agent + for predicted_bot in bot_data_reduced: + if predicted_bot["user"] == user_data[agent[0]]: + predicted_bot["prediction"] = "Agent" + log.info("Mark user '{}' as agent in the bot data.".format(user_data[agent[0]])) + break + # return the updated bot data return bot_data_reduced -def add_user_data(bot_data, user_data, known_bots_file): +def 
add_user_data(bot_data, user_data, known_bots_file, known_agents_file): """ Add user data to bot data, i.e., replace username by name and e-mail. In addition, check in the global bots list whether there are authors in the projects which are @@ -200,7 +226,7 @@ def add_user_data(bot_data, user_data, known_bots_file): log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly - bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) + bot_data_reduced = check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_buffer, bot_data_reduced) return bot_data_reduced From 0bde8a0e54940b7a6798da503a0af805a3d3afe9 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:31:59 +0100 Subject: [PATCH 20/26] Add better bot name variant support Add a helper function for creating bot name variants utilizing either '[bot]' or 'bot' suffix. Also update bot processing to check user buffer for all variants. 
Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 16 +++++++------- bot_processing/bot_processing.py | 8 +++++++ github_user_utils/github_user_utils.py | 21 ++++++++++++++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index c4a9e24..7168e00 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -52,9 +52,9 @@ from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ is_github_noreply_author, github_user, github_email, \ - commit_added_event, mentioned_event, subscribed_event - + commit_added_event, mentioned_event, subscribed_event, generate_botname_variants +known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## # RUN POSTPROCESSING ## @@ -122,7 +122,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): # unify copilot author if desired - if unify_copilot_users and author[1] in known_copilot_users: + if unify_copilot_users and author[1] in known_copilot_users_extended: if not copilot_user_added: author[1] = copilot_unified_name author[2] = copilot_unified_email @@ -146,7 +146,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): # unify copilot users if desired - if unify_copilot_users and email[0] in known_copilot_users: + if unify_copilot_users and email[0] in known_copilot_users_extended: email[0] = copilot_unified_name email[1] = copilot_unified_email email_data_new.append(email) @@ -170,7 +170,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth commit[5] = commit[2] commit[6] = commit[3] # unify 
copilot author if desired - if unify_copilot_users and commit[5] in known_copilot_users: + if unify_copilot_users and commit[5] in known_copilot_users_extended: commit[5] = copilot_unified_name commit[6] = copilot_unified_email @@ -194,13 +194,13 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth for event in issue_data: # unify events to use a single copilot user for all events triggered by a known copilot user - if unify_copilot_users and event[9] in known_copilot_users: + if unify_copilot_users and event[9] in known_copilot_users_extended: event[9] = copilot_unified_name event[10] = copilot_unified_email - if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users: + if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users_extended: # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user event[13] = '"' + copilot_unified_name + '"' - elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users: + elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users_extended: # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user event[12] = '"' + copilot_unified_name + '"' event[13] = '"' + copilot_unified_email + '"' diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index ba3fa61..3681c5d 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -222,6 +222,14 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): bot_reduced["user"] = user_buffer[user[0]] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) + elif user[0] + "bot" in user_buffer.keys(): + bot_reduced["user"] = user_buffer[user[0] + "bot"] + bot_reduced["prediction"] = user[-1] + bot_data_reduced.append(bot_reduced) + elif user[0] + "[bot]" in 
user_buffer.keys(): + bot_reduced["user"] = user_buffer[user[0] + "[bot]"] + bot_reduced["prediction"] = user[-1] + bot_data_reduced.append(bot_reduced) else: log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 20a3aa3..561a345 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -51,4 +51,23 @@ def is_github_noreply_author(name, email): :return: whether the given (name, email) pair belongs to the "GitHub " author """ - return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) \ No newline at end of file + return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) + +def generate_botname_variants(botnames): + """ + Helper function to generate variants of bot names, which are used in the list of + known bots and agents as well as during author postprocessing. + + :param botnames: the list of bot names for which variants should be generated + :return: a set of bot name variants + """ + + botname_variants = set() + for botname in botnames: + botname_variants.add(botname) + if botname.endswith("[bot]"): + botname_variants.add(botname[:-5] + "bot") + elif botname.endswith("bot"): + botname_variants.add(botname[:-3] + "[bot]") + + return botname_variants From 1776a13fb2ea6c86695c6215b55148dd7965e764 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 20 Feb 2026 17:17:46 +0100 Subject: [PATCH 21/26] Add better bot name handling Add a helper function that given a botname and a list of names, returns which bot name variant is contained in the list (or None). This is used whenever we check if a known bot is in the userdata or has been predicted to be a bot, and means that botnames in the known_bots file do not need to be duplicated for each variant. 
Also, automatically add all known copilot users to the known_agents list, and then unify those during author postprocessing. Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 13 ++++- bot_processing/bot_processing.py | 55 ++++++++++++++----- github_user_utils/github_user_utils.py | 8 +-- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 7168e00..d05015b 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -247,6 +247,9 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth if bots_list in filenames: f = path.join(filepath, bots_list) log.info("Remove author %s <%s> from %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) + copilot_user_added = False bot_data = csv_writer.read_from_csv(f) bot_data_new = [] @@ -254,7 +257,15 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth for entry in bot_data: # keep bot entry only if it should not be removed if not is_github_noreply_author(entry[0], entry[1]): - bot_data_new.append(entry) + # unify copilot users if desired + if unify_copilot_users and entry[0] in known_copilot_users_extended: + if not copilot_user_added: + entry[0] = copilot_unified_name + entry[1] = copilot_unified_email + copilot_user_added = True + bot_data_new.append(entry) + else: + bot_data_new.append(entry) else: log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 3681c5d..113405a 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -30,6 +30,7 @@ from codeface.configuration import Configuration from csv_writer import csv_writer +from 
github_user_utils import known_copilot_users, generate_botname_variants def run(): # get all needed paths and arguments for the method call. @@ -137,25 +138,35 @@ def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_d for bot in known_bots: # (1) check if a known bot occurs in the GitHub issue data but has not been predicted - if bot[0] not in predicted_bots and bot[0] in user_data: + bot_variation_predicted_bots = containing_bot_variation(bot[0], predicted_bots) + bot_variation_user_data = containing_bot_variation(bot[0], user_data) + if bot_variation_predicted_bots is None and bot_variation_user_data is not None: # add the known bot as a bot to the bots list additional_bot = dict() - additional_bot["user"] = user_data[bot[0]] + additional_bot["user"] = user_data[bot_variation_user_data] additional_bot["prediction"] = "Bot" bot_data_reduced.append(additional_bot) log.info("Add known bot '{}' to bot data.".format(additional_bot["user"])) # (2) handle known bots that are already present in the bots list - elif bot[0] in predicted_bots and bot[0] in user_data: + elif bot_variation_predicted_bots is not None and bot_variation_user_data is not None: # make sure that this bot has also been predicited to be bot for predicted_bot in bot_data_reduced: - if predicted_bot["user"] == user_data[bot[0]]: + if predicted_bot["user"] == user_data[bot_variation_user_data]: predicted_bot["prediction"] = "Bot" - log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) + log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot_variation_user_data])) break + # get list of known agents and combine it with the list of known copilot users + copilot_users_variants = generate_botname_variants(known_copilot_users) + # get list of known agent names + known_agents_names = [agent[0] for agent in known_agents] + for copilot_user in copilot_users_variants: + if copilot_user not in known_agents_names: + known_agents.append([copilot_user]) 
+ for agent in known_agents: # (1) check if a known agent occurs in the GitHub issue data but has not been predicted @@ -218,16 +229,9 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): continue # get user information if available - if user[0] in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0]] - bot_reduced["prediction"] = user[-1] - bot_data_reduced.append(bot_reduced) - elif user[0] + "bot" in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0] + "bot"] - bot_reduced["prediction"] = user[-1] - bot_data_reduced.append(bot_reduced) - elif user[0] + "[bot]" in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0] + "[bot]"] + bot_variation = containing_bot_variation(user[0], user_buffer.keys()) + if bot_variation is not None: + bot_reduced["user"] = user_buffer[bot_variation] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: @@ -239,6 +243,27 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): return bot_data_reduced +def containing_bot_variation(botname, name_list): + """ + Helper function to return the variation of a given bot name that occurs in a list of names. + + :param botname: the bot name for which the variation should be returned + :param name_list: the list of names to be checked for containing the bot name or a variation of it + :return: the variation of the given bot name that occurs in the given list of names, or None if no such variation exists + """ + + if botname in name_list: + return botname + elif botname + "bot" in name_list: + return botname + "bot" + elif botname + "[bot]" in name_list: + return botname + "[bot]" + elif botname.replace("[", "").replace("]", "") in name_list: + return botname.replace("[", "").replace("]", "") + else: + return None + + def print_to_disk(bot_data, results_folder): """ Print bot data to file "bots.list" in result folder. 
diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 561a345..652b63b 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -24,7 +24,7 @@ ## # global variables containing all known copilot users and the name and mail adress copilot users will be assigned -known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agent[bot]"} copilot_unified_name = "Copilot" copilot_unified_email = "copilot@example.com" @@ -65,9 +65,7 @@ def generate_botname_variants(botnames): botname_variants = set() for botname in botnames: botname_variants.add(botname) - if botname.endswith("[bot]"): - botname_variants.add(botname[:-5] + "bot") - elif botname.endswith("bot"): - botname_variants.add(botname[:-3] + "[bot]") + botname = botname.replace("[", "").replace("]", "") + botname_variants.add(botname) return botname_variants From 105f88c5d22836ab9de332449c6ea0ae3fba8350 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 3 Mar 2026 15:43:11 +0100 Subject: [PATCH 22/26] Add copilot user unification for more events also add agents to bot handling, fix formatting for event_info_2 and subissues also fix a typo where strings would not have their quotes correctly removed Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 39 +++++++++++-------- bot_processing/bot_processing.py | 2 +- github_user_utils/__init__.py | 1 + github_user_utils/github_user_utils.py | 7 ++++ issue_processing/issue_processing.py | 8 ++-- 5 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 github_user_utils/__init__.py diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index d05015b..7aa9526 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -50,9 +50,11 
@@ from csv_writer import csv_writer -from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ +from github_user_utils.github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ is_github_noreply_author, github_user, github_email, \ - commit_added_event, mentioned_event, subscribed_event, generate_botname_variants + commit_added_event, mentioned_event, subscribed_event, \ + assigned_event, unassigned_event, review_requested_event, \ + review_request_removed_event, generate_botname_variants, quot_m known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## @@ -173,6 +175,9 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth if unify_copilot_users and commit[5] in known_copilot_users_extended: commit[5] = copilot_unified_name commit[6] = copilot_unified_email + if unify_copilot_users and commit[2] in known_copilot_users_extended: + commit[2] = copilot_unified_name + commit[3] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -191,19 +196,20 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] - for event in issue_data: # unify events to use a single copilot user for all events triggered by a known copilot user if unify_copilot_users and event[9] in known_copilot_users_extended: event[9] = copilot_unified_name event[10] = copilot_unified_email - if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users_extended: - # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user - event[13] = '"' + copilot_unified_name + '"' - elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users_extended: - # for mentioned/subscribed events, 
also unify the referenced user in event info 1 and 2 if it is a known copilot user - event[12] = '"' + copilot_unified_name + '"' - event[13] = '"' + copilot_unified_email + '"' + if unify_copilot_users and event[8] == commit_added_event and event[13][1:-1] in known_copilot_users_extended: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = quot_m + copilot_unified_name + quot_m + elif unify_copilot_users and event[8] in (mentioned_event, subscribed_event, assigned_event, unassigned_event, + review_requested_event, review_request_removed_event) \ + and event[12] in known_copilot_users_extended: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = copilot_unified_name + event[13] = quot_m + copilot_unified_email + quot_m # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 @@ -302,9 +308,6 @@ def run_postprocessing(conf, resdir, backup_data): bugs_jira_list = "bugs-jira.list" bots_list = "bots.list" - # When looking at elements originating from json lists, we need to consider quotation marks around the string - quot_m = "\"" - data_path = path.join(resdir, conf["project"], conf["tagging"]) # Correctly replace author 'GitHub ' in the commit data and in "commit_added" events of the @@ -395,8 +398,8 @@ def run_postprocessing(conf, resdir, backup_data): issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m # replace name in event info 2 if necessary - if person[4] == issue_event[13]: - issue_event[13] = person[1] + if quot_m + person[4] + quot_m == issue_event[13]: + issue_event[13] = quot_m + person[1] + quot_m csv_writer.write_to_csv(f, issue_data) @@ -463,8 +466,12 @@ def run_postprocessing(conf, resdir, backup_data): # the bot is already in the list, check if there are different predictions 
stored_bot = bot_names_and_emails[(bot[0], bot[1])] if stored_bot[2] != bot[2]: + # if either of the predictions is agent, keep agent + if (stored_bot[2] == "Agent" or bot[2] == "Agent"): + stored_bot[2] = "Agent" + bot_names_and_emails[(bot[0], bot[1])] = stored_bot # if either of the predictions is bot, keep bot - if (stored_bot[2] == "Bot" or bot[2] == "Bot"): + elif (stored_bot[2] == "Bot" or bot[2] == "Bot"): stored_bot[2] = "Bot" bot_names_and_emails[(bot[0], bot[1])] = stored_bot # otherwise, if either of the predictions is human, keep human diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 113405a..ad76d85 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -30,7 +30,7 @@ from codeface.configuration import Configuration from csv_writer import csv_writer -from github_user_utils import known_copilot_users, generate_botname_variants +from github_user_utils.github_user_utils import known_copilot_users, generate_botname_variants def run(): # get all needed paths and arguments for the method call. 
diff --git a/github_user_utils/__init__.py b/github_user_utils/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/github_user_utils/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 652b63b..20fa8d8 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -34,6 +34,13 @@ commit_added_event = "commit_added" mentioned_event = "mentioned" subscribed_event = "subscribed" +assigned_event = "assigned" +unassigned_event = "unassigned" +review_requested_event = "review_requested" +review_request_removed_event = "review_request_removed" + +# When looking at elements originating from json lists, we need to consider quotation marks around the string +quot_m = "\"" ## # UTILITY FUNCTIONS diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 25669cd..1513c06 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -41,7 +41,7 @@ from dateutil import parser as dateparser from csv_writer import csv_writer -from github_user_utils import copilot_unified_name +from github_user_utils.github_user_utils import copilot_unified_name # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} @@ -754,9 +754,9 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev event["event_info_1"] = issue["state_new"] # if event is a review comment, it can contain suggestions if "contains_suggestion" in event: - event["event_info_2"] = event["contains_suggestion"] + event["event_info_2"] = str(event["contains_suggestion"]) else: - event["event_info_2"] = False + event["event_info_2"] = str(False) elif event["event"] == "referenced" and not event["commit"] is None: # remove "referenced" events originating from commits @@ -934,7 +934,7 @@ def print_to_disk(issues, 
results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([issue["subIssues"]]), # components + json.dumps(issue["subIssues"]), # components event["event"], event["user"]["name"], event["user"]["email"], From 2ea7392061aab90b1a28aff18e718c4c3a8a6401 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 13:58:06 +0100 Subject: [PATCH 23/26] Add reason for conversation locking lock reason is saved in event_info_1 Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1513c06..1de194e 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -534,6 +534,10 @@ def merge_issue_events(issue_data, external_connected_events): connected_info["user"] = event["user"] connected_events[event["created_at"]] = connected_info + # if event is a locked event, save the lock reason in event_info_1 + if event["event"] == "locked": + event["event_info_1"] = event["lock_reason"] + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] From 977e86187e3097e25e2b7d22a3643ac6956a061c Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 14:09:27 +0100 Subject: [PATCH 24/26] Fix spelling and documentation docstrings should now more accurately reflect parameters and return values Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1de194e..4dc8c63 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -31,7 +31,6 @@ import sys import urllib from datetime 
import datetime, timedelta -import math import operator from codeface.cli import log @@ -234,7 +233,6 @@ def reformat_issues(issue_data): Re-arrange issue data structure. :param issue_data: the issue data to re-arrange - :return: the re-arranged issue data """ log.devinfo("Re-arranging Github issues...") @@ -299,7 +297,8 @@ def merge_issue_events(issue_data, external_connected_events): All issue events are merged together in the eventsList. This simplifies processing in later steps. :param issue_data: the issue data from which the events shall be merged - :return: the issue data with merged eventsList + :param external_connected_events: a dict to store connected events to external issues + :return: a filtered dict of connected events for future reconstruction """ log.info("Merge issue events ...") @@ -573,17 +572,17 @@ def filter_connected_events(key, value, external_connected_events): # if 2 connected events exist, matching them is trivial if num_issues == 2: return True - occurences = {x: value["issues"].count(x) for x in set(value["issues"])} + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} # otherwise, if it is an even number, check if it can be easily matched, # meaning that exactly half the events occur in the same issue - if num_issues % 2 == 0 and num_issues/2 in occurences.values(): + if num_issues % 2 == 0 and num_issues/2 in occurrences.values(): # duplicate issue list for matching the issues later value["multi_issues_copy"] = list(value["issues"]) return True # if it is an odd number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue - if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values(): - for sub_key, sub_value in occurences.iteritems(): + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurrences.values(): + for sub_key, sub_value in occurrences.iteritems(): # then, assign one of them as an external connected event and proceed as in previous case 
if sub_value == (num_issues + 1)/2: new_entry = dict() @@ -603,7 +602,8 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev Re-format event information dependent on the event type. :param issue_data: the data of all issues that shall be re-formatted - :return: the issue data with updated event information + :param filtered_connected_events: the dict of connected events which can be reconstructed + :param external_connected_events: the dict of connected events to external issues """ log.info("Update event information ...") @@ -643,9 +643,9 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev # and we only have 2 issues in the list, connect to the other issue event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] else: - # and we have more than two issues, count each issue's occurences - occurences = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurences[issue["number"]] == max(occurences.values()): + # and we have more than two issues, count each issue's occurrences + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurrences[issue["number"]] == max(occurrences.values()): # if our issue is the most common one, that means it is the common denominator # for all connected events at this time # so this event connects to any other issue @@ -655,7 +655,7 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev event["event_info_1"] = number else: # otherwise, connect this event to the common denominator - event["event_info_1"] = max(occurences, key=occurences.get) + event["event_info_1"] = max(occurrences, key=occurrences.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -785,7 +785,6 @@ def insert_user_data(issues, conf, resdir): :param issues: the issues to retrieve user data from :param conf: the 
project configuration :param resdir: the directory in which the username-to-user-list should be dumped - :return: the updated issue data """ log.info("Syncing users with ID service...") From d7ea47eed23be4d060ed35b6354723116a4acaa1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 14:12:43 +0100 Subject: [PATCH 25/26] Remove old state from jira state_updated events For consistency with github events Signed-off-by: Leo Sendelbach --- issue_processing/jira_issue_processing.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 9d384c3..7b85076 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -464,21 +464,18 @@ def load_issues_via_api(issues, persons, url, referenced_bys): for change in changelog.histories: # default values for state and resolution - old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved" + new_state, old_resolution, new_resolution = "open", "unresolved", "unresolved" # all changes in the issue changelog are checked if they contain a useful information for item in change.items: # state_updated event gets created and added to the issue history if item.field == "status": - if item.fromString is not None: - old_state = item.fromString.lower() if item.toString is not None: new_state = item.toString.lower() history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state - history["event_info_2"] = old_state if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: From 8ab46a0103b7926e890032c49c9c248c7bb2622e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 13 Mar 2026 11:32:36 +0100 Subject: [PATCH 26/26] Fix jira processing error previously removed event_info_2 for state_updated event, leading to crashes of the issue processing. 
Now, it instead contains an empty string. Also fix a minor spelling mistake Signed-off-by: Leo Sendelbach --- bot_processing/bot_processing.py | 2 +- issue_processing/jira_issue_processing.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index ad76d85..d5800b3 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -82,7 +82,7 @@ def load_bot_data(bot_file, header = True): # check if file exists and exit early if not if not os.path.exists(bot_file): - log.error("Bot file '{}' does not exist! Exiting early...".format(bot_file)) + log.error("Bot/Agent file '{}' does not exist (can be empty)! Exiting early...".format(bot_file)) sys.exit(-1) bot_data = csv_writer.read_from_csv(bot_file, delimiter=',') diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 7b85076..3b12a93 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -18,7 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock # Copyright 2023 by Maximilian Löffler -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. @@ -476,6 +476,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state + history["event_info_2"] = "" if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: