From 65f0a8770339427d2fbddbd31ba1dde2110c003d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Mon, 25 Aug 2025 13:35:24 +0200 Subject: [PATCH 01/26] Add commit author of 'commit_added' events to event info This allows for reconstruction of correct commit author if user is github Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 8 ++++++-- issue_processing/issue_processing.py | 8 ++++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 13b1e38..994fcd2 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -178,7 +178,7 @@ def is_github_noreply_author(name, email): commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] for event in issue_data: @@ -186,12 +186,16 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. 
As we diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..410769c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -361,6 +361,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -745,6 +746,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -758,6 +762,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) From 2e67f0df16b40af75c688b12744e0103c40dff57 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 26 Aug 2025 10:56:56 +0200 Subject: [PATCH 02/26] Update Copyright headers also added one comment for clarity Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 2 ++ issue_processing/issue_processing.py | 1 + 2 files changed, 3 insertions(+) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 994fcd2..42ca247 
100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -186,6 +187,7 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] + # extract author name from event info 2 while cutting excess '"' name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 410769c..72cf331 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. From c4f4af51251da5ec1bab4c4d233ec454c70b079a Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Thu, 25 Sep 2025 14:37:37 +0200 Subject: [PATCH 03/26] Add connected events reconstruction also save merge commits reconstruction of connected events is done by first saving all connected events that occured at the same time. 
Then, it is possible to match connected events iff: - half of the involved issues are equal, meaning that one issue is connected to multiple others - half rounded up of the involved isses are equal, meaning that we have one external connected event and then the previous case with the remaining issues Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 97 ++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 72cf331..7e13f9b 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -31,6 +31,7 @@ import sys import urllib from datetime import datetime, timedelta +import math import operator from codeface.cli import log @@ -53,6 +54,9 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" +filtered_connected_events = dict() +external_connected_events = dict() + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -294,6 +298,7 @@ def merge_issue_events(issue_data): log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -490,6 +495,28 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = issue["user"] + connected_events[event["created_at"]] = connected_info + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + 
issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -501,6 +528,10 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + global filtered_connected_events + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1]), connected_events.iteritems())) + # updates all the issues by the temporarily stored referenced_by events for key, value in issue_data_to_update.iteritems(): for issue in issue_data: @@ -510,6 +541,41 @@ def merge_issue_events(issue_data): return issue_data +def filter_connected_events(key, value): + num_issues = len(value["issues"]) + global external_connected_events + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 and math.ceil(num_issues/2) in occurances.values(): + for sub_key, sub_value in occurances.iteritems(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == math.ceil(num_issues/2): + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + 
external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # no other variants can be easily matched + return False + + def reformat_events(issue_data): """ Re-format event information dependent on the event type. @@ -540,6 +606,37 @@ def reformat_events(issue_data): if not event["ref_target"] is None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + external = False + # check if event is external + for key, value in external_connected_events.iteritems(): + if issue["number"] in value["issues"]: + if key == event["created_at"]: + external = True + event["event_info_1"] = "external" + value["issues"].remove(issue["number"]) + # if so, skip the next checks + if external: + continue + # otherwise, it must be internal + for key, value in filtered_connected_events.iteritems(): + if issue["number"] in value["issues"]: + if key == event["created_at"]: + if len(value["issues"]) == 2: + # if only 2 events occured at this timestamp, matching the issues is trivial + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + # otherwise, if current issue is the centerpiece of all connected events, use previous copy to match issues + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # if current issue is not the centerpiece, connect it to the centerpiece + event["event_info_1"] = max(occurances, key = occurances.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in 
issue_data: From e77b009f277f48fa63640cf195dca1b2d4273330 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 14:43:33 +0200 Subject: [PATCH 04/26] Remove unnecessary returns of issue data since data is modified in-place, return of input data is not needed Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 7e13f9b..52e262d 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -79,13 +79,13 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + merge_issue_events(issues) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -284,7 +284,7 @@ def reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return def merge_issue_events(issue_data): @@ -538,7 +538,7 @@ def merge_issue_events(issue_data): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data + return def filter_connected_events(key, value): @@ -747,7 +747,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -884,7 +884,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") 
csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): From c28b1385504a63a8cc4e5d64976944d890cb7a34 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 14:47:19 +0200 Subject: [PATCH 05/26] Add reasons to reopen/closed events ALso add commit hash if closed by commit Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 52e262d..e5a40e7 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -654,13 +654,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": From 488626e9b55313339502d01e13cb7a5b2def0126 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:01:48 +0200 Subject: [PATCH 06/26] Add GitHub issue types also rename 'new feature' to 'feature' Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 7 +++++-- issue_processing/jira_issue_processing.py | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index e5a40e7..adf8b7c 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -43,7 +43,7 @@ 
from csv_writer import csv_writer # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -243,7 +243,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index d9748ae..6f60beb 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -300,9 +300,12 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): link = issue_x.getElementsByTagName("link")[0] issue["url"] = link.firstChild.data - type = issue_x.getElementsByTagName("type")[0] - issue["type"] = type.firstChild.data - issue["type_list"] = ["issue", str(type.firstChild.data.lower())] + type = issue_x.getElementsByTagName("type")[0].firstChild.data + # rename 'new feature' type to 'feature' to be in line with the github original issue type + if type == "New Feature": + type = "Feature" + issue["type"] = type + issue["type_list"] = ["issue", str(type.lower())] status = issue_x.getElementsByTagName("status")[0] issue["state"] = status.firstChild.data From 2894b0d95c8f7063444ef8bf3c5b14569714416f Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:26:55 +0200 Subject: [PATCH 07/26] Simplify loops for reconstruction of connections also remove duplicates from type list Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 67 
+++++++++++----------------- 1 file changed, 26 insertions(+), 41 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index adf8b7c..dce85f1 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -54,9 +54,6 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" -filtered_connected_events = dict() -external_connected_events = dict() - def run(): # get all needed paths and arguments for the method call. parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -81,9 +78,10 @@ def run(): # 2) re-format the issues reformat_issues(issues) # 3) merges all issue events into one list - merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) # 6) dump result to disk @@ -290,7 +288,7 @@ def reformat_issues(issue_data): return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. 
@@ -532,8 +530,7 @@ def merge_issue_events(issue_data): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # filter out connected events which cannot be perfectly matched - global filtered_connected_events - filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1]), connected_events.iteritems())) + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) # updates all the issues by the temporarily stored referenced_by events for key, value in issue_data_to_update.iteritems(): @@ -541,12 +538,11 @@ def merge_issue_events(issue_data): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return + return filtered_connected_events -def filter_connected_events(key, value): +def filter_connected_events(key, value, external_connected_events): num_issues = len(value["issues"]) - global external_connected_events # if only a single connected event exists at this time, it has to be connecting to an external issue if num_issues == 1: external_connected_events[key] = value @@ -579,7 +575,7 @@ def filter_connected_events(key, value): return False -def reformat_events(issue_data): +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. 
@@ -611,34 +607,23 @@ def reformat_events(issue_data): # reconstruction of connections if event["event"] == "connected": - external = False - # check if event is external - for key, value in external_connected_events.iteritems(): - if issue["number"] in value["issues"]: - if key == event["created_at"]: - external = True - event["event_info_1"] = "external" - value["issues"].remove(issue["number"]) - # if so, skip the next checks - if external: - continue - # otherwise, it must be internal - for key, value in filtered_connected_events.iteritems(): - if issue["number"] in value["issues"]: - if key == event["created_at"]: - if len(value["issues"]) == 2: - # if only 2 events occured at this timestamp, matching the issues is trivial - event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] - else: - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurances[issue["number"]] == max(occurances.values()): - # otherwise, if current issue is the centerpiece of all connected events, use previous copy to match issues - number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) - value["multi_issues_copy"].remove(number) - event["event_info_1"] = number - else: - # if current issue is not the centerpiece, connect it to the centerpiece - event["event_info_1"] = max(occurances, key = occurances.get) + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else 
value["issues"][1] + else: + occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurances[issue["number"]] == max(occurances.values()): + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + event["event_info_1"] = max(occurances, key = occurances.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -674,7 +659,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList From 7ae507972788a998f7f51dd37b224e09841641ff Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 14 Oct 2025 17:28:23 +0200 Subject: [PATCH 08/26] Add subissues to results csv using empty line reserved for jira components Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index dce85f1..1fd3d24 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -902,7 +902,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps([issue["subIssues"]]), # components event["event"], event["user"]["name"], event["user"]["email"], From f44a8b729dbd1ee1a6d80555218fa4d5fa97cd23 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 21 Oct 2025 14:10:18 +0200 Subject: [PATCH 09/26] Remove unneccesary return value also added copyright header Signed-off-by: Leo Sendelbach --- issue_processing/jira_issue_processing.py | 5 +++-- 1 file changed, 3 
insertions(+), 2 deletions(-) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 6f60beb..9d384c3 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock # Copyright 2023 by Maximilian Löffler +# Copyright 2025 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. @@ -125,7 +126,7 @@ def run(): referenced_issue["history"].append(referenced_by) # 5) update user data with Codeface database - processed_issues = insert_user_data(processed_issues, __conf) + insert_user_data(processed_issues, __conf) # 6) dump result to disk print_to_disk(processed_issues, __resdir) # # 7) export for Gephi @@ -689,7 +690,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): event["event_info_2"] = assigned_user["email"] log.debug("number of issues after insert_user_data: '{}'".format(len(issues))) - return issues + return def print_to_disk(issues, results_folder): From e46af3fed9155591b78fab19f30db420cf4900f0 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 21 Oct 2025 14:11:29 +0200 Subject: [PATCH 10/26] Add comments also minor fixes and removal of math.ceil Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 36 +++++++++++++++++++++------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1fd3d24..3b7ca64 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -77,10 +77,15 @@ def run(): issues = load(__srcdir) # 2) re-format the issues reformat_issues(issues) - # 3) merges all issue events into one list + # create an empty dict for external connected events, meaning connected + # events that connect to an issue in another repository external_connected_events = dict() + # 3) 
merges all issue events into one list + # this step returns a dict containing all connected events that can be matched to the correct issues later filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues + # this step also reconstructs the connections previously stored + # in 'external_connected_events' and 'filtered_connected_events' reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) @@ -503,16 +508,20 @@ def merge_issue_events(issue_data, external_connected_events): # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction if event["event"] == "connected": if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list connected_events[event["created_at"]]["issues"].append(issue["number"]) elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) event["created_at"] = 
subtract_seconds_from_time(event["created_at"], -1) else: + # if there is no connected event yet at this timestamp, create a new entry for this event connected_info = dict() connected_info["issues"] = [issue["number"]] connected_info["user"] = issue["user"] @@ -550,19 +559,19 @@ def filter_connected_events(key, value, external_connected_events): # if 2 connected events exist, matching them is trivial if num_issues == 2: return True - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} + occurences = {x: value["issues"].count(x) for x in set(value["issues"])} # otherwise, if it is an even number, check if it can be easily matched, # meaning that exactly half the events occur in the same issue - if num_issues % 2 == 0 and num_issues/2 in occurances.values(): + if num_issues % 2 == 0 and num_issues/2 in occurences.values(): # duplicate issue list for matching the issues later value["multi_issues_copy"] = list(value["issues"]) return True # if it is an odd number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue - if num_issues % 2 == 1 and math.ceil(num_issues/2) in occurances.values(): - for sub_key, sub_value in occurances.iteritems(): + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values(): + for sub_key, sub_value in occurences.iteritems(): # then, assign one of them as an external connected event and proceed as in previous case - if sub_value == math.ceil(num_issues/2): + if sub_value == (num_issues + 1)/2: new_entry = dict() new_entry["user"] = value["user"] new_entry["issues"] = [sub_key] @@ -609,21 +618,30 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev if event["event"] == "connected": if event["created_at"] in external_connected_events \ and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list 
event["event_info_1"] = "external" external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) elif event["created_at"] in filtered_connected_events \ and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event value = filtered_connected_events[event["created_at"]] if len(value["issues"]) == 2: + # and we only have 2 issues in the list, connect to the other issue event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] else: - occurances = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurances[issue["number"]] == max(occurances.values()): + # and we have more than two issues, count each issue's occurences + occurences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurences[issue["number"]] == max(occurences.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) value["multi_issues_copy"].remove(number) event["event_info_1"] = number else: - event["event_info_1"] = max(occurances, key = occurances.get) + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurences, key=occurences.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: From cfaba7169dd3cffb6914d5bc0b5679dcd701c58e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 16:28:42 +0100 Subject: [PATCH 11/26] Add new json field for suggestions to result comments now each have a boolean field that describes whether the comment contains a suggestion or not Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 3b7ca64..c6648c2 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -742,7 +742,10 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev # "state_new" and "resolution" of the issue give the information about the state and the resolution of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + if "contains_suggestion" in event: + event["event_info_2"] = event["contains_suggestion"] + else: + event["event_info_2"] = False elif event["event"] == "referenced" and not event["commit"] is None: # remove "referenced" events originating from commits From fae4c47f98ca8987bd20f202e4cebeb4c1c2dd42 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 16:36:11 +0100 Subject: [PATCH 12/26] Improve documentation dicts for reconstructing connected events are now better explained and the comments do not disruot the workflow in the run function anymore Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index c6648c2..97298cb 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -77,15 +77,10 @@ def run(): issues = load(__srcdir) # 2) re-format the issues reformat_issues(issues) - # create an empty dict for external connected events, meaning connected - # events that connect to an issue in another repository - external_connected_events = dict() # 3) merges all issue events into one list - # this step returns a dict containing all connected events that can be matched to the correct issues later + external_connected_events = dict() 
filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - # this step also reconstructs the connections previously stored - # in 'external_connected_events' and 'filtered_connected_events' reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list insert_user_data(issues, __conf, __resdir) @@ -539,6 +534,8 @@ def merge_issue_events(issue_data, external_connected_events): issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the external_connected_event dict later filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) # updates all the issues by the temporarily stored referenced_by events @@ -547,6 +544,7 @@ def merge_issue_events(issue_data, external_connected_events): if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] + # return the filtered_connected_events dict for later reconstruction return filtered_connected_events From 89f0f0160d78635e824ad09f13904c971f0cbb50 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 4 Nov 2025 12:46:51 +0100 Subject: [PATCH 13/26] Incorporate requested changes includes: - updated comments - spelling mistake - fix for potential crash if script is used on old data Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 97298cb..0ff891f 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -261,7 +261,7 @@ def 
reformat_issues(issue_data): if issue["relatedCommits"] is None: issue["relatedCommits"] = [] - # if an issue has no reviewsList, an empty Listgets created + # if an issue has no reviewsList, an empty List gets created if issue["reviewsList"] is None: issue["reviewsList"] = [] @@ -269,6 +269,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet if issue["closed_at"] is None: issue["closed_at"] = "" @@ -737,9 +741,10 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev issue["eventsList"].append(resolution_event) elif event["event"] == "commented": - # "state_new" and "resolution" of the issue give the information about the state and the resolution of + # "state_new" of the issue gives the information about the state of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] + # if event is a review comment, it can contain suggestions if "contains_suggestion" in event: event["event_info_2"] = event["contains_suggestion"] else: From 73d5f64ce60dc2056e5469557cf97e62825e3cde Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 13:41:27 +0100 Subject: [PATCH 14/26] Add copilot user unification to author postprocessing author postprocessing now also contains a list of known copilot use names that can be extended to unify more different copilot users Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 42ca247..d46074a 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py 
@@ -14,7 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -51,6 +51,15 @@ from csv_writer import csv_writer +## +# GLOBAL VARIABLES +## + +# global variable containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + ## # RUN POSTPROCESSING ## @@ -79,7 +88,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. 
The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -183,6 +192,11 @@ def is_github_noreply_author(name, email): issue_data_new = [] for event in issue_data: + # unify events to use a single copilot user for all events triggered by a known copilot user + if unify_copilot_users and event[9] in known_copilot_users: + event[9] = copilot_unified_name + event[10] = copilot_unified_email + # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 From befbee3e09b2190565debe3393a6c45f4cc3a7e1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 13:48:43 +0100 Subject: [PATCH 15/26] Assign copilot user data in case of specific events the events 'copilot_work_started' and 'copilot_work_finished' now always have the standard copilot user data Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 0ff891f..4144db1 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,7 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. @@ -54,6 +54,9 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" +# Copilot username to be assigned in specific copilot events +copilot_username = "Copilot" + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -488,6 +491,12 @@ def merge_issue_events(issue_data, external_connected_events): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_username + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: From 23f0dd6e16d4331ca5dc741503462ce5f58661e8 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:27:23 +0100 Subject: [PATCH 16/26] Add documentation for new copilot user unification Method doc updated to reflect new functionality Signed-off-by: Leo Sendelbach --- author_postprocessing/author_postprocessing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index d46074a..42d602c 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -99,7 +99,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. + by this author. This method also unifies all known copilot users into a single user if desired. 
:param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -107,6 +107,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ github_user = "GitHub" github_email = "noreply@github.com" From eb53c790009af5a71c7d9504ecad0b737f669a22 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 27 Jan 2026 15:48:39 +0100 Subject: [PATCH 17/26] Fix connected event assignment previously, the creator of the issues was falsely matched to the connected event instead of the user triggering the event Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 4144db1..dc05682 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -532,7 +532,7 @@ def merge_issue_events(issue_data, external_connected_events): # if there is no connected event yet at this timestamp, create a new entry for this event connected_info = dict() connected_info["issues"] = [issue["number"]] - connected_info["user"] = issue["user"] + connected_info["user"] = event["user"] connected_events[event["created_at"]] = connected_info # merge events, relatedCommits, relatedIssues and comment lists From 9e6ccceb3caa91966ef0a9f56a696a9fe7a609e6 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:06:28 +0100 Subject: [PATCH 18/26] Unify copilot users in all files unification now done on all files, which should prevent any issues arising from unknown authors during anonymization also move all global variables to a new utils file 
Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 69 +++++++++++-------- github_user_utils/github_user_utils.py | 54 +++++++++++++++ issue_processing/issue_processing.py | 5 +- 3 files changed, 95 insertions(+), 33 deletions(-) create mode 100644 github_user_utils/github_user_utils.py diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 42d602c..c4a9e24 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -50,15 +50,10 @@ from csv_writer import csv_writer +from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ + is_github_noreply_author, github_user, github_email, \ + commit_added_event, mentioned_event, subscribed_event -## -# GLOBAL VARIABLES -## - -# global variable containing all known copilot users and the name and mail adress copilot users will be assigned -known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} -copilot_unified_name = "Copilot" -copilot_unified_email = "copilot@example.com" ## # RUN POSTPROCESSING @@ -109,25 +104,6 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param bots_list: file name of the corresponding bot data :param unify_copilot_users: whether to unify known copilot users into a single user """ - github_user = "GitHub" - github_email = "noreply@github.com" - commit_added_event = "commit_added" - mentioned_event = "mentioned" - subscribed_event = "subscribed" - - """ - Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". 
- There are two options in Codeface how this can happen: - (1) Username is "GitHub" and e-mail address is "noreply@github.com" - (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" - - :param name: the name of the author to be checked - :param email: the email address of the author to be checked - :return: whether the given (name, email) pair belongs to the "GitHub " author - """ - def is_github_noreply_author(name, email): - return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) - # Check for all files in the result directory of the project whether they need to be adjusted for filepath, dirnames, filenames in walk(data_path): @@ -136,20 +112,32 @@ def is_github_noreply_author(name, email): if authors_list in filenames: f = path.join(filepath, authors_list) log.info("Remove author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) author_data = csv_writer.read_from_csv(f) author_data_new = [] - + copilot_user_added = False for author in author_data: # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): - author_data_new.append(author) + # unify copilot author if desired + if unify_copilot_users and author[1] in known_copilot_users: + if not copilot_user_added: + author[1] = copilot_unified_name + author[2] = copilot_unified_email + copilot_user_added = True + author_data_new.append(author) + else: + author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) # (2) Remove e-mails from author 'GitHub ' from all emails.list files if emails_list in filenames: f = path.join(filepath, emails_list) log.info("Remove emails from author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s 
...", copilot_unified_name, copilot_unified_email, f) email_data = csv_writer.read_from_csv(f) email_data_new = [] @@ -157,6 +145,10 @@ def is_github_noreply_author(name, email): for email in email_data: # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): + # unify copilot users if desired + if unify_copilot_users and email[0] in known_copilot_users: + email[0] = copilot_unified_name + email[1] = copilot_unified_email email_data_new.append(email) else: log.warn("Remove email %s as it was sent by %s <%s>.", email[2], email[0], email[1]) @@ -167,6 +159,8 @@ def is_github_noreply_author(name, email): if commits_list in filenames: f = path.join(filepath, commits_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) commit_data = csv_writer.read_from_csv(f) for commit in commit_data: @@ -175,6 +169,10 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(commit[5], commit[6]): commit[5] = commit[2] commit[6] = commit[3] + # unify copilot author if desired + if unify_copilot_users and commit[5] in known_copilot_users: + commit[5] = copilot_unified_name + commit[6] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -183,6 +181,8 @@ def is_github_noreply_author(name, email): if issues_github_list in filenames: f = path.join(filepath, issues_github_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) issue_data = csv_writer.read_from_csv(f) # read commit data @@ -197,7 +197,13 @@ def is_github_noreply_author(name, email): if unify_copilot_users and event[9] in known_copilot_users: event[9] = copilot_unified_name event[10] = copilot_unified_email - + if event[8] == 
commit_added_event and event[13][-1:1] in known_copilot_users: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = '"' + copilot_unified_name + '"' + elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = '"' + copilot_unified_name + '"' + event[13] = '"' + copilot_unified_email + '"' # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 @@ -377,6 +383,9 @@ def run_postprocessing(conf, resdir, backup_data): if person[4] == issue_event[12] and (quot_m + person[5] + quot_m) == issue_event[13]: issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m + # replace name in event info 2 if necessary + if person[4] == issue_event[13]: + issue_event[13] = person[1] csv_writer.write_to_csv(f, issue_data) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py new file mode 100644 index 0000000..20a3aa3 --- /dev/null +++ b/github_user_utils/github_user_utils.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+# +# Copyright 2026 by Leo Sendelbach +# All Rights Reserved. +""" +This file serves as a collection of global variables and utility functions, which are used throughout the +issue data extraction and post-processing, in particular for the processing of GitHub and Copilot user data. +""" + +## +# GLOBAL VARIABLES +## + +# global variables containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + +## global variables for the GitHub author +github_user = "GitHub" +github_email = "noreply@github.com" +commit_added_event = "commit_added" +mentioned_event = "mentioned" +subscribed_event = "subscribed" + +## +# UTILITY FUNCTIONS +## + +def is_github_noreply_author(name, email): + """ + Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". + There are two options in Codeface how this can happen: + (1) Username is "GitHub" and e-mail address is "noreply@github.com" + (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" + + :param name: the name of the author to be checked + :param email: the email address of the author to be checked + :return: whether the given (name, email) pair belongs to the "GitHub " author + """ + + return (name == github_user and (email == github_email or email == (github_user + "." 
+ github_email))) \ No newline at end of file diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index dc05682..25669cd 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -41,6 +41,7 @@ from dateutil import parser as dateparser from csv_writer import csv_writer +from github_user_utils import copilot_unified_name # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} @@ -54,8 +55,6 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" -# Copilot username to be assigned in specific copilot events -copilot_username = "Copilot" def run(): # get all needed paths and arguments for the method call. @@ -494,7 +493,7 @@ def merge_issue_events(issue_data, external_connected_events): # if event is a specific copilot event, assign the copilot user data if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": event["user"]["name"] = None - event["user"]["username"] = copilot_username + event["user"]["username"] = copilot_unified_name event["user"]["email"] = "" # if event dismisses a review, we can determine the original state of the corresponding review From a3558a6e8a8d208dcb39135176c02c80326034b4 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:14:11 +0100 Subject: [PATCH 19/26] Add support for 'known agents' Known agentsc such as 'copilot' or 'claude' can now be read, similar to known bots. They will be flagged as agents during bot processing. 
Signed-off-by: Leo Sendelbach --- bot_processing/bot_processing.py | 34 ++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..ba3fa61 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. @@ -52,6 +53,7 @@ def run(): # (the known bots file is the file in which known bots have been added manually and project independent) __confdir = os.path.join(args.resdir, os.path.dirname(args.config)) __known_bots_file = os.path.abspath(os.path.join(__confdir, "known_github_bots.list")) + __known_agents_file = os.path.abspath(os.path.join(__confdir, "known_github_agents.list")) # run processing of bot data: # 1) load bot data @@ -59,7 +61,7 @@ def run(): # 2) load user data users = load_user_data(os.path.join(__resdir, "usernames.list")) # 3) update bot data with user data and additionally add known bots if they occur in the project - bots = add_user_data(bots, users, __known_bots_file) + bots = add_user_data(bots, users, __known_bots_file, __known_agents_file) # 4) dump result to disk print_to_disk(bots, __resdir) @@ -111,12 +113,13 @@ def load_user_data(user_data_file): return user_data -def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_reduced): +def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_data, bot_data_reduced): """ Check whether there are known bots occurring in the project. If so, add them to the bots list or update the bots list accordingly. 
:param known_bots_file: the file path to the list of known bot data + :param known_agents_file: the file path to the list of known agent data :param bot_data: the bot data originating from the bot prediction :param user_data: a dictionary from the issue data which maps GitHub usernames to authors :param bot_data_reduced: the bot data after mapping GitHub user names to authors @@ -126,6 +129,7 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red # Read the list of known bots known_bots = load_bot_data(known_bots_file, header = False) + known_agents = load_bot_data(known_agents_file, header = False) # Get the GitHub usernames of the bots predicted to be a bot predicted_bots = [bot[0] if len(bot) > 0 else "" for bot in bot_data] @@ -152,11 +156,33 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) break + for agent in known_agents: + + # (1) check if a known agent occurs in the GitHub issue data but has not been predicted + if agent[0] not in predicted_bots and agent[0] in user_data: + + # add the known agent as a bot to the bots list + additional_agent = dict() + additional_agent["user"] = user_data[agent[0]] + additional_agent["prediction"] = "Agent" + bot_data_reduced.append(additional_agent) + log.info("Add known agent '{}' to bot data.".format(additional_agent["user"])) + + # (2) handle known agents that are already present in the bots list + elif agent[0] in predicted_bots and agent[0] in user_data: + + # make sure that this bot has also been predicited to be an agent + for predicted_bot in bot_data_reduced: + if predicted_bot["user"] == user_data[agent[0]]: + predicted_bot["prediction"] = "Agent" + log.info("Mark user '{}' as agent in the bot data.".format(user_data[agent[0]])) + break + # return the updated bot data return bot_data_reduced -def add_user_data(bot_data, user_data, known_bots_file): +def 
add_user_data(bot_data, user_data, known_bots_file, known_agents_file): """ Add user data to bot data, i.e., replace username by name and e-mail. In addition, check in the global bots list whether there are authors in the projects which are @@ -200,7 +226,7 @@ def add_user_data(bot_data, user_data, known_bots_file): log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly - bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) + bot_data_reduced = check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_buffer, bot_data_reduced) return bot_data_reduced From 0bde8a0e54940b7a6798da503a0af805a3d3afe9 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Wed, 18 Feb 2026 14:31:59 +0100 Subject: [PATCH 20/26] Add better bot name variant support Add a helper function for creating bot name variants utilizing either '[bot]' or 'bot' suffix. Also update bot processing to check user buffer for all variants. 
Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 16 +++++++------- bot_processing/bot_processing.py | 8 +++++++ github_user_utils/github_user_utils.py | 21 ++++++++++++++++++- 3 files changed, 36 insertions(+), 9 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index c4a9e24..7168e00 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -52,9 +52,9 @@ from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ is_github_noreply_author, github_user, github_email, \ - commit_added_event, mentioned_event, subscribed_event - + commit_added_event, mentioned_event, subscribed_event, generate_botname_variants +known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## # RUN POSTPROCESSING ## @@ -122,7 +122,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): # unify copilot author if desired - if unify_copilot_users and author[1] in known_copilot_users: + if unify_copilot_users and author[1] in known_copilot_users_extended: if not copilot_user_added: author[1] = copilot_unified_name author[2] = copilot_unified_email @@ -146,7 +146,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): # unify copilot users if desired - if unify_copilot_users and email[0] in known_copilot_users: + if unify_copilot_users and email[0] in known_copilot_users_extended: email[0] = copilot_unified_name email[1] = copilot_unified_email email_data_new.append(email) @@ -170,7 +170,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth commit[5] = commit[2] commit[6] = commit[3] # unify 
copilot author if desired - if unify_copilot_users and commit[5] in known_copilot_users: + if unify_copilot_users and commit[5] in known_copilot_users_extended: commit[5] = copilot_unified_name commit[6] = copilot_unified_email @@ -194,13 +194,13 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth for event in issue_data: # unify events to use a single copilot user for all events triggered by a known copilot user - if unify_copilot_users and event[9] in known_copilot_users: + if unify_copilot_users and event[9] in known_copilot_users_extended: event[9] = copilot_unified_name event[10] = copilot_unified_email - if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users: + if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users_extended: # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user event[13] = '"' + copilot_unified_name + '"' - elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users: + elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users_extended: # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user event[12] = '"' + copilot_unified_name + '"' event[13] = '"' + copilot_unified_email + '"' diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index ba3fa61..3681c5d 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -222,6 +222,14 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): bot_reduced["user"] = user_buffer[user[0]] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) + elif user[0] + "bot" in user_buffer.keys(): + bot_reduced["user"] = user_buffer[user[0] + "bot"] + bot_reduced["prediction"] = user[-1] + bot_data_reduced.append(bot_reduced) + elif user[0] + "[bot]" in 
user_buffer.keys(): + bot_reduced["user"] = user_buffer[user[0] + "[bot]"] + bot_reduced["prediction"] = user[-1] + bot_data_reduced.append(bot_reduced) else: log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 20a3aa3..561a345 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -51,4 +51,23 @@ def is_github_noreply_author(name, email): :return: whether the given (name, email) pair belongs to the "GitHub " author """ - return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) \ No newline at end of file + return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) + +def generate_botname_variants(botnames): + """ + Helper function to generate variants of bot names, which are used in the list of + known bots and agents as well as during author postprocessing. + + :param botnames: the list of bot names for which variants should be generated + :return: a set of bot name variants + """ + + botname_variants = set() + for botname in botnames: + botname_variants.add(botname) + if botname.endswith("[bot]"): + botname_variants.add(botname[:-5] + "bot") + elif botname.endswith("bot"): + botname_variants.add(botname[:-3] + "[bot]") + + return botname_variants From 1776a13fb2ea6c86695c6215b55148dd7965e764 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 20 Feb 2026 17:17:46 +0100 Subject: [PATCH 21/26] Add better bot name handling Add a helper function that given a botname and a list of names, returns which bot name variant is contained in the list (or None). This is used whenever we check if a known bot is in the userdata or has been predicted to be a bot, and means that botnames in the known_bots file do not need to be duplicated for each variant. 
Also, automatically add all known copilot users to the known_agents list, and then unify those during author postprocessing. Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 13 ++++- bot_processing/bot_processing.py | 55 ++++++++++++++----- github_user_utils/github_user_utils.py | 8 +-- 3 files changed, 55 insertions(+), 21 deletions(-) diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 7168e00..d05015b 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -247,6 +247,9 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth if bots_list in filenames: f = path.join(filepath, bots_list) log.info("Remove author %s <%s> from %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) + copilot_user_added = False bot_data = csv_writer.read_from_csv(f) bot_data_new = [] @@ -254,7 +257,15 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth for entry in bot_data: # keep bot entry only if it should not be removed if not is_github_noreply_author(entry[0], entry[1]): - bot_data_new.append(entry) + # unify copilot users if desired + if unify_copilot_users and entry[0] in known_copilot_users_extended: + if not copilot_user_added: + entry[0] = copilot_unified_name + entry[1] = copilot_unified_email + copilot_user_added = True + bot_data_new.append(entry) + else: + bot_data_new.append(entry) else: log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 3681c5d..113405a 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -30,6 +30,7 @@ from codeface.configuration import Configuration from csv_writer import csv_writer +from 
github_user_utils import known_copilot_users, generate_botname_variants def run(): # get all needed paths and arguments for the method call. @@ -137,25 +138,35 @@ def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_d for bot in known_bots: # (1) check if a known bot occurs in the GitHub issue data but has not been predicted - if bot[0] not in predicted_bots and bot[0] in user_data: + bot_variation_predicted_bots = containing_bot_variation(bot[0], predicted_bots) + bot_variation_user_data = containing_bot_variation(bot[0], user_data) + if bot_variation_predicted_bots is None and bot_variation_user_data is not None: # add the known bot as a bot to the bots list additional_bot = dict() - additional_bot["user"] = user_data[bot[0]] + additional_bot["user"] = user_data[bot_variation_user_data] additional_bot["prediction"] = "Bot" bot_data_reduced.append(additional_bot) log.info("Add known bot '{}' to bot data.".format(additional_bot["user"])) # (2) handle known bots that are already present in the bots list - elif bot[0] in predicted_bots and bot[0] in user_data: + elif bot_variation_predicted_bots is not None and bot_variation_user_data is not None: # make sure that this bot has also been predicited to be bot for predicted_bot in bot_data_reduced: - if predicted_bot["user"] == user_data[bot[0]]: + if predicted_bot["user"] == user_data[bot_variation_user_data]: predicted_bot["prediction"] = "Bot" - log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) + log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot_variation_user_data])) break + # get list of known agents and combine it with the list of known copilot users + copilot_users_variants = generate_botname_variants(known_copilot_users) + # get list of known agent names + known_agents_names = [agent[0] for agent in known_agents] + for copilot_user in copilot_users_variants: + if copilot_user not in known_agents_names: + known_agents.append([copilot_user]) 
+ for agent in known_agents: # (1) check if a known agent occurs in the GitHub issue data but has not been predicted @@ -218,16 +229,9 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): continue # get user information if available - if user[0] in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0]] - bot_reduced["prediction"] = user[-1] - bot_data_reduced.append(bot_reduced) - elif user[0] + "bot" in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0] + "bot"] - bot_reduced["prediction"] = user[-1] - bot_data_reduced.append(bot_reduced) - elif user[0] + "[bot]" in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0] + "[bot]"] + bot_variation = containing_bot_variation(user[0], user_buffer.keys()) + if bot_variation is not None: + bot_reduced["user"] = user_buffer[bot_variation] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: @@ -239,6 +243,27 @@ def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): return bot_data_reduced +def containing_bot_variation(botname, name_list): + """ + Helper function to return the variation of a given bot name that occurs in a list of names. + + :param botname: the bot name for which the variation should be returned + :param name_list: the list of names to be checked for containing the bot name or a variation of it + :return: the variation of the given bot name that occurs in the given list of names, or None if no such variation exists + """ + + if botname in name_list: + return botname + elif botname + "bot" in name_list: + return botname + "bot" + elif botname + "[bot]" in name_list: + return botname + "[bot]" + elif botname.replace("[", "").replace("]", "") in name_list: + return botname.replace("[", "").replace("]", "") + else: + return None + + def print_to_disk(bot_data, results_folder): """ Print bot data to file "bots.list" in result folder. 
diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 561a345..652b63b 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -24,7 +24,7 @@ ## # global variables containing all known copilot users and the name and mail adress copilot users will be assigned -known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agentbot"} +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agent[bot]"} copilot_unified_name = "Copilot" copilot_unified_email = "copilot@example.com" @@ -65,9 +65,7 @@ def generate_botname_variants(botnames): botname_variants = set() for botname in botnames: botname_variants.add(botname) - if botname.endswith("[bot]"): - botname_variants.add(botname[:-5] + "bot") - elif botname.endswith("bot"): - botname_variants.add(botname[:-3] + "[bot]") + botname = botname.replace("[", "").replace("]", "") + botname_variants.add(botname) return botname_variants From 105f88c5d22836ab9de332449c6ea0ae3fba8350 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 3 Mar 2026 15:43:11 +0100 Subject: [PATCH 22/26] Add copilot user unification for more events also add agents to bot handling, fix formatting for event_info_2 and subissues also fix a typo where strings would not have their quotes correctly removed Signed-off-by: Leo Sendelbach --- .../author_postprocessing.py | 39 +++++++++++-------- bot_processing/bot_processing.py | 2 +- github_user_utils/__init__.py | 1 + github_user_utils/github_user_utils.py | 7 ++++ issue_processing/issue_processing.py | 8 ++-- 5 files changed, 36 insertions(+), 21 deletions(-) create mode 100644 github_user_utils/__init__.py diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index d05015b..7aa9526 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -50,9 +50,11 
@@ from csv_writer import csv_writer -from github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ +from github_user_utils.github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ is_github_noreply_author, github_user, github_email, \ - commit_added_event, mentioned_event, subscribed_event, generate_botname_variants + commit_added_event, mentioned_event, subscribed_event, \ + assigned_event, unassigned_event, review_requested_event, \ + review_request_removed_event, generate_botname_variants, quot_m known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## @@ -173,6 +175,9 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth if unify_copilot_users and commit[5] in known_copilot_users_extended: commit[5] = copilot_unified_name commit[6] = copilot_unified_email + if unify_copilot_users and commit[2] in known_copilot_users_extended: + commit[2] = copilot_unified_name + commit[3] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -191,19 +196,20 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] - for event in issue_data: # unify events to use a single copilot user for all events triggered by a known copilot user if unify_copilot_users and event[9] in known_copilot_users_extended: event[9] = copilot_unified_name event[10] = copilot_unified_email - if event[8] == commit_added_event and event[13][-1:1] in known_copilot_users_extended: - # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user - event[13] = '"' + copilot_unified_name + '"' - elif event[8] in (mentioned_event, subscribed_event) and event[12][-1:1] in known_copilot_users_extended: - # for mentioned/subscribed events, 
also unify the referenced user in event info 1 and 2 if it is a known copilot user - event[12] = '"' + copilot_unified_name + '"' - event[13] = '"' + copilot_unified_email + '"' + if unify_copilot_users and event[8] == commit_added_event and event[13][1:-1] in known_copilot_users_extended: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = quot_m + copilot_unified_name + quot_m + elif unify_copilot_users and event[8] in (mentioned_event, subscribed_event, assigned_event, unassigned_event, + review_requested_event, review_request_removed_event) \ + and event[12] in known_copilot_users_extended: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = copilot_unified_name + event[13] = quot_m + copilot_unified_email + quot_m # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 @@ -302,9 +308,6 @@ def run_postprocessing(conf, resdir, backup_data): bugs_jira_list = "bugs-jira.list" bots_list = "bots.list" - # When looking at elements originating from json lists, we need to consider quotation marks around the string - quot_m = "\"" - data_path = path.join(resdir, conf["project"], conf["tagging"]) # Correctly replace author 'GitHub ' in the commit data and in "commit_added" events of the @@ -395,8 +398,8 @@ def run_postprocessing(conf, resdir, backup_data): issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m # replace name in event info 2 if necessary - if person[4] == issue_event[13]: - issue_event[13] = person[1] + if quot_m + person[4] + quot_m == issue_event[13]: + issue_event[13] = quot_m + person[1] + quot_m csv_writer.write_to_csv(f, issue_data) @@ -463,8 +466,12 @@ def run_postprocessing(conf, resdir, backup_data): # the bot is already in the list, check if there are different predictions 
stored_bot = bot_names_and_emails[(bot[0], bot[1])] if stored_bot[2] != bot[2]: + # if either of the predictions is agent, keep agent + if (stored_bot[2] == "Agent" or bot[2] == "Agent"): + stored_bot[2] = "Agent" + bot_names_and_emails[(bot[0], bot[1])] = stored_bot # if either of the predictions is bot, keep bot - if (stored_bot[2] == "Bot" or bot[2] == "Bot"): + elif (stored_bot[2] == "Bot" or bot[2] == "Bot"): stored_bot[2] = "Bot" bot_names_and_emails[(bot[0], bot[1])] = stored_bot # otherwise, if either of the predictions is human, keep human diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 113405a..ad76d85 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -30,7 +30,7 @@ from codeface.configuration import Configuration from csv_writer import csv_writer -from github_user_utils import known_copilot_users, generate_botname_variants +from github_user_utils.github_user_utils import known_copilot_users, generate_botname_variants def run(): # get all needed paths and arguments for the method call. 
diff --git a/github_user_utils/__init__.py b/github_user_utils/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/github_user_utils/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py index 652b63b..20fa8d8 100644 --- a/github_user_utils/github_user_utils.py +++ b/github_user_utils/github_user_utils.py @@ -34,6 +34,13 @@ commit_added_event = "commit_added" mentioned_event = "mentioned" subscribed_event = "subscribed" +assigned_event = "assigned" +unassigned_event = "unassigned" +review_requested_event = "review_requested" +review_request_removed_event = "review_request_removed" + +# When looking at elements originating from json lists, we need to consider quotation marks around the string +quot_m = "\"" ## # UTILITY FUNCTIONS diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 25669cd..1513c06 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -41,7 +41,7 @@ from dateutil import parser as dateparser from csv_writer import csv_writer -from github_user_utils import copilot_unified_name +from github_user_utils.github_user_utils import copilot_unified_name # known types from JIRA and GitHub default labels known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} @@ -754,9 +754,9 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev event["event_info_1"] = issue["state_new"] # if event is a review comment, it can contain suggestions if "contains_suggestion" in event: - event["event_info_2"] = event["contains_suggestion"] + event["event_info_2"] = str(event["contains_suggestion"]) else: - event["event_info_2"] = False + event["event_info_2"] = str(False) elif event["event"] == "referenced" and not event["commit"] is None: # remove "referenced" events originating from commits @@ -934,7 +934,7 @@ def print_to_disk(issues, 
results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([issue["subIssues"]]), # components + json.dumps(issue["subIssues"]), # components event["event"], event["user"]["name"], event["user"]["email"], From 2ea7392061aab90b1a28aff18e718c4c3a8a6401 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 13:58:06 +0100 Subject: [PATCH 23/26] Add reason for conversation locking lock reason is saved in event_info_1 Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1513c06..1de194e 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -534,6 +534,10 @@ def merge_issue_events(issue_data, external_connected_events): connected_info["user"] = event["user"] connected_events[event["created_at"]] = connected_info + # if event is a locked event, save the lock reason in event_info_1 + if event["event"] == "locked": + event["event_info_1"] = event["lock_reason"] + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] From 977e86187e3097e25e2b7d22a3643ac6956a061c Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 14:09:27 +0100 Subject: [PATCH 24/26] Fix spelling and documentation docstrings should now more accurately reflect parameters and return values Signed-off-by: Leo Sendelbach --- issue_processing/issue_processing.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index 1de194e..4dc8c63 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -31,7 +31,6 @@ import sys import urllib from datetime 
import datetime, timedelta -import math import operator from codeface.cli import log @@ -234,7 +233,6 @@ def reformat_issues(issue_data): Re-arrange issue data structure. :param issue_data: the issue data to re-arrange - :return: the re-arranged issue data """ log.devinfo("Re-arranging Github issues...") @@ -299,7 +297,8 @@ def merge_issue_events(issue_data, external_connected_events): All issue events are merged together in the eventsList. This simplifies processing in later steps. :param issue_data: the issue data from which the events shall be merged - :return: the issue data with merged eventsList + :param external_connected_events: a dict to store connected events to external issues + :return: a filtered dict of connected events for future reconstruction """ log.info("Merge issue events ...") @@ -573,17 +572,17 @@ def filter_connected_events(key, value, external_connected_events): # if 2 connected events exist, matching them is trivial if num_issues == 2: return True - occurences = {x: value["issues"].count(x) for x in set(value["issues"])} + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} # otherwise, if it is an even number, check if it can be easily matched, # meaning that exactly half the events occur in the same issue - if num_issues % 2 == 0 and num_issues/2 in occurences.values(): + if num_issues % 2 == 0 and num_issues/2 in occurrences.values(): # duplicate issue list for matching the issues later value["multi_issues_copy"] = list(value["issues"]) return True # if it is an odd number, check if it can be easily matched # meaning that exactly half (rounded up) the events occur in the same issue - if num_issues % 2 == 1 and (num_issues + 1)/2 in occurences.values(): - for sub_key, sub_value in occurences.iteritems(): + if num_issues % 2 == 1 and (num_issues + 1)/2 in occurrences.values(): + for sub_key, sub_value in occurrences.iteritems(): # then, assign one of them as an external connected event and proceed as in previous case 
if sub_value == (num_issues + 1)/2: new_entry = dict() @@ -603,7 +602,8 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev Re-format event information dependent on the event type. :param issue_data: the data of all issues that shall be re-formatted - :return: the issue data with updated event information + :param filtered_connected_events: the dict of connected events which can be reconstructed + :param external_connected_events: the dict of connected events to external issues """ log.info("Update event information ...") @@ -643,9 +643,9 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev # and we only have 2 issues in the list, connect to the other issue event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] else: - # and we have more than two issues, count each issue's occurences - occurences = {x: value["issues"].count(x) for x in set(value["issues"])} - if occurences[issue["number"]] == max(occurences.values()): + # and we have more than two issues, count each issue's occurrences + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurrences[issue["number"]] == max(occurrences.values()): # if our issue is the most common one, that means it is the common denominator # for all connected events at this time # so this event connects to any other issue @@ -655,7 +655,7 @@ def reformat_events(issue_data, filtered_connected_events, external_connected_ev event["event_info_1"] = number else: # otherwise, connect this event to the common denominator - event["event_info_1"] = max(occurences, key=occurences.get) + event["event_info_1"] = max(occurrences, key=occurrences.get) # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -785,7 +785,6 @@ def insert_user_data(issues, conf, resdir): :param issues: the issues to retrieve user data from :param conf: the 
project configuration :param resdir: the directory in which the username-to-user-list should be dumped - :return: the updated issue data """ log.info("Syncing users with ID service...") From d7ea47eed23be4d060ed35b6354723116a4acaa1 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 10 Mar 2026 14:12:43 +0100 Subject: [PATCH 25/26] Remove old state from jira state_updated events For consistency with github events Signed-off-by: Leo Sendelbach --- issue_processing/jira_issue_processing.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 9d384c3..7b85076 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -464,21 +464,18 @@ def load_issues_via_api(issues, persons, url, referenced_bys): for change in changelog.histories: # default values for state and resolution - old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved" + new_state, old_resolution, new_resolution = "open", "unresolved", "unresolved" # all changes in the issue changelog are checked if they contain a useful information for item in change.items: # state_updated event gets created and added to the issue history if item.field == "status": - if item.fromString is not None: - old_state = item.fromString.lower() if item.toString is not None: new_state = item.toString.lower() history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state - history["event_info_2"] = old_state if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: From 8ab46a0103b7926e890032c49c9c248c7bb2622e Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 13 Mar 2026 11:32:36 +0100 Subject: [PATCH 26/26] Fix jira processing error previously removed event_info_2 for state_updated event, leading to crashes of the issue processing. 
Now, it instead contains an empty string. Also fix a minor spelling mistake Signed-off-by: Leo Sendelbach --- bot_processing/bot_processing.py | 2 +- issue_processing/jira_issue_processing.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index ad76d85..d5800b3 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -82,7 +82,7 @@ def load_bot_data(bot_file, header = True): # check if file exists and exit early if not if not os.path.exists(bot_file): - log.error("Bot file '{}' does not exist! Exiting early...".format(bot_file)) + log.error("Bot/Agent file '{}' does not exist (can be empty)! Exiting early...".format(bot_file)) sys.exit(-1) bot_data = csv_writer.read_from_csv(bot_file, delimiter=',') diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index 7b85076..3b12a93 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -18,7 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock # Copyright 2023 by Maximilian Löffler -# Copyright 2025 by Leo Sendelbach +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. @@ -476,6 +476,7 @@ def load_issues_via_api(issues, persons, url, referenced_bys): history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state + history["event_info_2"] = "" if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: