diff --git a/author_postprocessing/author_postprocessing.py b/author_postprocessing/author_postprocessing.py index 13b1e38..7aa9526 100644 --- a/author_postprocessing/author_postprocessing.py +++ b/author_postprocessing/author_postprocessing.py @@ -14,6 +14,7 @@ # # Copyright 2015-2017 by Claus Hunsen # Copyright 2020-2022 by Thomas Bock +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to disambiguate authors after the extraction from the Codeface database was performed. A manually @@ -49,7 +50,13 @@ from csv_writer import csv_writer +from github_user_utils.github_user_utils import known_copilot_users, copilot_unified_name, copilot_unified_email, \ + is_github_noreply_author, github_user, github_email, \ + commit_added_event, mentioned_event, subscribed_event, \ + assigned_event, unassigned_event, review_requested_event, \ + review_request_removed_event, generate_botname_variants, quot_m +known_copilot_users_extended = generate_botname_variants(known_copilot_users) ## # RUN POSTPROCESSING ## @@ -78,7 +85,7 @@ def perform_data_backup(results_path, results_path_backup): copy(current_file, backup_file) -def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list): +def fix_github_browser_commits(data_path, issues_github_list, commits_list, authors_list, emails_list, bots_list, unify_copilot_users=True): """ Replace the author "GitHub " in both commit and GitHub issue data by the correct author. The author "GitHub " is automatically inserted as the committer of a commit that is made when @@ -89,7 +96,7 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth "GitHub " are removed. Also "mentioned" or "subscribed" events in the GitHub issue data which reference the author "GitHub " are removed from the GitHub issue data. In addition, remove the author "GitHub " also from the author data and bot data and remove e-mails that have been sent - by this author. 
+ by this author. This method also unifies all known copilot users into a single user if desired. :param data_path: the path to the project data that is to be fixed :param issues_github_list: file name of the github issue data @@ -97,26 +104,8 @@ def fix_github_browser_commits(data_path, issues_github_list, commits_list, auth :param authors_list: file name of the corresponding author data :param emails_list: file name of the corresponding email data :param bots_list: file name of the corresponding bot data + :param unify_copilot_users: whether to unify known copilot users into a single user """ - github_user = "GitHub" - github_email = "noreply@github.com" - commit_added_event = "commit_added" - mentioned_event = "mentioned" - subscribed_event = "subscribed" - - """ - Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". - There are two options in Codeface how this can happen: - (1) Username is "GitHub" and e-mail address is "noreply@github.com" - (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" - - :param name: the name of the author to be checked - :param email: the email address of the author to be checked - :return: whether the given (name, email) pair belongs to the "GitHub " author - """ - def is_github_noreply_author(name, email): - return (name == github_user and (email == github_email or email == (github_user + "." 
+ github_email))) - # Check for all files in the result directory of the project whether they need to be adjusted for filepath, dirnames, filenames in walk(data_path): @@ -125,20 +114,32 @@ def is_github_noreply_author(name, email): if authors_list in filenames: f = path.join(filepath, authors_list) log.info("Remove author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) author_data = csv_writer.read_from_csv(f) author_data_new = [] - + copilot_user_added = False for author in author_data: # keep author entry only if it should not be removed if not is_github_noreply_author(author[1], author[2]): - author_data_new.append(author) + # unify copilot author if desired + if unify_copilot_users and author[1] in known_copilot_users_extended: + if not copilot_user_added: + author[1] = copilot_unified_name + author[2] = copilot_unified_email + copilot_user_added = True + author_data_new.append(author) + else: + author_data_new.append(author) csv_writer.write_to_csv(f, author_data_new) # (2) Remove e-mails from author 'GitHub ' from all emails.list files if emails_list in filenames: f = path.join(filepath, emails_list) log.info("Remove emails from author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) email_data = csv_writer.read_from_csv(f) email_data_new = [] @@ -146,6 +147,10 @@ def is_github_noreply_author(name, email): for email in email_data: # keep author entry only if it should not be removed if not is_github_noreply_author(email[0], email[1]): + # unify copilot users if desired + if unify_copilot_users and email[0] in known_copilot_users_extended: + email[0] = copilot_unified_name + email[1] = copilot_unified_email email_data_new.append(email) else: log.warn("Remove email %s as it was sent by %s 
<%s>.", email[2], email[0], email[1]) @@ -156,6 +161,8 @@ def is_github_noreply_author(name, email): if commits_list in filenames: f = path.join(filepath, commits_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) commit_data = csv_writer.read_from_csv(f) for commit in commit_data: @@ -164,6 +171,13 @@ def is_github_noreply_author(name, email): if is_github_noreply_author(commit[5], commit[6]): commit[5] = commit[2] commit[6] = commit[3] + # unify copilot author if desired + if unify_copilot_users and commit[5] in known_copilot_users_extended: + commit[5] = copilot_unified_name + commit[6] = copilot_unified_email + if unify_copilot_users and commit[2] in known_copilot_users_extended: + commit[2] = copilot_unified_name + commit[3] = copilot_unified_email csv_writer.write_to_csv(f, commit_data) @@ -172,26 +186,45 @@ def is_github_noreply_author(name, email): if issues_github_list in filenames: f = path.join(filepath, issues_github_list) log.info("Replace author %s <%s> in %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) issue_data = csv_writer.read_from_csv(f) # read commit data commit_data_file = path.join(data_path, commits_list) commit_data = csv_writer.read_from_csv(commit_data_file) commit_hash_to_author = {commit[7]: commit[2:4] for commit in commit_data} - + author_name_to_data = {author[1]: author[1:3] for author in author_data_new} issue_data_new = [] - for event in issue_data: + # unify events to use a single copilot user for all events triggered by a known copilot user + if unify_copilot_users and event[9] in known_copilot_users_extended: + event[9] = copilot_unified_name + event[10] = copilot_unified_email + if unify_copilot_users and event[8] == commit_added_event and 
event[13][1:-1] in known_copilot_users_extended: + # for commit added events, also unify the referenced author in event info 2 if it is a known copilot user + event[13] = quot_m + copilot_unified_name + quot_m + elif unify_copilot_users and event[8] in (mentioned_event, subscribed_event, assigned_event, unassigned_event, + review_requested_event, review_request_removed_event) \ + and event[12] in known_copilot_users_extended: + # for mentioned/subscribed events, also unify the referenced user in event info 1 and 2 if it is a known copilot user + event[12] = copilot_unified_name + event[13] = quot_m + copilot_unified_email + quot_m # replace author if necessary if is_github_noreply_author(event[9], event[10]) and event[8] == commit_added_event: # extract commit hash from event info 1 commit_hash = event[12] - + # extract author name from event info 2 while cutting excess '"' + name = event[13][1:-1] # extract commit author from commit data, if available if commit_hash in commit_hash_to_author: event[9] = commit_hash_to_author[commit_hash][0] event[10] = commit_hash_to_author[commit_hash][1] issue_data_new.append(event) + elif name in author_name_to_data: + event[9] = author_name_to_data[name][0] + event[10] = author_name_to_data[name][1] + issue_data_new.append(event) else: # the added commit is not part of the commit data. In most cases, this is due to merge commits # appearing in another pull request, as Codeface does not keep track of merge commits. 
As we @@ -220,6 +253,9 @@ def is_github_noreply_author(name, email): if bots_list in filenames: f = path.join(filepath, bots_list) log.info("Remove author %s <%s> from %s ...", github_user, github_email, f) + if unify_copilot_users: + log.info("Also unify copilot users to %s <%s> in %s ...", copilot_unified_name, copilot_unified_email, f) + copilot_user_added = False bot_data = csv_writer.read_from_csv(f) bot_data_new = [] @@ -227,7 +263,15 @@ def is_github_noreply_author(name, email): for entry in bot_data: # keep bot entry only if it should not be removed if not is_github_noreply_author(entry[0], entry[1]): - bot_data_new.append(entry) + # unify copilot users if desired + if unify_copilot_users and entry[0] in known_copilot_users_extended: + if not copilot_user_added: + entry[0] = copilot_unified_name + entry[1] = copilot_unified_email + copilot_user_added = True + bot_data_new.append(entry) + else: + bot_data_new.append(entry) else: log.warn("Remove entry %s <%s> from bots list.", entry[0], entry[1]) @@ -264,9 +308,6 @@ def run_postprocessing(conf, resdir, backup_data): bugs_jira_list = "bugs-jira.list" bots_list = "bots.list" - # When looking at elements originating from json lists, we need to consider quotation marks around the string - quot_m = "\"" - data_path = path.join(resdir, conf["project"], conf["tagging"]) # Correctly replace author 'GitHub ' in the commit data and in "commit_added" events of the @@ -356,6 +397,9 @@ def run_postprocessing(conf, resdir, backup_data): if person[4] == issue_event[12] and (quot_m + person[5] + quot_m) == issue_event[13]: issue_event[12] = person[1] issue_event[13] = quot_m + person[2] + quot_m + # replace name in event info 2 if necessary + if quot_m + person[4] + quot_m == issue_event[13]: + issue_event[13] = quot_m + person[1] + quot_m csv_writer.write_to_csv(f, issue_data) @@ -422,8 +466,12 @@ def run_postprocessing(conf, resdir, backup_data): # the bot is already in the list, check if there are different predictions 
stored_bot = bot_names_and_emails[(bot[0], bot[1])] if stored_bot[2] != bot[2]: + # if either of the predictions is agent, keep agent + if (stored_bot[2] == "Agent" or bot[2] == "Agent"): + stored_bot[2] = "Agent" + bot_names_and_emails[(bot[0], bot[1])] = stored_bot # if either of the predictions is bot, keep bot - if (stored_bot[2] == "Bot" or bot[2] == "Bot"): + elif (stored_bot[2] == "Bot" or bot[2] == "Bot"): stored_bot[2] = "Bot" bot_names_and_emails[(bot[0], bot[1])] = stored_bot # otherwise, if either of the predictions is human, keep human diff --git a/bot_processing/bot_processing.py b/bot_processing/bot_processing.py index 53a397e..d5800b3 100644 --- a/bot_processing/bot_processing.py +++ b/bot_processing/bot_processing.py @@ -13,6 +13,7 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. # # Copyright 2021-2022 by Thomas Bock +# Copyright 2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract information on bot/human users from csv files. @@ -29,6 +30,7 @@ from codeface.configuration import Configuration from csv_writer import csv_writer +from github_user_utils.github_user_utils import known_copilot_users, generate_botname_variants def run(): # get all needed paths and arguments for the method call. 
@@ -52,6 +54,7 @@ def run(): # (the known bots file is the file in which known bots have been added manually and project independent) __confdir = os.path.join(args.resdir, os.path.dirname(args.config)) __known_bots_file = os.path.abspath(os.path.join(__confdir, "known_github_bots.list")) + __known_agents_file = os.path.abspath(os.path.join(__confdir, "known_github_agents.list")) # run processing of bot data: # 1) load bot data @@ -59,7 +62,7 @@ def run(): # 2) load user data users = load_user_data(os.path.join(__resdir, "usernames.list")) # 3) update bot data with user data and additionally add known bots if they occur in the project - bots = add_user_data(bots, users, __known_bots_file) + bots = add_user_data(bots, users, __known_bots_file, __known_agents_file) # 4) dump result to disk print_to_disk(bots, __resdir) @@ -79,7 +82,7 @@ def load_bot_data(bot_file, header = True): # check if file exists and exit early if not if not os.path.exists(bot_file): - log.error("Bot file '{}' does not exist! Exiting early...".format(bot_file)) + log.error("Bot/Agent file '{}' does not exist (can be empty)! Exiting early...".format(bot_file)) sys.exit(-1) bot_data = csv_writer.read_from_csv(bot_file, delimiter=',') @@ -111,12 +114,13 @@ def load_user_data(user_data_file): return user_data -def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_reduced): +def check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_data, bot_data_reduced): """ Check whether there are known bots occurring in the project. If so, add them to the bots list or update the bots list accordingly. 
:param known_bots_file: the file path to the list of known bot data + :param known_agents_file: the file path to the list of known agent data :param bot_data: the bot data originating from the bot prediction :param user_data: a dictionary from the issue data which maps GitHub usernames to authors :param bot_data_reduced: the bot data after mapping GitHub user names to authors @@ -126,6 +130,7 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red # Read the list of known bots known_bots = load_bot_data(known_bots_file, header = False) + known_agents = load_bot_data(known_agents_file, header = False) # Get the GitHub usernames of the bots predicted to be a bot predicted_bots = [bot[0] if len(bot) > 0 else "" for bot in bot_data] @@ -133,30 +138,62 @@ def check_with_known_bot_list(known_bots_file, bot_data, user_data, bot_data_red for bot in known_bots: # (1) check if a known bot occurs in the GitHub issue data but has not been predicted - if bot[0] not in predicted_bots and bot[0] in user_data: + bot_variation_predicted_bots = containing_bot_variation(bot[0], predicted_bots) + bot_variation_user_data = containing_bot_variation(bot[0], user_data) + if bot_variation_predicted_bots is None and bot_variation_user_data is not None: # add the known bot as a bot to the bots list additional_bot = dict() - additional_bot["user"] = user_data[bot[0]] + additional_bot["user"] = user_data[bot_variation_user_data] additional_bot["prediction"] = "Bot" bot_data_reduced.append(additional_bot) log.info("Add known bot '{}' to bot data.".format(additional_bot["user"])) # (2) handle known bots that are already present in the bots list - elif bot[0] in predicted_bots and bot[0] in user_data: + elif bot_variation_predicted_bots is not None and bot_variation_user_data is not None: # make sure that this bot has also been predicited to be bot for predicted_bot in bot_data_reduced: - if predicted_bot["user"] == user_data[bot[0]]: + if predicted_bot["user"] == 
user_data[bot_variation_user_data]: predicted_bot["prediction"] = "Bot" - log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot[0]])) + log.info("Mark user '{}' as bot in the bot data.".format(user_data[bot_variation_user_data])) + break + + # get list of known agents and combine it with the list of known copilot users + copilot_users_variants = generate_botname_variants(known_copilot_users) + # get list of known agent names + known_agents_names = [agent[0] for agent in known_agents] + for copilot_user in copilot_users_variants: + if copilot_user not in known_agents_names: + known_agents.append([copilot_user]) + + for agent in known_agents: + + # (1) check if a known agent occurs in the GitHub issue data but has not been predicted + if agent[0] not in predicted_bots and agent[0] in user_data: + + # add the known agent as a bot to the bots list + additional_agent = dict() + additional_agent["user"] = user_data[agent[0]] + additional_agent["prediction"] = "Agent" + bot_data_reduced.append(additional_agent) + log.info("Add known agent '{}' to bot data.".format(additional_agent["user"])) + + # (2) handle known agents that are already present in the bots list + elif agent[0] in predicted_bots and agent[0] in user_data: + + # make sure that this bot has also been predicted to be an agent + for predicted_bot in bot_data_reduced: + if predicted_bot["user"] == user_data[agent[0]]: + predicted_bot["prediction"] = "Agent" + log.info("Mark user '{}' as agent in the bot data.".format(user_data[agent[0]])) + break # return the updated bot data return bot_data_reduced -def add_user_data(bot_data, user_data, known_bots_file): +def add_user_data(bot_data, user_data, known_bots_file, known_agents_file): """ Add user data to bot data, i.e., replace username by name and e-mail. 
In addition, check in the global bots list whether there are authors in the projects which are @@ -192,19 +229,41 @@ def add_user_data(bot_data, user_data, known_bots_file): continue # get user information if available - if user[0] in user_buffer.keys(): - bot_reduced["user"] = user_buffer[user[0]] + bot_variation = containing_bot_variation(user[0], user_buffer.keys()) + if bot_variation is not None: + bot_reduced["user"] = user_buffer[bot_variation] bot_reduced["prediction"] = user[-1] bot_data_reduced.append(bot_reduced) else: log.warn("User '{}' in bot data does not occur in GitHub user data. Remove user...".format(user[0])) # check whether known GitHub bots occur in the GitHub issue data and, if so, update the bot data accordingly - bot_data_reduced = check_with_known_bot_list(known_bots_file, bot_data, user_buffer, bot_data_reduced) + bot_data_reduced = check_with_known_bot_or_agent_list(known_bots_file, known_agents_file, bot_data, user_buffer, bot_data_reduced) return bot_data_reduced +def containing_bot_variation(botname, name_list): + """ + Helper function to return the variation of a given bot name that occurs in a list of names. + + :param botname: the bot name for which the variation should be returned + :param name_list: the list of names to be checked for containing the bot name or a variation of it + :return: the variation of the given bot name that occurs in the given list of names, or None if no such variation exists + """ + + if botname in name_list: + return botname + elif botname + "bot" in name_list: + return botname + "bot" + elif botname + "[bot]" in name_list: + return botname + "[bot]" + elif botname.replace("[", "").replace("]", "") in name_list: + return botname.replace("[", "").replace("]", "") + else: + return None + + def print_to_disk(bot_data, results_folder): """ Print bot data to file "bots.list" in result folder. 
diff --git a/github_user_utils/__init__.py b/github_user_utils/__init__.py new file mode 100644 index 0000000..9bad579 --- /dev/null +++ b/github_user_utils/__init__.py @@ -0,0 +1 @@ +# coding=utf-8 diff --git a/github_user_utils/github_user_utils.py b/github_user_utils/github_user_utils.py new file mode 100644 index 0000000..20fa8d8 --- /dev/null +++ b/github_user_utils/github_user_utils.py @@ -0,0 +1,78 @@ +# coding=utf-8 +# This file is part of codeface-extraction, which is free software: you +# can redistribute it and/or modify it under the terms of the GNU General +# Public License as published by the Free Software Foundation, version 2. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +# +# Copyright 2026 by Leo Sendelbach +# All Rights Reserved. +""" +This file serves as a collection of global variables and utility functions, which are used throughout the +issue data extraction and post-processing, in particular for the processing of GitHub and Copilot user data. 
+""" + +## +# GLOBAL VARIABLES +## + +# global variables containing all known copilot users and the name and mail adress copilot users will be assigned +known_copilot_users = {"Copilot", "copilot-pull-request-reviewer[bot]", "copilot-swe-agent[bot]"} +copilot_unified_name = "Copilot" +copilot_unified_email = "copilot@example.com" + +## global variables for the GitHub author +github_user = "GitHub" +github_email = "noreply@github.com" +commit_added_event = "commit_added" +mentioned_event = "mentioned" +subscribed_event = "subscribed" +assigned_event = "assigned" +unassigned_event = "unassigned" +review_requested_event = "review_requested" +review_request_removed_event = "review_request_removed" + +# When looking at elements originating from json lists, we need to consider quotation marks around the string +quot_m = "\"" + +## +# UTILITY FUNCTIONS +## + +def is_github_noreply_author(name, email): + """ + Helper function to check whether a (name, e-mail) pair belongs to the author "GitHub ". + There are two options in Codeface how this can happen: + (1) Username is "GitHub" and e-mail address is "noreply@github.com" + (2) Username is "GitHub" and e-mail address has been replaced by Codeface, resulting in "GitHub.noreply@github.com" + + :param name: the name of the author to be checked + :param email: the email address of the author to be checked + :return: whether the given (name, email) pair belongs to the "GitHub " author + """ + + return (name == github_user and (email == github_email or email == (github_user + "." + github_email))) + +def generate_botname_variants(botnames): + """ + Helper function to generate variants of bot names, which are used in the list of + known bots and agents as well as during author postprocessing. 
+ + :param botnames: the list of bot names for which variants should be generated + :return: a set of bot name variants + """ + + botname_variants = set() + for botname in botnames: + botname_variants.add(botname) + botname = botname.replace("[", "").replace("]", "") + botname_variants.add(botname) + + return botname_variants diff --git a/issue_processing/issue_processing.py b/issue_processing/issue_processing.py index a901e19..4dc8c63 100644 --- a/issue_processing/issue_processing.py +++ b/issue_processing/issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2019 by Thomas Bock # Copyright 2020-2021 by Thomas Bock +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Github issue data from json files. @@ -39,9 +40,10 @@ from dateutil import parser as dateparser from csv_writer import csv_writer +from github_user_utils.github_user_utils import copilot_unified_name # known types from JIRA and GitHub default labels -known_types = {"bug", "improvement", "enhancement", "new feature", "task", "test", "wish"} +known_types = {"bug", "improvement", "enhancement", "feature", "task", "test", "wish"} # known resolutions from JIRA and GitHub default labels known_resolutions = {"unresolved", "fixed", "wontfix", "duplicate", "invalid", "incomplete", "cannot reproduce", @@ -52,6 +54,7 @@ # datetime format string datetime_format = "%Y-%m-%d %H:%M:%S" + def run(): # get all needed paths and arguments for the method call. 
parser = argparse.ArgumentParser(prog='codeface-extraction-issues-github', description='Codeface extraction') @@ -74,13 +77,14 @@ def run(): # 1) load the list of issues issues = load(__srcdir) # 2) re-format the issues - issues = reformat_issues(issues) + reformat_issues(issues) # 3) merges all issue events into one list - issues = merge_issue_events(issues) + external_connected_events = dict() + filtered_connected_events = merge_issue_events(issues, external_connected_events) # 4) re-format the eventsList of the issues - issues = reformat_events(issues) + reformat_events(issues, filtered_connected_events, external_connected_events) # 5) update user data with Codeface database and dump username-to-name/e-mail list - issues = insert_user_data(issues, __conf, __resdir) + insert_user_data(issues, __conf, __resdir) # 6) dump result to disk print_to_disk(issues, __resdir) @@ -229,7 +233,6 @@ def reformat_issues(issue_data): Re-arrange issue data structure. :param issue_data: the issue data to re-arrange - :return: the re-arranged issue data """ log.devinfo("Re-arranging Github issues...") @@ -238,7 +241,10 @@ def reformat_issues(issue_data): for issue in issue_data: # empty container for issue types - issue["type"] = [] + if issue["type"] is None: + issue["type"] = [] + else: + issue["type"] = [issue["type"]["name"].lower()] # empty container for issue resolutions issue["resolution"] = [] @@ -255,7 +261,7 @@ def reformat_issues(issue_data): if issue["relatedCommits"] is None: issue["relatedCommits"] = [] - # if an issue has no reviewsList, an empty Listgets created + # if an issue has no reviewsList, an empty List gets created if issue["reviewsList"] is None: issue["reviewsList"] = [] @@ -263,6 +269,10 @@ def reformat_issues(issue_data): if "relatedIssues" not in issue: issue["relatedIssues"] = [] + # if an issue has no sub-issue list, an empty List gets created + if "subIssues" not in issue: + issue["subIssues"] = [] + # add "closed_at" information if not present yet 
if issue["closed_at"] is None: issue["closed_at"] = "" @@ -279,20 +289,22 @@ def reformat_issues(issue_data): else: issue["type"].append("issue") - return issue_data + return -def merge_issue_events(issue_data): +def merge_issue_events(issue_data, external_connected_events): """ All issue events are merged together in the eventsList. This simplifies processing in later steps. :param issue_data: the issue data from which the events shall be merged - :return: the issue data with merged eventsList + :param external_connected_events: a dict to store connected events to external issues + :return: a filtered dict of connected events for future reconstruction """ log.info("Merge issue events ...") issue_data_to_update = dict() + connected_events = dict() for issue in issue_data: @@ -361,6 +373,7 @@ def merge_issue_events(issue_data): # it is a commit which was added to the pull request if rel_commit["type"] == "commitAddedToPullRequest": rel_commit["event"] = "commit_added" + rel_commit["event_info_2"] = rel_commit["commit"]["author"] # if the related commit was mentioned in an issue comment: elif rel_commit["type"] == "commitMentionedInIssue": @@ -476,6 +489,12 @@ def merge_issue_events(issue_data): if event["event"] == "review_requested" or event["event"] == "review_request_removed": event["ref_target"] = event["requestedReviewer"] + # if event is a specific copilot event, assign the copilot user data + if event["event"] == "copilot_work_started" or event["event"] == "copilot_work_finished": + event["user"]["name"] = None + event["user"]["username"] = copilot_unified_name + event["user"]["email"] = "" + # if event dismisses a review, we can determine the original state of the corresponding review if event["event"] == "review_dismissed": for review in issue["reviewsList"]: @@ -488,6 +507,36 @@ def merge_issue_events(issue_data): event["ref_target"] = event["user"] event["user"] = event["assigner"] + # if event is merged event, save the hash of the merge commit in 
event_info_1 + if event["event"] == "merged": + event["event_info_1"] = event["commit"]["hash"] + + # if event is connected event, create or add to a matching dict entry by matching timestamps, for later reconstruction + if event["event"] == "connected": + if event["created_at"] in connected_events.keys() and connected_events[event["created_at"]]["user"] == event["user"]: + # if there is already a connected event at this time by this user, add this event to the list + connected_events[event["created_at"]]["issues"].append(issue["number"]) + elif subtract_seconds_from_time(event["created_at"], 1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], 1)]["user"] == event["user"]: + # same as above, but accounting for a possible difference in timestamps of 1 second between matching events + connected_events[subtract_seconds_from_time(event["created_at"], 1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], 1) + elif subtract_seconds_from_time(event["created_at"], -1) in connected_events.keys() \ + and connected_events[subtract_seconds_from_time(event["created_at"], -1)]["user"] == event["user"]: + # same as above, with offset calculated in the other direction + connected_events[subtract_seconds_from_time(event["created_at"], -1)]["issues"].append(issue["number"]) + event["created_at"] = subtract_seconds_from_time(event["created_at"], -1) + else: + # if there is no connected event yet at this timestamp, create a new entry for this event + connected_info = dict() + connected_info["issues"] = [issue["number"]] + connected_info["user"] = event["user"] + connected_events[event["created_at"]] = connected_info + + # if event is a locked event, save the lock reason in event_info_1 + if event["event"] == "locked": + event["event_info_1"] = event["lock_reason"] + # merge events, relatedCommits, relatedIssues and comment lists issue["eventsList"] = issue["commentsList"] + 
issue["eventsList"] + issue["relatedIssues"] + issue[ "relatedCommits"] + issue["reviewsList"] @@ -499,21 +548,62 @@ def merge_issue_events(issue_data): # sorts eventsList by time issue["eventsList"] = sorted(issue["eventsList"], key=lambda k: k["created_at"]) + # filter out connected events which cannot be perfectly matched + # and populate external_connected_events dict + # because this happens in place, we do not need to return the external_connected_event dict later + filtered_connected_events = dict(filter(lambda item: filter_connected_events(item[0], item[1], external_connected_events), connected_events.items())) + # updates all the issues by the temporarily stored referenced_by events for key, value in issue_data_to_update.iteritems(): for issue in issue_data: if issue["number"] == value["number"]: issue["eventsList"] = issue["eventsList"] + value["eventsList"] - return issue_data - - -def reformat_events(issue_data): + # return the filtered_connected_events dict for later reconstruction + return filtered_connected_events + + +def filter_connected_events(key, value, external_connected_events): + num_issues = len(value["issues"]) + # if only a single connected event exists at this time, it has to be connecting to an external issue + if num_issues == 1: + external_connected_events[key] = value + return False + # if 2 connected events exist, matching them is trivial + if num_issues == 2: + return True + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + # otherwise, if it is an even number, check if it can be easily matched, + # meaning that exactly half the events occur in the same issue + if num_issues % 2 == 0 and num_issues/2 in occurrences.values(): + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # if it is an odd number, check if it can be easily matched + # meaning that exactly half (rounded up) the events occur in the same issue + if num_issues % 2 == 1 
and (num_issues + 1)/2 in occurrences.values(): + for sub_key, sub_value in occurrences.iteritems(): + # then, assign one of them as an external connected event and proceed as in previous case + if sub_value == (num_issues + 1)/2: + new_entry = dict() + new_entry["user"] = value["user"] + new_entry["issues"] = [sub_key] + external_connected_events[key] = new_entry + value["issues"].remove(sub_key) + # duplicate issue list for matching the issues later + value["multi_issues_copy"] = list(value["issues"]) + return True + # no other variants can be easily matched + return False + + +def reformat_events(issue_data, filtered_connected_events, external_connected_events): """ Re-format event information dependent on the event type. :param issue_data: the data of all issues that shall be re-formatted - :return: the issue data with updated event information + :param filtered_connected_events: the dict of connected events which can be reconstructed + :param external_connected_events: the dict of connected events to external issues """ log.info("Update event information ...") @@ -538,6 +628,35 @@ def reformat_events(issue_data): if not event["ref_target"] is None and not event["ref_target"] == "": users = update_user_dict(users, event["ref_target"]) + # reconstruction of connections + if event["event"] == "connected": + if event["created_at"] in external_connected_events \ + and issue["number"] in external_connected_events[event["created_at"]]["issues"]: + # if the event is an external connected event, mark it as such and remove this issue from the list + event["event_info_1"] = "external" + external_connected_events[event["created_at"]]["issues"].remove(issue["number"]) + elif event["created_at"] in filtered_connected_events \ + and issue["number"] in filtered_connected_events[event["created_at"]]["issues"]: + # if it is instead an internal connected event + value = filtered_connected_events[event["created_at"]] + if len(value["issues"]) == 2: + # and we only have 2 issues 
in the list, connect to the other issue + event["event_info_1"] = value["issues"][0] if value["issues"][1] == issue["number"] else value["issues"][1] + else: + # and we have more than two issues, count each issue's occurrences + occurrences = {x: value["issues"].count(x) for x in set(value["issues"])} + if occurrences[issue["number"]] == max(occurrences.values()): + # if our issue is the most common one, that means it is the common denominator + # for all connected events at this time + # so this event connects to any other issue + # which is then removed from a copied list to avoid duplications + number = next(x for x in value["multi_issues_copy"] if x != issue["number"]) + value["multi_issues_copy"].remove(number) + event["event_info_1"] = number + else: + # otherwise, connect this event to the common denominator + event["event_info_1"] = max(occurrences, key=occurrences.get) + # as the user dictionary is created, start re-formating the event information of all issues for issue in issue_data: @@ -555,13 +674,16 @@ def reformat_events(issue_data): if event["event"] == "closed": event["event"] = "state_updated" event["event_info_1"] = "closed" # new state - event["event_info_2"] = "open" # old state + if event["commit"] is not None: + event["event_info_2"] = event["commit"]["hash"] + else: + event["event_info_2"] = event["state_reason"] issue["state_new"] = "closed" elif event["event"] == "reopened": event["event"] = "state_updated" event["event_info_1"] = "open" # new state - event["event_info_2"] = "closed" # old state + event["event_info_2"] = event["state_reason"] issue["state_new"] = "reopened" elif event["event"] == "labeled": @@ -569,7 +691,7 @@ def reformat_events(issue_data): event["event_info_1"] = label # if the label is in this list, it also is a type of the issue - if label in known_types: + if label in known_types and label not in issue["type"]: issue["type"].append(str(label)) # creates an event for type updates and adds it to the eventsList @@ 
-631,10 +753,14 @@ def reformat_events(issue_data): issue["eventsList"].append(resolution_event) elif event["event"] == "commented": - # "state_new" and "resolution" of the issue give the information about the state and the resolution of + # "state_new" of the issue gives the information about the state of # the issue when the comment was written, because the eventsList is sorted by time event["event_info_1"] = issue["state_new"] - event["event_info_2"] = issue["resolution"] + # if event is a review comment, it can contain suggestions + if "contains_suggestion" in event: + event["event_info_2"] = str(event["contains_suggestion"]) + else: + event["event_info_2"] = str(False) elif event["event"] == "referenced" and not event["commit"] is None: # remove "referenced" events originating from commits @@ -648,7 +774,7 @@ def reformat_events(issue_data): for event_to_remove in events_to_remove: issue["eventsList"].remove(event_to_remove) - return issue_data + return def insert_user_data(issues, conf, resdir): @@ -659,7 +785,6 @@ def insert_user_data(issues, conf, resdir): :param issues: the issues to retrieve user data from :param conf: the project configuration :param resdir: the directory in which the username-to-user-list should be dumped - :return: the updated issue data """ log.info("Syncing users with ID service...") @@ -745,6 +870,9 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_id_and_update_user(event["user"]) + if event["event"] == "commit_added": + event["event_info_2"] = get_id_and_update_user(event["event_info_2"]) + # check database for the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_id_and_update_user(event["ref_target"]) @@ -758,6 +886,10 @@ def get_user_from_id(idx, buffer_db=user_buffer): for event in issue["eventsList"]: event["user"] = get_user_from_id(event["user"]) + # for commit_added events, save the commit's author's name in event_info_2 + if 
event["event"] == "commit_added": + event["event_info_2"] = get_user_from_id(event["event_info_2"])["name"] + # get the reference-target user if needed if event["ref_target"] != "": event["ref_target"] = get_user_from_id(event["ref_target"]) @@ -778,7 +910,7 @@ def get_user_from_id(idx, buffer_db=user_buffer): username_dump = os.path.join(resdir, "usernames.list") csv_writer.write_to_csv(username_dump, sorted(set(lines), key=lambda line: line[0])) - return issues + return def print_to_disk(issues, results_folder): @@ -805,7 +937,7 @@ def print_to_disk(issues, results_folder): json.dumps(issue["resolution"]), issue["created_at"], issue["closed_at"], - json.dumps([]), # components + json.dumps(issue["subIssues"]), # components event["event"], event["user"]["name"], event["user"]["email"], diff --git a/issue_processing/jira_issue_processing.py b/issue_processing/jira_issue_processing.py index d9748ae..3b12a93 100644 --- a/issue_processing/jira_issue_processing.py +++ b/issue_processing/jira_issue_processing.py @@ -18,6 +18,7 @@ # Copyright 2018-2019 by Anselm Fehnker # Copyright 2020-2021 by Thomas Bock # Copyright 2023 by Maximilian Löffler +# Copyright 2025-2026 by Leo Sendelbach # All Rights Reserved. """ This file is able to extract Jira issue data from xml files. 
@@ -125,7 +126,7 @@ def run(): referenced_issue["history"].append(referenced_by) # 5) update user data with Codeface database - processed_issues = insert_user_data(processed_issues, __conf) + insert_user_data(processed_issues, __conf) # 6) dump result to disk print_to_disk(processed_issues, __resdir) # # 7) export for Gephi @@ -300,9 +301,12 @@ def parse_xml(issue_data, persons, skip_history, referenced_bys): link = issue_x.getElementsByTagName("link")[0] issue["url"] = link.firstChild.data - type = issue_x.getElementsByTagName("type")[0] - issue["type"] = type.firstChild.data - issue["type_list"] = ["issue", str(type.firstChild.data.lower())] + type = issue_x.getElementsByTagName("type")[0].firstChild.data + # rename 'new feature' type to 'feature' to be in line with the github original issue type + if type == "New Feature": + type = "Feature" + issue["type"] = type + issue["type_list"] = ["issue", str(type.lower())] status = issue_x.getElementsByTagName("status")[0] issue["state"] = status.firstChild.data @@ -460,21 +464,19 @@ def load_issues_via_api(issues, persons, url, referenced_bys): for change in changelog.histories: # default values for state and resolution - old_state, new_state, old_resolution, new_resolution = "open", "open", "unresolved", "unresolved" + new_state, old_resolution, new_resolution = "open", "unresolved", "unresolved" # all changes in the issue changelog are checked if they contain a useful information for item in change.items: # state_updated event gets created and added to the issue history if item.field == "status": - if item.fromString is not None: - old_state = item.fromString.lower() if item.toString is not None: new_state = item.toString.lower() history = dict() history["event"] = "state_updated" history["event_info_1"] = new_state - history["event_info_2"] = old_state + history["event_info_2"] = "" if hasattr(change, "author"): user = create_user(change.author.displayName, change.author.name, "") else: @@ -686,7 +688,7 @@ def 
get_user_from_id(idx, buffer_db=user_buffer): event["event_info_2"] = assigned_user["email"] log.debug("number of issues after insert_user_data: '{}'".format(len(issues))) - return issues + return def print_to_disk(issues, results_folder):