From 6c420c37852ce97fe2edea690e743427f1733850 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Mon, 21 Jul 2025 14:49:14 +0200 Subject: [PATCH 1/8] Add functionality to fetch external commits external commits that reference an issue can now be fetched. They will be marked with 'commitReferencesIssueExternal'. To that end, this commit adds an 'external' field to each commit and a new method to extract commits using the full url instead of just the relative one within the repo. Signed-off-by: Leo Sendelbach --- .../fim/gitwrapper/EventDataProcessor.java | 8 +++-- .../fim/gitwrapper/GitHubCommit.java | 9 +++++ .../fim/gitwrapper/GitHubRepository.java | 36 ++++++++++++++++++- .../fim/gitwrapper/IssueDataProcessor.java | 6 +++- 4 files changed, 55 insertions(+), 4 deletions(-) diff --git a/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java index 40d49a0..f52adf6 100644 --- a/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java @@ -86,8 +86,12 @@ public void postDeserialize(EventData.ReferencedEventData result, JsonElement sr } result.commit = repo.getGithubCommit(hash.getAsString()).orElseGet(() -> { - LOG.warning("Found commit unknown to GitHub and local git repo: " + hash); - return null; + LOG.warning("Found commit unknown to GitHub and local git repo: " + hash + " Retry using url..."); + JsonElement url = src.getAsJsonObject().get("commit_url"); + return repo.getGithubCommitUrl(hash.getAsString(), url.getAsString()).orElseGet(() -> { + LOG.warning("Could not find commit: " + hash); + return null; + }); }); } diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java index 16a2429..76d0585 100644 --- a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java +++ b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java @@ -26,6 +26,7 @@ public class GitHubCommit extends Commit { private 
String authorUsername; private String committerUsername; private boolean addedToPullRequest = false; + private boolean isExternal = false; /** * Constructs a new {@link GitHubCommit} with the given id made in the repo. @@ -119,4 +120,12 @@ public boolean isAddedToPullRequest() { void setAddedToPullRequest(boolean added) { this.addedToPullRequest = added; } + + void setExternal(boolean external) { + this.isExternal = true; + } + + boolean getExternal() { + return this.isExternal; + } } diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java b/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java index 335f6cf..fa8f341 100644 --- a/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java +++ b/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java @@ -352,6 +352,8 @@ public Optional> getIssues(boolean includePullRequests, OffsetDa } else timeLimit = ""; Type finalType = type; + // For debugging, you may add additional parameters to the string. For example, '/issues?creator=sleo&state=all' + // will fetch issues created by user 'sleo' and all related issues and commits. 
getJSONStringFromPath("/issues?state=all" + timeLimit).map(json -> { List data; try { @@ -367,7 +369,7 @@ public Optional> getIssues(boolean includePullRequests, OffsetDa threadPool.submit(() -> data.parallelStream().forEach(IssueData::freeze)); } catch (JsonSyntaxException e) { - LOG.warning("Encountered invalid JSON: " + json); + LOG.warning("Encountered invalid JSON: " + json + "\n\n" + e.getMessage() + "\n\n" + e); return null; } return data; @@ -1028,6 +1030,38 @@ Optional getGithubCommit(String hash) { }); } + Optional getGithubCommitUrl(String hash, String url) { + if (offline.get()) { + return Optional.of(getGHCommitUnchecked(DummyCommit.DUMMY_COMMIT_ID)); + } else { + try { + Optional res = getJSONStringFromURL(url).map(commitInfo -> + gson.fromJson(commitInfo, new TypeToken() {}.getType())); + checkedHashes.put(hash, res); + if (res.isPresent()) { + res.get().setExternal(true); + } + return res; + } catch (JsonSyntaxException e) { + /* For whatever reason, the JSON String is malformed, perhaps due to ill-encoded characters + * in patches within the files element of the JSON String. + * Due to that, get the JSON String again and remove the content of the files element of the + * JSON String, as it is not needed for further processing. + */ + LOG.info("Malformed JSON String when querying data for commit " + url + ". 
Neglect files element."); + String jsonStringFromURL = getJSONStringFromURL(url).get(); + jsonStringFromURL = StringUtils.substringBefore(jsonStringFromURL, "\"files\":["); + jsonStringFromURL = jsonStringFromURL + "\"files\":[]}"; + Optional res = Optional.of(gson.fromJson(jsonStringFromURL, new TypeToken() {}.getType())); + checkedHashes.put(hash, res); + if (res.isPresent()) { + res.get().setExternal(true); + } + return res; + } + } + } + /** * Creates a new Commit with the given data, and tries to fill in the missing data from the local Repository * diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java index b9ced2f..9306895 100644 --- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java @@ -76,7 +76,11 @@ private List> parseCommits(IssueData issue) { .filter(eventData -> eventData instanceof EventData.ReferencedEventData) // filter out errors from referencing commits .filter(eventData -> ((EventData.ReferencedEventData) eventData).commit != null) - .map(eventData -> new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue")); + .map(eventData -> {if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).getExternal()) + { return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssueExternal") ; + } else { + return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue"); + }}); // Parse commits from reviews and reviews' comments if (issue.isPullRequest()) { From f198a6e5f5bfe534a750d2201a14b76fd9ebb6d3 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Mon, 21 Jul 2025 
14:52:02 +0200 Subject: [PATCH 2/8] Change 'extractHashtags' to ignore codeblocks references to issues are now only found if they are outside a code environment (starting and ending with three '`'). This mirrors GitHub's behaviour. Signed-off-by: Leo Sendelbach --- .../uni_passau/fim/gitwrapper/IssueDataProcessor.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java index 9306895..4c3e00e 100644 --- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java @@ -266,6 +266,15 @@ private List extractHashtags(String text, boolean onlyInSameRepo) { } Pattern hashtagPattern; + // filter out everything in code block + String[] texts = text.split("```"); + text = ""; + for (int i = 0; i < texts.length; i++) { + if (i % 2 == 0) { + text = text + texts[i]; + } + } + if (onlyInSameRepo) { String repoName = repo.getRepoName(); String repoUser = repo.getRepoUser(); @@ -383,6 +392,7 @@ public void postDeserialize(IssueData result, JsonElement src, Gson gson) { Optional>> comments = repo.getComments(lookup); result.setComments(comments.orElse(Collections.emptyList())); } + if (result.getEventsList() == null) { Optional> events = repo.getEvents(lookup); result.setEvents(events.orElse(Collections.emptyList())); From e537267ea4ec29bb6f55be486d347fd7392b767c Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Mon, 21 Jul 2025 14:55:10 +0200 Subject: [PATCH 3/8] Document findings in 'README.md' Add paragraph that documents intended and unintended behaviour for 'referenced' events Signed-off-by: Leo Sendelbach --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index 1a7e487..43efc25 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,16 @@ java -Xmx100G -jar "build/libs/GitHubWrapper-1.0-SNAPSHOT.jar" \ - Using the `-repo` parameter, you specify the 
file path of the repo you want to analyze. Notice that you need to have cloned the repo locally, such that the origin can be derived from this file path. - Using the `-workDir` parameter, you specify the working directory, which usually is the directory which contains the repository directory specified at `-repo`. +### `Referenced` events + +`Referenced` events are events generated in an issue if a commit references that issue in its commit message. The intended behavior is, that the event is present in the issue's event data, and the commit is again present in the related commits of the issue. This does not work if it is not possible to fetch that commit. In this case, the event still exists, but it contains a link to a commit that the api cannot resolve, meaning that no data about the commit can be accessed. This may lead to incorrect data points if the resulting data is automatically processed, for example using the tool `codeface-extraction`. Known causes of this include: + +- a commit was rebased and changed/removed +- an external repository was deleted +- the commit's branch was deleted + +Note that the commit might still be reachable until the automatic garbage collection has removed it from the remote repository. + ### Integration into other projects There is also an option to use the implementation of GitHubWrapper in your code without using the provided `IssueRunner`. 
From 2f33e8bb4334663ede55c1ef0b35913d7a44973d Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Sat, 9 Aug 2025 13:45:13 +0200 Subject: [PATCH 4/8] Add Section to README.md reformat README and nested if statement Signed-off-by: Leo Sendelbach --- README.md | 26 ++++++++++++------- .../fim/gitwrapper/IssueDataProcessor.java | 12 +++++---- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 43efc25..c326dd0 100644 --- a/README.md +++ b/README.md @@ -43,16 +43,6 @@ java -Xmx100G -jar "build/libs/GitHubWrapper-1.0-SNAPSHOT.jar" \ - Using the `-repo` parameter, you specify the file path of the repo you want to analyze. Notice that you need to have cloned the repo locally, such that the origin can be derived from this file path. - Using the `-workDir` parameter, you specify the working directory, which usually is the directory which contains the repository directory specified at `-repo`. -### `Referenced` events - -`Referenced` events are events generated in an issue if a commit references that issue in its commit message. The intended behavior is, that the event is present in the issue's event data, and the commit is again present in the related commits of the issue. This does not work if it is not possible to fetch that commit. In this case, the event still exists, but it contains a link to a commit that the api cannot resolve, meaning that no data about the commit can be accessed. This may lead to incorrect data points if the resulting data is automatically processed, for example using the tool `codeface-extraction`. Known causes of this include: - -- a commit was rebased and changed/removed -- an external repository was deleted -- the commit's branch was deleted - -Note that the commit might still be reachable until the automatic garbage collection has removed it from the remote repository. 
- ### Integration into other projects There is also an option to use the implementation of GitHubWrapper in your code without using the provided `IssueRunner`. @@ -101,3 +91,19 @@ repo.getIssues(false).ifPresent(issueData -> issueData.forEach(issue -> { System.out.println(comment.user.username + ": " + comment.body)); })); ``` + +### Further data processing + +The data extracted by this tool can be further processed, for example using the `run-issues.py` skript from the tool [`codeface-extraction`](https://github.com/se-sic/codeface-extraction). This organises and unifies the issue data into a single .list file. It also allows for synchronisation with data from other data extraction tools, such as `codeface`. + +### `Referenced` events + +`Referenced` events are events generated in an issue if a commit references that issue in its commit message. The intended behavior is that the event is present in the issue's event data, and the commit is again present in the related commits of the issue. This does not work if it is not possible to fetch that commit. In this case, the event still exists, but it contains a link to a commit that the api cannot resolve, meaning that no data about the commit can be accessed. +Known causes of this include: + +- a commit was rebased and changed/removed +- an external repository was deleted +- the commit's branch was deleted + +Note that the commit might still be reachable until the automatic garbage collection has removed it from the remote repository. +In itself, this is not problematic. However, when further processing the data using `codeface-extraction`, this may lead to these `referenced` events being present in the final data, even though they should be filtered out as part of the issue processing. 
diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java index 4c3e00e..cf87142 100644 --- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java @@ -76,11 +76,13 @@ private List> parseCommits(IssueData issue) { .filter(eventData -> eventData instanceof EventData.ReferencedEventData) // filter out errors from referencing commits .filter(eventData -> ((EventData.ReferencedEventData) eventData).commit != null) - .map(eventData -> {if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).getExternal()) - { return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssueExternal") ; - } else { - return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue"); - }}); + .map(eventData -> { + if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).getExternal()) { + return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssueExternal"); + } else { + return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue"); + } + }); // Parse commits from reviews and reviews' comments if (issue.isPullRequest()) { From 443dcdc049df77a9a3c1bc32f219057224e862b5 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 12 Aug 2025 14:41:13 +0200 Subject: [PATCH 5/8] Fix spelling in 'README.md' change BE words for consistency Signed-off-by: Leo Sendelbach --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 
c326dd0..1a467d1 100644 --- a/README.md +++ b/README.md @@ -94,11 +94,11 @@ repo.getIssues(false).ifPresent(issueData -> issueData.forEach(issue -> { ### Further data processing -The data extracted by this tool can be further processed, for example using the `run-issues.py` skript from the tool [`codeface-extraction`](https://github.com/se-sic/codeface-extraction). This organises and unifies the issue data into a single .list file. It also allows for synchronisation with data from other data extraction tools, such as `codeface`. +The data extracted by this tool can be further processed, for example using the `run-issues.py` script from the tool [`codeface-extraction`](https://github.com/se-sic/codeface-extraction). This organizes and unifies the issue data into a single csv-like .list file. It also allows for synchronization with data from other data extraction tools, such as `codeface`. -### `Referenced` events +### `referenced` events -`Referenced` events are events generated in an issue if a commit references that issue in its commit message. The intended behavior is that the event is present in the issue's event data, and the commit is again present in the related commits of the issue. This does not work if it is not possible to fetch that commit. In this case, the event still exists, but it contains a link to a commit that the api cannot resolve, meaning that no data about the commit can be accessed. +`referenced` events are events generated in an issue if a commit references that issue in its commit message. The intended behavior is that the event is present in the issue's event data, and the commit is again present in the related commits of the issue. This does not work if it is not possible to fetch that commit. In this case, the event still exists, but it contains a link to a commit that the api cannot resolve, meaning that no data about the commit can be accessed. 
Known causes of this include: - a commit was rebased and changed/removed From 038bbf7182bcab06137b26ae513e11cdfe236768 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 26 Aug 2025 11:04:03 +0200 Subject: [PATCH 6/8] Update Copyright headers Signed-off-by: Leo Sendelbach --- src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java | 1 + src/de/uni_passau/fim/gitwrapper/GitHubCommit.java | 1 + src/de/uni_passau/fim/gitwrapper/GitHubRepository.java | 1 + src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java | 1 + 4 files changed, 4 insertions(+) diff --git a/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java index f52adf6..bc4b878 100644 --- a/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java @@ -1,6 +1,7 @@ /** * Copyright (C) 2016-2018 Florian Heck * Copyright (C) 2019 Thomas Bock + * Copyright (C) 2025 Leo Sendelbach * * This file is part of GitHubWrapper. * diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java index 76d0585..1897aa2 100644 --- a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java +++ b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java @@ -1,5 +1,6 @@ /** * Copyright (C) 2019 Thomas Bock + * Copyright (C) 2025 Leo Sendelbach * * This file is part of GitHubWrapper. * diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java b/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java index fa8f341..30c0f72 100644 --- a/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java +++ b/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java @@ -2,6 +2,7 @@ * Copyright (C) 2016-2020 Florian Heck * Copyright (C) 2018 Claus Hunsen * Copyright (C) 2019-2021 Thomas Bock + * Copyright (C) 2025 Leo Sendelbach * * This file is part of GitHubWrapper. 
* diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java index cf87142..e615db1 100644 --- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java @@ -1,6 +1,7 @@ /** * Copyright (C) 2016-2018 Florian Heck * Copyright (C) 2019-2020 Thomas Bock + * Copyright (C) 2025 Leo Sendelbach * * This file is part of GitHubWrapper. * From 55828ff79ee5b7b2e9d0c62b8858771a81abedd9 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Tue, 28 Oct 2025 13:50:34 +0100 Subject: [PATCH 7/8] Rename field and setter for external commits Setter previously did not use method parameter. Also renamed and added docs to match existing methods. Signed-off-by: Leo Sendelbach --- .../fim/gitwrapper/GitHubCommit.java | 21 ++++++++++++++----- .../fim/gitwrapper/IssueDataProcessor.java | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java index 1897aa2..29f8501 100644 --- a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java +++ b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java @@ -27,7 +27,7 @@ public class GitHubCommit extends Commit { private String authorUsername; private String committerUsername; private boolean addedToPullRequest = false; - private boolean isExternal = false; + private boolean external = false; /** * Constructs a new {@link GitHubCommit} with the given id made in the repo. @@ -122,11 +122,22 @@ void setAddedToPullRequest(boolean added) { this.addedToPullRequest = added; } - void setExternal(boolean external) { - this.isExternal = true; + /** + * Returns whether this commit is an external commit. 
+ * + * @return whether this commit is an external commit + */ + boolean isExternal() { + return this.external; } - boolean getExternal() { - return this.isExternal; + /** + * Sets whether this commit is an external commit + * + * @param external this commit is an external commit + */ + void setExternal(boolean external) { + this.external = external; } + } diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java index e615db1..1575c0a 100644 --- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java @@ -78,7 +78,7 @@ private List> parseCommits(IssueData issue) { // filter out errors from referencing commits .filter(eventData -> ((EventData.ReferencedEventData) eventData).commit != null) .map(eventData -> { - if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).getExternal()) { + if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).isExternal()) { return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssueExternal"); } else { return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue"); From de8b9b756867276d60fe24b11927927053e9bff8 Mon Sep 17 00:00:00 2001 From: Leo Sendelbach Date: Fri, 31 Oct 2025 15:41:49 +0100 Subject: [PATCH 8/8] Improve performance for extracting hashtags using stringbuilder instead of appending, which would result in copying the string Signed-off-by: Leo Sendelbach --- src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java index 1575c0a..dcc32e9 100644 
--- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java +++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java @@ -271,12 +271,13 @@ private List extractHashtags(String text, boolean onlyInSameRepo) { // filter out everything in code block String[] texts = text.split("```"); - text = ""; + StringBuilder sb = new StringBuilder(); for (int i = 0; i < texts.length; i++) { if (i % 2 == 0) { - text = text + texts[i]; + sb.append(texts[i]); } } + text = sb.toString(); if (onlyInSameRepo) { String repoName = repo.getRepoName();