diff --git a/README.md b/README.md
index 1a7e487..1a467d1 100644
--- a/README.md
+++ b/README.md
@@ -91,3 +91,19 @@ repo.getIssues(false).ifPresent(issueData -> issueData.forEach(issue -> {
System.out.println(comment.user.username + ": " + comment.body));
}));
```
+
+### Further data processing
+
+The data extracted by this tool can be further processed, for example using the `run-issues.py` script from the tool [`codeface-extraction`](https://github.com/se-sic/codeface-extraction). This script organizes and unifies the issue data into a single CSV-like `.list` file. It also allows for synchronization with data from other data extraction tools, such as `codeface`.
+
+### `referenced` events
+
+`referenced` events are generated in an issue when a commit references that issue in its commit message. The intended behavior is that the event is present in the issue's event data and that the commit is also present in the related commits of the issue. This does not work if the commit cannot be fetched. In this case, the event still exists, but it contains a link to a commit that the API cannot resolve, meaning that no data about the commit can be accessed.
+Known causes of this include:
+
+- a commit was rebased and changed/removed
+- an external repository was deleted
+- the commit's branch was deleted
+
+Note that the commit might still be reachable until the automatic garbage collection has removed it from the remote repository.
+An unresolvable commit is not problematic in itself. However, when further processing the data using `codeface-extraction`, it may lead to these `referenced` events being present in the final data, even though they should be filtered out as part of the issue processing.
diff --git a/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java
index 40d49a0..bc4b878 100644
--- a/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java
+++ b/src/de/uni_passau/fim/gitwrapper/EventDataProcessor.java
@@ -1,6 +1,7 @@
/**
* Copyright (C) 2016-2018 Florian Heck
* Copyright (C) 2019 Thomas Bock
+ * Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
@@ -86,8 +87,12 @@ public void postDeserialize(EventData.ReferencedEventData result, JsonElement sr
}
result.commit = repo.getGithubCommit(hash.getAsString()).orElseGet(() -> {
- LOG.warning("Found commit unknown to GitHub and local git repo: " + hash);
- return null;
+ LOG.warning("Found commit unknown to GitHub and local git repo: " + hash + " Retry using url...");
+ JsonElement url = src.getAsJsonObject().get("commit_url");
+ return repo.getGithubCommitUrl(hash.getAsString(), url.getAsString()).orElseGet(() -> {
+ LOG.warning("Could not find commit: " + hash);
+ return null;
+ });
});
}
diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java
index 16a2429..29f8501 100644
--- a/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java
+++ b/src/de/uni_passau/fim/gitwrapper/GitHubCommit.java
@@ -1,5 +1,6 @@
/**
* Copyright (C) 2019 Thomas Bock
+ * Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
@@ -26,6 +27,7 @@ public class GitHubCommit extends Commit {
private String authorUsername;
private String committerUsername;
private boolean addedToPullRequest = false;
+ private boolean external = false;
/**
* Constructs a new {@link GitHubCommit} with the given id made in the repo.
@@ -119,4 +121,23 @@ public boolean isAddedToPullRequest() {
void setAddedToPullRequest(boolean added) {
this.addedToPullRequest = added;
}
+
+ /**
+  * Returns whether this commit is marked as external; {@code GitHubRepository#getGithubCommitUrl} sets this mark.
+  *
+  * @return {@code true} if this commit was marked as external, {@code false} (the default) otherwise
+  */
+ boolean isExternal() {
+ return this.external;
+ }
+
+ /**
+  * Sets whether this commit is an external commit.
+  *
+  * @param external {@code true} if this commit is an external commit, {@code false} otherwise
+  */
+ void setExternal(boolean external) {
+ this.external = external;
+ }
+
}
diff --git a/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java b/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java
index 335f6cf..30c0f72 100644
--- a/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java
+++ b/src/de/uni_passau/fim/gitwrapper/GitHubRepository.java
@@ -2,6 +2,7 @@
* Copyright (C) 2016-2020 Florian Heck
* Copyright (C) 2018 Claus Hunsen
* Copyright (C) 2019-2021 Thomas Bock
+ * Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
@@ -352,6 +353,8 @@ public Optional> getIssues(boolean includePullRequests, OffsetDa
}
else timeLimit = "";
Type finalType = type;
+ // For debugging, you may add additional parameters to the string. For example, '/issues?creator=sleo&state=all'
+ // will fetch issues created by user 'sleo' and all related issues and commits.
getJSONStringFromPath("/issues?state=all" + timeLimit).map(json -> {
List data;
try {
@@ -367,7 +370,7 @@ public Optional> getIssues(boolean includePullRequests, OffsetDa
threadPool.submit(() -> data.parallelStream().forEach(IssueData::freeze));
} catch (JsonSyntaxException e) {
- LOG.warning("Encountered invalid JSON: " + json);
+ LOG.warning("Encountered invalid JSON: " + json + "\n\n" + e.getMessage() + "\n\n" + e);
return null;
}
return data;
@@ -1028,6 +1031,38 @@ Optional getGithubCommit(String hash) {
});
}
+ /**
+  * Fetches the commit with the given hash directly from the given url, marks it as external and caches the result
+  * (even if empty) in {@code checkedHashes}. Intended as fallback if {@link #getGithubCommit(String)} finds nothing.
+  *
+  * @param hash the hash of the commit, used as key for the cache
+  * @param url  the url to fetch the commit data from
+  * @return the commit (in offline mode: the dummy commit), or an empty Optional if it cannot be fetched or parsed
+  */
+ Optional<GitHubCommit> getGithubCommitUrl(String hash, String url) {
+     if (offline.get()) {
+         return Optional.of(getGHCommitUnchecked(DummyCommit.DUMMY_COMMIT_ID));
+     }
+     Type commitType = new TypeToken<GitHubCommit>() {}.getType();
+     Optional<GitHubCommit> res;
+     try {
+         res = getJSONStringFromURL(url).map(commitInfo -> gson.fromJson(commitInfo, commitType));
+     } catch (JsonSyntaxException e) {
+         // Perhaps ill-encoded characters in patches within the files element, which is not needed: retry without it.
+         LOG.info("Malformed JSON String when querying data for commit " + url + ". Neglect files element.");
+         try {
+             res = getJSONStringFromURL(url).map(json -> gson.fromJson(
+                     StringUtils.substringBefore(json, "\"files\":[") + "\"files\":[]}", commitType));
+         } catch (JsonSyntaxException e2) {
+             LOG.warning("Could not parse data for commit " + url + ": " + e2.getMessage());
+             res = Optional.empty();
+         }
+     }
+     res.ifPresent(commit -> commit.setExternal(true));
+     checkedHashes.put(hash, res);
+     return res;
+ }
+
/**
* Creates a new Commit with the given data, and tries to fill in the missing data from the local Repository
*
diff --git a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java
index b9ced2f..dcc32e9 100644
--- a/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java
+++ b/src/de/uni_passau/fim/gitwrapper/IssueDataProcessor.java
@@ -1,6 +1,7 @@
/**
* Copyright (C) 2016-2018 Florian Heck
* Copyright (C) 2019-2020 Thomas Bock
+ * Copyright (C) 2025 Leo Sendelbach
*
* This file is part of GitHubWrapper.
*
@@ -76,7 +77,13 @@ private List> parseCommits(IssueData issue) {
.filter(eventData -> eventData instanceof EventData.ReferencedEventData)
// filter out errors from referencing commits
.filter(eventData -> ((EventData.ReferencedEventData) eventData).commit != null)
- .map(eventData -> new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue"));
+ .map(eventData -> {
+ if (((GitHubCommit) ((EventData.ReferencedEventData) eventData).commit).isExternal()) {
+ return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssueExternal");
+ } else {
+ return new ReferencedLink<>(Collections.singletonList(((EventData.ReferencedEventData) eventData).commit.getId()), eventData.user, eventData.created_at, "commitReferencesIssue");
+ }
+ });
// Parse commits from reviews and reviews' comments
if (issue.isPullRequest()) {
@@ -262,6 +269,16 @@ private List extractHashtags(String text, boolean onlyInSameRepo) {
}
Pattern hashtagPattern;
+ // filter out everything in code block
+ String[] texts = text.split("```");
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < texts.length; i++) {
+ if (i % 2 == 0) {
+ sb.append(texts[i]);
+ }
+ }
+ text = sb.toString();
+
if (onlyInSameRepo) {
String repoName = repo.getRepoName();
String repoUser = repo.getRepoUser();
@@ -379,6 +396,7 @@ public void postDeserialize(IssueData result, JsonElement src, Gson gson) {
Optional>> comments = repo.getComments(lookup);
result.setComments(comments.orElse(Collections.emptyList()));
}
+
if (result.getEventsList() == null) {
Optional> events = repo.getEvents(lookup);
result.setEvents(events.orElse(Collections.emptyList()));