From b01052ac59b67630a20218088bf734c7d9199121 Mon Sep 17 00:00:00 2001 From: taserz <852984+taserz@users.noreply.github.com> Date: Tue, 12 May 2026 19:29:21 -0400 Subject: [PATCH] Angular Separation and Chord Distance: fix cosine similarity calculation Fixes #140. Both functions had the wrong denominator for cosine similarity. They were accumulating the sum of raw frequencies instead of the sum of squared frequencies. For a normalized histogram sum(xi) = 1, so the denominator collapsed to 1 * 1 = 1 and both functions were effectively returning the dot product with no normalization. Fixed by accumulating sum(xi^2) and sum(yi^2) in the loop and using sqrt(sumXX * sumYY) as the denominator, which is the standard L2 norm. Angular Separation now returns arccos(cosine) / pi instead of 1 - cosine. The old formula is a cosine distance approximation, not an actual angular measure. The arccos version gives a properly normalized angle in [0, 1]. Chord Distance's sqrt(2 - 2 * cosine) formula was already mathematically correct and just needed an accurate cosine value to work properly. Test expected values updated: identical vectors now correctly return 0.0 for both functions instead of 0.9 and sqrt(1.8). Co-Authored-By: Claude Sonnet 4.6 --- .../jgaap/distances/AngularSeparationDistance.java | 13 +++++++------ src/com/jgaap/distances/ChordDistance.java | 13 +++++++------ .../distances/AngularSeparationDistanceTest.java | 8 ++++---- .../com/jgaap/distances/ChordDistanceTest.java | 2 +- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/src/com/jgaap/distances/AngularSeparationDistance.java b/src/com/jgaap/distances/AngularSeparationDistance.java index 6c0040682..ff1a4c293 100644 --- a/src/com/jgaap/distances/AngularSeparationDistance.java +++ b/src/com/jgaap/distances/AngularSeparationDistance.java @@ -10,10 +10,10 @@ /** * Angular Separation Distance - * d = 1 - ( sum( xi * yi ) / sqrt( sum(xi)^2 * sum( yi )^2 ) ) - * + * d = arccos( sum(xi*yi) / sqrt(sum(xi^2) * sum(yi^2)) ) / pi + * * @author Adam Sargent - * @version 1.0 + * @version 1.1 */ public class AngularSeparationDistance extends DistanceFunction { @@ -43,10 +43,11 @@ public double distance(Histogram unknownHistogram, Histogram knownHistogram) for(Event event : events){ sumNumer += unknownHistogram.relativeFrequency(event) * knownHistogram.relativeFrequency(event); - sumUnknown += unknownHistogram.relativeFrequency(event); - sumKnown += knownHistogram.relativeFrequency(event); + sumUnknown += Math.pow(unknownHistogram.relativeFrequency(event), 2); + sumKnown += Math.pow(knownHistogram.relativeFrequency(event), 2); } - distance = 1 - (sumNumer / Math.sqrt(sumUnknown * sumUnknown * sumKnown * sumKnown)); + double cosine = sumNumer / Math.sqrt(sumUnknown * sumKnown); + distance = Math.acos(Math.min(1.0, cosine)) / Math.PI; return distance; } diff --git a/src/com/jgaap/distances/ChordDistance.java b/src/com/jgaap/distances/ChordDistance.java index 153531264..d5a39b33c 100644 --- a/src/com/jgaap/distances/ChordDistance.java +++ b/src/com/jgaap/distances/ChordDistance.java @@ -10,10 +10,10 @@ /** * Chord Distance - * d = sqrt( 2 - 2 * (sum(xi * yi)/sqrt( sum(xi)^2 * sum(yi)^2 )) ) - * + * d = sqrt( 2 - 2 * (sum(xi*yi) / sqrt(sum(xi^2) * sum(yi^2))) ) + * * @author Adam Sargent - * @version 1.0 + * @version 1.1 */ public class ChordDistance extends DistanceFunction { @@ -42,10 +42,11 @@ public double distance(Histogram unknownHistogram, Histogram knownHistogram) for(Event event : events){ sumNumer += unknownHistogram.relativeFrequency(event) * knownHistogram.relativeFrequency(event); - sumUnknown += unknownHistogram.relativeFrequency(event); - sumKnown += knownHistogram.relativeFrequency(event); + sumUnknown += Math.pow(unknownHistogram.relativeFrequency(event), 2); + sumKnown += Math.pow(knownHistogram.relativeFrequency(event), 2); } - distance = Math.sqrt(2 - 2 * (sumNumer / Math.sqrt(sumUnknown * sumUnknown * sumKnown * sumKnown))); + double cosine = sumNumer / Math.sqrt(sumUnknown * sumKnown); + distance = Math.sqrt(2 - 2 * Math.min(1.0, cosine)); return distance; } diff --git a/unittests/com/jgaap/distances/AngularSeparationDistanceTest.java b/unittests/com/jgaap/distances/AngularSeparationDistanceTest.java index 8b25959b4..a911d4d2c 100644 --- a/unittests/com/jgaap/distances/AngularSeparationDistanceTest.java +++ b/unittests/com/jgaap/distances/AngularSeparationDistanceTest.java @@ -32,9 +32,9 @@ public void testDistance() set1.addEvents(test1); set2.addEvents(test1); double result = new AngularSeparationDistance().distance(new EventMap(set1), new EventMap(set2)); - assertTrue(DistanceTestHelper.inRange(result, 0.90, 0.0000000001)); - - + assertTrue(DistanceTestHelper.inRange(result, 0.0, 0.0000000001)); + + set2 = new EventSet(); Vector test2 = new Vector(); test2.add(new Event("1", null)); @@ -49,7 +49,7 @@ public void testDistance() test2.add(new Event("10", null)); set2.addEvents(test2); result = new AngularSeparationDistance().distance(new EventMap(set1), new EventMap(set2)); - assertTrue(DistanceTestHelper.inRange(result, 1.0, 0.0000000001)); + assertTrue(DistanceTestHelper.inRange(result, 0.5, 0.0000000001)); } } diff --git a/unittests/com/jgaap/distances/ChordDistanceTest.java b/unittests/com/jgaap/distances/ChordDistanceTest.java index 8e1c1cd3e..648cd0f85 100644 --- a/unittests/com/jgaap/distances/ChordDistanceTest.java +++ b/unittests/com/jgaap/distances/ChordDistanceTest.java @@ -32,7 +32,7 @@ public void testDistance() set1.addEvents(test1); set2.addEvents(test1); double result = new ChordDistance().distance(new EventMap(set1), new EventMap(set2)); - assertTrue(DistanceTestHelper.inRange(result, Math.sqrt(1.8), 0.0000000001)); + assertTrue(DistanceTestHelper.inRange(result, 0.0, 0.0000000001)); set2 = new EventSet();