From 0a6d906fdaca4ec1b60cb3ac1f310f709ce18d87 Mon Sep 17 00:00:00 2001 From: taserz <852984+taserz@users.noreply.github.com> Date: Tue, 12 May 2026 19:28:54 -0400 Subject: [PATCH] KL Divergence: use epsilon smoothing for zero-frequency events Fixes #139. The divergence function was iterating only over events in the unknown histogram and silently skipping any event where the known histogram frequency was zero. Events present in the unknown document but absent in the known one were dropped entirely, understating the divergence and discarding a real authorship signal. Now iterates over the union of both histograms and substitutes epsilon (1e-10) for any zero known-frequency term instead of skipping it. Behavior is unchanged for events where both frequencies are positive. Co-Authored-By: Claude Sonnet 4.6 --- .../jgaap/distances/KullbackLeiblerDivergence.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/com/jgaap/distances/KullbackLeiblerDivergence.java b/src/com/jgaap/distances/KullbackLeiblerDivergence.java index fe92111df..87422be4d 100644 --- a/src/com/jgaap/distances/KullbackLeiblerDivergence.java +++ b/src/com/jgaap/distances/KullbackLeiblerDivergence.java @@ -19,6 +19,9 @@ **/ package com.jgaap.distances; +import java.util.Set; + +import com.google.common.collect.Sets; import com.jgaap.generics.DivergenceFunction; import com.jgaap.util.Event; import com.jgaap.util.Histogram; @@ -58,10 +61,15 @@ public boolean showInGUI(){ @Override public double divergence(Histogram unknownHistogram, Histogram knownHistogram) { double distance = 0; + double epsilon = 1e-10; - for(Event event : unknownHistogram.uniqueEvents()) { - if(knownHistogram.contains(event)){ - distance += unknownHistogram.relativeFrequency(event) * Math.log(unknownHistogram.relativeFrequency(event)/knownHistogram.relativeFrequency(event)); + Set<Event> events = Sets.union(unknownHistogram.uniqueEvents(), knownHistogram.uniqueEvents()); + for (Event event : events) { + 
double p = unknownHistogram.relativeFrequency(event); + if (p > 0) { + double q = knownHistogram.relativeFrequency(event); + double smoothedQ = (q > 0) ? q : epsilon; + distance += p * Math.log(p / smoothedQ); } } return Math.abs(distance);