microsoft · Melirius · Jan 18, 2025 · Jan 18, 2025 · Jan 30, 2025 · mcroomp
diff --git a/src/structs/div.rs b/src/structs/div.rs
@@ -0,0 +1,113 @@
+use std::cmp::max;
+
+/// Fast division of |divident| < (1 << precision) by i32 divisor D.
+// We need for edge DCT coefs prediction precision of 32 - 13 = 19 bits
+// for quantization table values 0 < D < 65536.
+
+/// Usage for n / D:
+// let mult: i64 = recip(D);
+// let shift = mult & 0xFF;
+// let mut res = ((n as i64) * (mult & !0xFF)) >> shift;
+// res += if res < 0 { 1 } else { 0 };
+/// or variant lower through absolute values.
+
+// We use some excessive precision here, effectively precision for division
+// of i32 by (q << 13) is 32 - 13 = 19 bit, but we use 20 instead
+const DIV_PRECISION: i32 = 20;
+
+// Modified from https://web.archive.org/web/20160927064638/https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
+// Comments from original author ridiculousfish
+// See also https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
+/*
+  Reference implementations of computing and using the "magic number" approach to dividing
+  by constants, including codegen instructions. The unsigned division incorporates the
+  "round down" optimization per ridiculous_fish.
+
+  This is free and unencumbered software. Any copyright is dedicated to the Public Domain.
+*/
+pub fn recip(d: i32) -> i32 {
+    if d == 0 || d == 1 {
+        let mult = (1 << DIV_PRECISION) + 1;
+        let shift = DIV_PRECISION + 8;
+        return (mult << 8) | shift;
+    }
+
+    // Absolute value of D
+    let abs_d = d.unsigned_abs() as u64;
+
+    // The initial power of 2 is one less than the first one that can possibly work
+    // "two31" in Warren
+    let mut exponent = DIV_PRECISION + 1; //precision - 1;
+    let initial_power_of_2 = 1u64 << exponent;
+
+    // Compute the absolute value of our "test numerator,"
+    // which is the largest dividend whose remainder with d is d-1.
+    // This is called anc in Warren.
+    let tmp: u64 = (1u64 << DIV_PRECISION) + if d < 0 { 1 } else { 0 };
+    let abs_test_numer: u64 = max(tmp - 1 - tmp % abs_d, abs_d - 1);
+
+    // Initialize our quotients and remainders (q1, r1, q2, r2 in Warren)
+    let mut quotient1: u64 = initial_power_of_2 / abs_test_numer;
+    let mut remainder1: u64 = initial_power_of_2 % abs_test_numer;
+    let mut quotient2: u64 = initial_power_of_2 / abs_d;
+    let mut remainder2: u64 = initial_power_of_2 % abs_d;
+    let mut delta: u64;
+
+    // Begin our loop
+    loop {
+        // Update the exponent
+        exponent += 1;
+
+        // Update quotient1 and remainder1
+        quotient1 *= 2;
+        remainder1 *= 2;
+        if remainder1 >= abs_test_numer {
+            quotient1 += 1;
+            remainder1 -= abs_test_numer;
+        }
+
+        // Update quotient2 and remainder2
+        quotient2 *= 2;
+        remainder2 *= 2;
+        if remainder2 >= abs_d {
+            quotient2 += 1;
+            remainder2 -= abs_d;
+        }
+
+        // Keep going as long as (2**exponent) / abs_d <= delta
+        delta = abs_d - remainder2;
+        // !(quotient1 < delta || (quotient1 == delta && remainder1 == 0))
+        if quotient1 >= delta && (quotient1 != delta || remainder1 != 0) {
+            break;
+        }
+    }
+
+    let mult = (quotient2 + 1) as i32;
+    debug_assert!((mult > 0) && (mult < (1 << 23)));
+    let shift = exponent + 8;
+
+    return (mult << 8) | shift;
+}
+
+/// Precalculated arrays of reciprocals
+//let recip_array: [i32; 1 << 16] = core::array::from_fn(|i| recip(i));
+
+#[inline(always)]
+pub fn div(dividend: i32, divisor_recip: i32) -> i32 {
+    let abs_d = dividend.unsigned_abs() as i64;
+    let shift = divisor_recip & 0xFF;
+    let mult = (divisor_recip & !0xFF) as i64;
+    let res = (abs_d * mult >> shift) as i32;
+    if dividend < 0 {
+        -res
+    } else {
+        res
+    }
+}
+
+// pub fn div(dividend: i32x8, divisor_recip: i32x8)
+// {
+//     let shift = preliminary_shift + (divisor_recip & 0xFF);
+//     let res = (dividend as i64) * ((divisor_recip & !0xFF) as i64) >> shift;
+//     return (res + if res < 0 { 1 } else { 0 }) as i32;
+// }
diff --git a/src/structs/lepton_decoder.rs b/src/structs/lepton_decoder.rs
@@ -334,8 +334,7 @@ pub fn read_coefficient_block<const ALL_PRESENT: bool, R: Read>(
     )?;
 
     // step 3, read the DC coefficient (0,0 of the block)
-    let q0 = qt.get_quantization_table()[0] as i32;
-    let predicted_dc = pt.adv_predict_dc_pix::<ALL_PRESENT>(&raster, q0, &neighbor_data, features);
+    let predicted_dc = pt.adv_predict_dc_pix::<ALL_PRESENT>(&raster, qt, &neighbor_data, features);
 
     let coef = model.read_dc(
         bool_reader,
@@ -351,6 +350,7 @@ pub fn read_coefficient_block<const ALL_PRESENT: bool, R: Read>(
     ) as i16);
 
     // neighbor summary is used as a predictor for the next block
+    let q0 = qt.get_quantization_table()[0] as i32;
     let neighbor_summary = NeighborSummary::new(
         predicted_dc.next_edge_pixels_h,
         predicted_dc.next_edge_pixels_v,

diff --git a/src/structs/lepton_encoder.rs b/src/structs/lepton_encoder.rs
@@ -358,9 +358,8 @@ pub fn write_coefficient_block<const ALL_PRESENT: bool, W: Write>(
     .context()?;
 
     // finally the DC coefficient (at 0,0)
-    let q0 = qt.get_quantization_table()[0] as i32;
     let predicted_val =
-        pt.adv_predict_dc_pix::<ALL_PRESENT>(&raster, q0, &neighbors_data, features);
+        pt.adv_predict_dc_pix::<ALL_PRESENT>(&raster, qt, &neighbors_data, features);
 
     let avg_predicted_dc = ProbabilityTables::adv_predict_or_unpredict_dc(
         here_tr.get_dc(),
@@ -389,6 +388,7 @@ pub fn write_coefficient_block<const ALL_PRESENT: bool, W: Write>(
         .context()?;
 
     // neighbor summary is used as a predictor for the next block
+    let q0 = qt.get_quantization_table()[0] as i32;
     let neighbor_summary = NeighborSummary::new(
         predicted_val.next_edge_pixels_h,
         predicted_val.next_edge_pixels_v,

diff --git a/src/structs/mod.rs b/src/structs/mod.rs
@@ -11,6 +11,7 @@
 
 mod block_context;
 mod branch;
+mod div;
 mod idct;
 mod lepton_decoder;
 mod lepton_encoder;

diff --git a/src/structs/probability_tables.rs b/src/structs/probability_tables.rs
@@ -11,6 +11,7 @@ use crate::consts::*;
 use crate::enabled_features;
 use crate::jpeg::block_based_image::AlignedBlock;
 use crate::structs::block_context::NeighborData;
+use crate::structs::div::div;
 use crate::structs::idct::*;
 use crate::structs::model::*;
 use crate::structs::quantization_tables::*;
@@ -219,20 +220,30 @@ impl ProbabilityTables {
             return 0;
         }
 
-        let mut best_prior: i32 = pred[if HORIZONTAL {
+        let best_prior: i32 = pred[if HORIZONTAL {
             coefficient_tr >> 3
         } else {
             coefficient_tr
         }];
-        best_prior /= (qt.get_quantization_table_transposed()[coefficient_tr] as i32) << 13;
+        //best_prior / ((qt.get_quantization_table_transposed()[coefficient_tr] as i32) << 13)
+        let recip = qt.get_recip(if HORIZONTAL {
+            coefficient_tr >> 3
+        } else {
+            coefficient_tr + 8
+        });
+        let prior = div((best_prior.unsigned_abs() >> 13) as i32, recip);
 
-        best_prior
+        if best_prior < 0 {
+            -prior
+        } else {
+            prior
+        }
     }
 
     pub fn adv_predict_dc_pix<const ALL_PRESENT: bool>(
         &self,
         raster_cols: &[i32x8; 8],
-        q0: i32,
+        qt: &QuantizationTables, //q0: i32,
         neighbor_data: &NeighborData,
         enabled_features: &enabled_features::EnabledFeatures,
     ) -> PredictDCResult {
@@ -326,8 +337,12 @@ impl ProbabilityTables {
 
         let uncertainty2_val = (far_afield_value >> 3) as i16;
 
+        //let q0 = qt.get_quantization_table()[0] as i32;
+        let q0 = qt.get_recip(0);
+        let div = div(avgmed, q0);
         return PredictDCResult {
-            predicted_dc: (avgmed / q0 + 4) >> 3,
+            //predicted_dc: (avgmed / q0 + 4) >> 3,
+            predicted_dc: (div + 4) >> 3,
             uncertainty: uncertainty_val,
             uncertainty2: uncertainty2_val,
             next_edge_pixels_h,

diff --git a/src/structs/quantization_tables.rs b/src/structs/quantization_tables.rs
@@ -8,6 +8,7 @@ use crate::consts::*;
 use crate::helpers::*;
 use crate::jpeg::jpeg_header::JpegHeader;
 use crate::lepton_error::err_exit_code;
+use crate::structs::div::recip;
 use crate::{ExitCode, Result};
 
 pub struct QuantizationTables {
@@ -18,6 +19,8 @@ pub struct QuantizationTables {
     // Calculated using approximate maximal magnitudes
     // of these coefficients `FREQ_MAX`
     min_noise_threshold: [u8; 14],
+    // Reciprocals for division-by-multiplicatio scheme
+    recip: [i32; 16],
 }
 
 impl QuantizationTables {
@@ -32,6 +35,7 @@ impl QuantizationTables {
             quantization_table: [0; 64],
             quantization_table_transposed: [0; 64],
             min_noise_threshold: [0; 14],
+            recip: [0; 16],
         };
 
         for pixel_row in 0..8 {
@@ -59,6 +63,10 @@ impl QuantizationTables {
                 }
             }
         }
+        for i in 0..16 {
+            let coord = if i < 8 { i } else { (i - 8) * 8 };
+            retval.recip[i] = recip(retval.quantization_table[coord] as i32);
+        }
 
         retval
     }
@@ -97,4 +105,8 @@ impl QuantizationTables {
     pub fn get_min_noise_threshold(&self, coef: usize) -> u8 {
         self.min_noise_threshold[coef]
     }
+
+    pub fn get_recip(&self, coef: usize) -> i32 {
+        self.recip[coef]
+    }
 }