diff --git a/src/structs/div.rs b/src/structs/div.rs new file mode 100644 index 00000000..5574ee1d --- /dev/null +++ b/src/structs/div.rs @@ -0,0 +1,113 @@ +use std::cmp::max; + +/// Fast division of |divident| < (1 << precision) by i32 divisor D. +// We need for edge DCT coefs prediction precision of 32 - 13 = 19 bits +// for quantization table values 0 < D < 65536. + +/// Usage for n / D: +// let mult: i64 = recip(D); +// let shift = mult & 0xFF; +// let mut res = ((n as i64) * (mult & !0xFF)) >> shift; +// res += if res < 0 { 1 } else { 0 }; +/// or variant lower through absolute values. + +// We use some excessive precision here, effectively precision for division +// of i32 by (q << 13) is 32 - 13 = 19 bit, but we use 20 instead +const DIV_PRECISION: i32 = 20; + +// Modified from https://web.archive.org/web/20160927064638/https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c +// Comments from original author ridiculousfish +// See also https://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html +/* + Reference implementations of computing and using the "magic number" approach to dividing + by constants, including codegen instructions. The unsigned division incorporates the + "round down" optimization per ridiculous_fish. + + This is free and unencumbered software. Any copyright is dedicated to the Public Domain. +*/ +pub fn recip(d: i32) -> i32 { + if d == 0 || d == 1 { + let mult = (1 << DIV_PRECISION) + 1; + let shift = DIV_PRECISION + 8; + return (mult << 8) | shift; + } + + // Absolute value of D + let abs_d = d.unsigned_abs() as u64; + + // The initial power of 2 is one less than the first one that can possibly work + // "two31" in Warren + let mut exponent = DIV_PRECISION + 1; //precision - 1; + let initial_power_of_2 = 1u64 << exponent; + + // Compute the absolute value of our "test numerator," + // which is the largest dividend whose remainder with d is d-1. + // This is called anc in Warren. + let tmp: u64 = (1u64 << DIV_PRECISION) + if d < 0 { 1 } else { 0 }; + let abs_test_numer: u64 = max(tmp - 1 - tmp % abs_d, abs_d - 1); + + // Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) + let mut quotient1: u64 = initial_power_of_2 / abs_test_numer; + let mut remainder1: u64 = initial_power_of_2 % abs_test_numer; + let mut quotient2: u64 = initial_power_of_2 / abs_d; + let mut remainder2: u64 = initial_power_of_2 % abs_d; + let mut delta: u64; + + // Begin our loop + loop { + // Update the exponent + exponent += 1; + + // Update quotient1 and remainder1 + quotient1 *= 2; + remainder1 *= 2; + if remainder1 >= abs_test_numer { + quotient1 += 1; + remainder1 -= abs_test_numer; + } + + // Update quotient2 and remainder2 + quotient2 *= 2; + remainder2 *= 2; + if remainder2 >= abs_d { + quotient2 += 1; + remainder2 -= abs_d; + } + + // Keep going as long as (2**exponent) / abs_d <= delta + delta = abs_d - remainder2; + // !(quotient1 < delta || (quotient1 == delta && remainder1 == 0)) + if quotient1 >= delta && (quotient1 != delta || remainder1 != 0) { + break; + } + } + + let mult = (quotient2 + 1) as i32; + debug_assert!((mult > 0) && (mult < (1 << 23))); + let shift = exponent + 8; + + return (mult << 8) | shift; +} + +/// Precalculated arrays of reciprocals +//let recip_array: [i32; 1 << 16] = core::array::from_fn(|i| recip(i)); + +#[inline(always)] +pub fn div(dividend: i32, divisor_recip: i32) -> i32 { + let abs_d = dividend.unsigned_abs() as i64; + let shift = divisor_recip & 0xFF; + let mult = (divisor_recip & !0xFF) as i64; + let res = (abs_d * mult >> shift) as i32; + if dividend < 0 { + -res + } else { + res + } +} + +// pub fn div(dividend: i32x8, divisor_recip: i32x8) +// { +// let shift = preliminary_shift + (divisor_recip & 0xFF); +// let res = (dividend as i64) * ((divisor_recip & !0xFF) as i64) >> shift; +// return (res + if res < 0 { 1 } else { 0 }) as i32; +// } diff --git a/src/structs/lepton_decoder.rs b/src/structs/lepton_decoder.rs index 90523387..8792bf3d 100644 --- a/src/structs/lepton_decoder.rs +++ b/src/structs/lepton_decoder.rs @@ -334,8 +334,7 @@ pub fn read_coefficient_block( )?; // step 3, read the DC coefficient (0,0 of the block) - let q0 = qt.get_quantization_table()[0] as i32; - let predicted_dc = pt.adv_predict_dc_pix::(&raster, q0, &neighbor_data, features); + let predicted_dc = pt.adv_predict_dc_pix::(&raster, qt, &neighbor_data, features); let coef = model.read_dc( bool_reader, @@ -351,6 +350,7 @@ pub fn read_coefficient_block( ) as i16); // neighbor summary is used as a predictor for the next block + let q0 = qt.get_quantization_table()[0] as i32; let neighbor_summary = NeighborSummary::new( predicted_dc.next_edge_pixels_h, predicted_dc.next_edge_pixels_v, diff --git a/src/structs/lepton_encoder.rs b/src/structs/lepton_encoder.rs index 3350cd8a..7bbfa20e 100644 --- a/src/structs/lepton_encoder.rs +++ b/src/structs/lepton_encoder.rs @@ -358,9 +358,8 @@ pub fn write_coefficient_block( .context()?; // finally the DC coefficient (at 0,0) - let q0 = qt.get_quantization_table()[0] as i32; let predicted_val = - pt.adv_predict_dc_pix::(&raster, q0, &neighbors_data, features); + pt.adv_predict_dc_pix::(&raster, qt, &neighbors_data, features); let avg_predicted_dc = ProbabilityTables::adv_predict_or_unpredict_dc( here_tr.get_dc(), @@ -389,6 +388,7 @@ pub fn write_coefficient_block( .context()?; // neighbor summary is used as a predictor for the next block + let q0 = qt.get_quantization_table()[0] as i32; let neighbor_summary = NeighborSummary::new( predicted_val.next_edge_pixels_h, predicted_val.next_edge_pixels_v, diff --git a/src/structs/mod.rs b/src/structs/mod.rs index e6ef2bf0..c270c216 100644 --- a/src/structs/mod.rs +++ b/src/structs/mod.rs @@ -11,6 +11,7 @@ mod block_context; mod branch; +mod div; mod idct; mod lepton_decoder; mod lepton_encoder; diff --git a/src/structs/probability_tables.rs b/src/structs/probability_tables.rs index 30403aef..ad728ee4 100644 --- a/src/structs/probability_tables.rs +++ b/src/structs/probability_tables.rs @@ -11,6 +11,7 @@ use crate::consts::*; use crate::enabled_features; use crate::jpeg::block_based_image::AlignedBlock; use crate::structs::block_context::NeighborData; +use crate::structs::div::div; use crate::structs::idct::*; use crate::structs::model::*; use crate::structs::quantization_tables::*; @@ -219,20 +220,30 @@ impl ProbabilityTables { return 0; } - let mut best_prior: i32 = pred[if HORIZONTAL { + let best_prior: i32 = pred[if HORIZONTAL { coefficient_tr >> 3 } else { coefficient_tr }]; - best_prior /= (qt.get_quantization_table_transposed()[coefficient_tr] as i32) << 13; + //best_prior / ((qt.get_quantization_table_transposed()[coefficient_tr] as i32) << 13) + let recip = qt.get_recip(if HORIZONTAL { + coefficient_tr >> 3 + } else { + coefficient_tr + 8 + }); + let prior = div((best_prior.unsigned_abs() >> 13) as i32, recip); - best_prior + if best_prior < 0 { + -prior + } else { + prior + } } pub fn adv_predict_dc_pix( &self, raster_cols: &[i32x8; 8], - q0: i32, + qt: &QuantizationTables, //q0: i32, neighbor_data: &NeighborData, enabled_features: &enabled_features::EnabledFeatures, ) -> PredictDCResult { @@ -326,8 +337,12 @@ impl ProbabilityTables { let uncertainty2_val = (far_afield_value >> 3) as i16; + //let q0 = qt.get_quantization_table()[0] as i32; + let q0 = qt.get_recip(0); + let div = div(avgmed, q0); return PredictDCResult { - predicted_dc: (avgmed / q0 + 4) >> 3, + //predicted_dc: (avgmed / q0 + 4) >> 3, + predicted_dc: (div + 4) >> 3, uncertainty: uncertainty_val, uncertainty2: uncertainty2_val, next_edge_pixels_h, diff --git a/src/structs/quantization_tables.rs b/src/structs/quantization_tables.rs index 12ffaed8..240725b0 100644 --- a/src/structs/quantization_tables.rs +++ b/src/structs/quantization_tables.rs @@ -8,6 +8,7 @@ use crate::consts::*; use crate::helpers::*; use crate::jpeg::jpeg_header::JpegHeader; use crate::lepton_error::err_exit_code; +use crate::structs::div::recip; use crate::{ExitCode, Result}; pub struct QuantizationTables { @@ -18,6 +19,8 @@ pub struct QuantizationTables { // Calculated using approximate maximal magnitudes // of these coefficients `FREQ_MAX` min_noise_threshold: [u8; 14], + // Reciprocals for division-by-multiplicatio scheme + recip: [i32; 16], } impl QuantizationTables { @@ -32,6 +35,7 @@ impl QuantizationTables { quantization_table: [0; 64], quantization_table_transposed: [0; 64], min_noise_threshold: [0; 14], + recip: [0; 16], }; for pixel_row in 0..8 { @@ -59,6 +63,10 @@ impl QuantizationTables { } } } + for i in 0..16 { + let coord = if i < 8 { i } else { (i - 8) * 8 }; + retval.recip[i] = recip(retval.quantization_table[coord] as i32); + } retval } @@ -97,4 +105,8 @@ impl QuantizationTables { pub fn get_min_noise_threshold(&self, coef: usize) -> u8 { self.min_noise_threshold[coef] } + + pub fn get_recip(&self, coef: usize) -> i32 { + self.recip[coef] + } }