Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions lib/src/helpers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -61,20 +61,20 @@ pub fn has_ff(v: u64) -> bool {
}

#[inline(always)]
pub const fn devli(s: u8, value: u16) -> i16 {
let shifted = 1 << s;
pub const fn devli(s: u32, value: u32) -> i16 {
let shifted = (1 << s) - 1;

if value & (shifted >> 1) != 0 {
if (value << 1) > shifted {
value as i16
} else {
value.wrapping_add(2).wrapping_add(!shifted) as i16
value.wrapping_sub(shifted) as i16
}
}

/// check to make sure the behavior hasn't changed even with the optimization
#[test]
fn devli_test() {
for s in 0u8..15 {
for s in 0u32..15 {
for value in 0..(1 << s) {
assert_eq!(
devli(s, value),
Expand Down
134 changes: 90 additions & 44 deletions lib/src/jpeg/bit_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::io::BufRead;

use super::jpeg_code;
use crate::helpers::has_ff;
use crate::lepton_error::{ExitCode, err_exit_code};
use crate::lepton_error::{ExitCode, Result, err_exit_code};
use crate::{LeptonError, StreamPosition};

// Implemenation of bit reader on top of JPEG data stream as read by a reader
Expand Down Expand Up @@ -59,25 +59,21 @@ impl<R: BufRead + StreamPosition> BitReader<R> {

impl<R: BufRead> BitReader<R> {
#[inline(always)]
pub fn read(&mut self, bits_to_read: u32) -> std::io::Result<u16> {
if bits_to_read == 0 {
return Ok(0);
}

pub fn read(&mut self, bits_to_read: u32) -> Result<u32> {
if self.bits_left < bits_to_read {
self.fill_register(bits_to_read)?;
self.fill_register_slow(bits_to_read)?;
}

let retval =
(self.bits >> (self.bits_left - bits_to_read) & ((1 << bits_to_read) - 1)) as u16;
((self.bits >> (self.bits_left - bits_to_read)) as u32) & ((1 << bits_to_read) - 1);
self.bits_left -= bits_to_read;
return Ok(retval);
}

#[inline(always)]
pub fn peek(&self) -> (u8, u32) {
pub fn peek(&self) -> (u32, u32) {
(
((self.bits.wrapping_shl(64 - self.bits_left)) >> 56) as u8,
((self.bits.wrapping_shl(64 - self.bits_left)) >> 56) as u32,
self.bits_left,
)
}
Expand All @@ -87,44 +83,95 @@ impl<R: BufRead> BitReader<R> {
self.bits_left -= bits;
}

/// Fills the register as much as possible with the current buffer in
/// a non-destructive manner. This requires the BufRead implementation to
/// have enough space available.
#[inline(always)]
pub fn fill_register(&mut self, bits_to_read: u32) -> Result<(), std::io::Error> {
pub fn optimistic_fill(&mut self) {
if self.bits_left < 8 {
self.optimistic_fill_slow();
}
}

/// Assuming that there are less than 8 bits in our buffer, fill the rest of the buffer
/// with as many bits as possible, leaving the possibility of unwinding via the undo_read_ahead
/// function in certain corner cases.
#[inline(never)]
#[cold]

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this really helps with performance? By the rule of thumb refill should be a bit too frequent to being cold. I'll check on my box.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The tradeoff was with having more registers free for the inner loop, but it's possible that it's faster without the cold. fill_register is definitely cold since it gets called almost never.

@Melirius Melirius Jan 12, 2025

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With cold:

ivan@ivan-5950:~/lepton_jpeg_rust$ sudo rm images/img_52MP_7k2.lep; sudo perf stat -B -e cache-references,cache-misses,cycles,ic_fetch_stall.ic_stall_back_pressure,stalled-cycles-frontend,instructions,branch-instructions,branch-misses,ic_fetch_stall.ic_stall_any,ic_fetch_stall.ic_stall_dq_empty,l2_cache_misses_from_ic_miss,l2_latency.l2_cycles_waiting_on_fills,faults,migrations taskset -c 10 nice -n -20 target/release/lepton_jpeg_util images/img_52MP_7k.jpg images/img_52MP_7k2.lep
2025-01-12T14:26:43.217Z INFO  [lepton_jpeg::structs::lepton_file_writer] compressing to Lepton format
2025-01-12T14:26:43.583Z INFO  [lepton_jpeg::structs::lepton_file_writer] Number of threads: 8
2025-01-12T14:26:44.712Z INFO  [lepton_jpeg::structs::lepton_file_writer] worker threads 7732ms of CPU time in 1127ms of wall time
2025-01-12T14:26:44.712Z INFO  [lepton_jpeg::structs::lepton_file_writer] decompressing to verify contents
2025-01-12T14:26:46.304Z INFO  [lepton_jpeg_util] compressed input 22171278, output 17324074 bytes (compression = 28.0%)
2025-01-12T14:26:46.304Z INFO  [lepton_jpeg_util] Main thread CPU: 3095ms, Worker thread CPU: 18774 ms, walltime: 3095 ms

 Performance counter stats for 'taskset -c 10 nice -n -20 target/release/lepton_jpeg_util images/img_52MP_7k.jpg images/img_52MP_7k2.lep':

       857 527 770      cache-references                                                        (25,32%)
        76 481 649      cache-misses                     #    8,92% of all cache refs           (25,47%)
    14 095 457 313      cycles                                                                  (25,47%)
       865 595 905      ic_fetch_stall.ic_stall_back_pressure                                        (25,51%)
       935 546 969      stalled-cycles-frontend          #    6,64% frontend cycles idle        (25,31%)
    34 775 075 664      instructions                     #    2,47  insn per cycle            
                                                  #    0,03  stalled cycles per insn     (25,24%)
     3 665 427 290      branch-instructions                                                     (25,05%)
       140 713 890      branch-misses                    #    3,84% of all branches             (25,07%)
     5 325 860 720      ic_fetch_stall.ic_stall_any                                             (25,28%)
        39 528 616      ic_fetch_stall.ic_stall_dq_empty                                        (25,38%)
        62 527 706      l2_cache_misses_from_ic_miss                                            (25,24%)
     1 842 115 928      l2_latency.l2_cycles_waiting_on_fills                                        (25,12%)
           188 277      faults                                                                
                 1      migrations                                                            

       3,129010978 seconds time elapsed

       2,821312000 seconds user
       0,302604000 seconds sys

Without cold:

ivan@ivan-5950:~/lepton_jpeg_rust$ sudo rm images/img_52MP_7k2.lep; sudo perf stat -B -e cache-references,cache-misses,cycles,ic_fetch_stall.ic_stall_back_pressure,stalled-cycles-frontend,instructions,branch-instructions,branch-misses,ic_fetch_stall.ic_stall_any,ic_fetch_stall.ic_stall_dq_empty,l2_cache_misses_from_ic_miss,l2_latency.l2_cycles_waiting_on_fills,faults,migrations taskset -c 10 nice -n -20 target/release/lepton_jpeg_util images/img_52MP_7k.jpg images/img_52MP_7k2.lep
2025-01-12T14:25:52.492Z INFO  [lepton_jpeg::structs::lepton_file_writer] compressing to Lepton format
2025-01-12T14:25:52.974Z INFO  [lepton_jpeg::structs::lepton_file_writer] Number of threads: 8
2025-01-12T14:25:54.453Z INFO  [lepton_jpeg::structs::lepton_file_writer] worker threads 10251ms of CPU time in 1478ms of wall time
2025-01-12T14:25:54.453Z INFO  [lepton_jpeg::structs::lepton_file_writer] decompressing to verify contents
2025-01-12T14:25:56.028Z INFO  [lepton_jpeg_util] compressed input 22171278, output 17324074 bytes (compression = 28.0%)
2025-01-12T14:25:56.028Z INFO  [lepton_jpeg_util] Main thread CPU: 3547ms, Worker thread CPU: 21139 ms, walltime: 3547 ms

 Performance counter stats for 'taskset -c 10 nice -n -20 target/release/lepton_jpeg_util images/img_52MP_7k.jpg images/img_52MP_7k2.lep':

       854 375 836      cache-references                                                        (25,26%)
        74 116 401      cache-misses                     #    8,67% of all cache refs           (25,55%)
    14 048 593 475      cycles                                                                  (25,55%)
       487 211 314      ic_fetch_stall.ic_stall_back_pressure                                        (25,48%)
       869 359 514      stalled-cycles-frontend          #    6,19% frontend cycles idle        (25,31%)
    35 076 159 662      instructions                     #    2,50  insn per cycle            
                                                  #    0,02  stalled cycles per insn     (25,18%)
     3 638 127 413      branch-instructions                                                     (25,14%)
       140 480 887      branch-misses                    #    3,86% of all branches             (25,13%)
     5 203 535 058      ic_fetch_stall.ic_stall_any                                             (25,11%)
        36 559 121      ic_fetch_stall.ic_stall_dq_empty                                        (24,97%)
        61 473 795      l2_cache_misses_from_ic_miss                                            (24,92%)
     1 770 575 776      l2_latency.l2_cycles_waiting_on_fills                                        (24,98%)
           188 278      faults                                                                
                 1      migrations                                                            

       3,585066548 seconds time elapsed

       3,233798000 seconds user
       0,346549000 seconds sys

fn optimistic_fill_slow(&mut self) {
// for correctness, we need to have less than 8 bits left, otherwise we can't
// consume the read_ahead_bytes and successfully undo the read_ahead if we have to.
debug_assert!(self.bits_left < 8);

// first consume the read_ahead bytes that we have now consumed
// (otherwise we wouldn't have been called)
self.inner.consume(self.read_ahead_bytes as usize);

let fb = self.inner.fill_buf()?;
if let Ok(fb) = self.inner.fill_buf() {
// if we have 8 bytes and there is no 0xff in them, then we can just read the bits directly as big endian
if fb.len() < 8 {
self.read_ahead_bytes = 0;
return;
}

// if we have 8 bytes and there is no 0xff in them, then we can just read the bits directly as big endian
let mut v;
if fb.len() < 8 || {
v = u64::from_le_bytes(fb[..8].try_into().unwrap());
has_ff(v)
} {
self.read_ahead_bytes = 0;
return self.fill_register_slow(bits_to_read);
}
let mut v = u64::from_le_bytes(fb[..8].try_into().unwrap());
if has_ff(v) {
// this is the expensive path, but rarer where there are 0xff bytes in the buffer
let mut bytes_left = 8;
self.read_ahead_bytes = 0;

v = v.to_be();
while bytes_left >= 2 {
if v & 0xff == 0xff {
if v & 0xff00 != 0 {
// reset marker or end of scan, just exit the loop and let fill_register handle it
break;
}

// only fill 63 bits not 64 to avoid having to special case
// of self.bits << 64 which is a nop
let bytes_to_read = (63 - self.bits_left) / 8;
self.bits = (self.bits << 8) | 0xff;
self.bits_left += 8;
self.read_ahead_bytes += 2;

v >>= 16;
bytes_left -= 2;
} else {
self.bits = (self.bits << 8) | (v & 0xff);
self.bits_left += 8;
self.read_ahead_bytes += 1;

self.bits = self.bits << (bytes_to_read * 8) | v >> (64 - bytes_to_read * 8);
self.bits_left += bytes_to_read * 8;
self.read_ahead_bytes = (self.bits_left - bits_to_read) / 8;
v >>= 8;
bytes_left -= 1;
}
}
} else {
// no 0xff bytes, just read the bits all at a time and reverse endian to get them in the right order.
v = v.to_be();

self.inner
.consume((bytes_to_read - self.read_ahead_bytes) as usize);
// only fill 63 bits not 64 to avoid having to special case
// of self.bits << 64 which is a nop
let bytes_to_read = (63 - self.bits_left) / 8;

return Ok(());
self.bits = self.bits << (bytes_to_read * 8) | v >> (64 - bytes_to_read * 8);
self.bits_left += bytes_to_read * 8;
self.read_ahead_bytes = self.bits_left / 8;
}
}
}

/// Fills the register up to the number of bits requested, with the assumption that these
/// will be immediately consumed.
///
/// This function ends up being called very infrequently since almost all of the time the optimistic_fill ensures
/// that there are enough bits to work with. Effectively this function ends up only being called in corner cases
/// where we are near the end of a BufRead block, at the end of the file or about to hit a reset marker.
#[cold]
fn fill_register_slow(&mut self, bits_to_read: u32) -> Result<(), std::io::Error> {
loop {
#[inline(never)]
fn fill_register_slow(&mut self, bits_to_read: u32) -> Result<()> {
self.inner.consume(self.read_ahead_bytes as usize);
self.read_ahead_bytes = 0;

while self.bits_left < bits_to_read {
let fb = self.inner.fill_buf()?;

if let &[b, ..] = fb {
self.inner.consume(1);

Expand Down Expand Up @@ -176,10 +223,6 @@ impl<R: BufRead> BitReader<R> {

// continue since we still might need to read more 0 bits
}

if self.bits_left >= bits_to_read {
break;
}
}
Ok(())
}
Expand All @@ -190,10 +233,7 @@ impl<R: BufRead> BitReader<R> {

/// used to verify whether this image is using 1s or 0s as fill bits.
/// Returns whether the fill bit was 1 or so or unknown (None)
pub fn read_and_verify_fill_bits(
&mut self,
pad_bit: &mut Option<u8>,
) -> Result<(), LeptonError> {
pub fn read_and_verify_fill_bits(&mut self, pad_bit: &mut Option<u8>) -> Result<()> {
self.undo_read_ahead();

// if there are bits left, we need to see whether they
Expand Down Expand Up @@ -222,7 +262,7 @@ impl<R: BufRead> BitReader<R> {
}
Some(x) => {
// if we already saw a padding, then it should match
let expected = u16::from(x) & all_one;
let expected = u32::from(x) & all_one;
if actual != expected {
return err_exit_code(
ExitCode::InvalidPadding,
Expand All @@ -239,7 +279,7 @@ impl<R: BufRead> BitReader<R> {
return Ok(());
}

pub fn verify_reset_code(&mut self) -> Result<(), LeptonError> {
pub fn verify_reset_code(&mut self) -> Result<()> {
// we reached the end of a MCU, so we need to find a reset code and the huffman codes start get padded out, but the spec
// doesn't specify whether the padding should be 1s or 0s, so we ensure that at least the file is consistant so that we
// can recode it again just by remembering the pad bit.
Expand Down Expand Up @@ -285,9 +325,15 @@ impl<R: BufRead> BitReader<R> {
/// the only bits that are left are part of the current byte.
pub fn undo_read_ahead(&mut self) {
while self.bits_left >= 8 && self.read_ahead_bytes > 0 {
// if it was an 0xff then rewind 2 bytes since this was an escape code
if self.bits & 0xff == 0xff {
self.read_ahead_bytes -= 2;
} else {
self.read_ahead_bytes -= 1;
}

self.bits_left -= 8;
self.bits >>= 8;
self.read_ahead_bytes -= 1;
}

if self.read_ahead_bytes > 0 {
Expand Down
10 changes: 5 additions & 5 deletions lib/src/jpeg/bit_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ mod tests {
let mut r = BitReader::new(Cursor::new(&buf));

for i in 1..2048 {
assert_eq!(i, r.read(u32_bit_length(i as u32) as u32).unwrap());
assert_eq!(i, r.read(u32_bit_length(i) as u32).unwrap());
}

let mut pad = Some(0xff);
Expand Down Expand Up @@ -231,7 +231,7 @@ mod tests {
if rng.gen_range(0..100) == 0 {
test_data.push(Action::Pad(0xff));
} else {
test_data.push(Action::Write(v as u16, bits as u8));
test_data.push(Action::Write(v, bits));
}
}
test_data.push(Action::Pad(0xff));
Expand Down Expand Up @@ -264,16 +264,16 @@ mod tests {
let (peekcode, peekbits) = r.peek();
let num_valid_bits = peekbits.min(8).min(u32::from(numbits));

let mask = (0xff00 >> num_valid_bits) as u8;
let mask = 0xff00u32 >> num_valid_bits;

assert_eq!(
expected_peek_byte & mask,
expected_peek_byte as u32 & mask,
peekcode & mask,
"peek unexpected result"
);

assert_eq!(
code,
u32::from(code),
r.read(numbits as u32).unwrap(),
"read unexpected result"
);
Expand Down
Loading
Loading