From 88cdf69e4b83bb550ab4f6f7134892c2ad1978f4 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 26 Jan 2026 12:25:25 +0000 Subject: [PATCH 01/10] Fix byte_array_stop out-of-data checks The old version triggered one byte too late. However nuance is needed as it's permissible to completely fill the output buffer as long as the next input character is the stop byte. --- cram/cram_codecs.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index d57020c73..68590a9d5 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2021,2023, 2025 Genome Research Ltd. +Copyright (c) 2012-2021,2023, 2025, 2026 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -3568,7 +3568,7 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - char *cp, ch; + uint8_t *cp; cram_block *b = NULL; b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); @@ -3578,31 +3578,29 @@ static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, if (b->idx >= b->uncomp_size) return -1; - cp = (char *)b->data + b->idx; + ssize_t term = b->uncomp_size - b->idx; + cp = b->data + b->idx; if (out) { // memccpy equivalent but without copying the terminating byte - ssize_t term = MIN(*out_size, b->uncomp_size - b->idx); - while ((ch = *cp) != (char)c->u.byte_array_stop.stop) { - if (term-- < 0) - break; - *out++ = ch; - cp++; + if (term > *out_size) + term = *out_size; + while (--term >= 0 && *cp != c->u.byte_array_stop.stop) { + *out++ = *cp++; } - // Attempted overrun on input or output - if (ch != (char)c->u.byte_array_stop.stop) - return -1; } else { // Consume input, but produce no output - while ((ch = *cp) != (char)c->u.byte_array_stop.stop) { - if (cp - (char *)b->data >= b->uncomp_size) - return -1; + while (--term >= 0 && *cp != c->u.byte_array_stop.stop) { cp++; } } - *out_size = cp - (char *)(b->data + b->idx); - b->idx = cp - (char *)b->data + 1; + // Attempted overrun on input or output + if (cp >= b->data + b->uncomp_size || *cp != c->u.byte_array_stop.stop) + return -1; + + *out_size = cp - (b->data + b->idx); + b->idx = cp - b->data + 1; return 0; } From 0ec436796eca7b4ce7fcc9b77270c102da29bb2e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 28 Jan 2026 21:21:50 +0000 Subject: [PATCH 02/10] Improve option checking for varint and const codecs These codecs do not currently support BYTE_ARRAY type outputs. It's necessary to reject them if a file attempts to specify them in a context where array data is expected. --- cram/cram_codecs.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 68590a9d5..e6af53fab 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -776,17 +776,23 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, // does not change. switch(codec) { case E_VARINT_UNSIGNED: - c->decode = (option == E_INT) - ? cram_varint_decode_int - : cram_varint_decode_long; + if (option == E_INT || option == E_SINT) + c->decode = cram_varint_decode_int; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_varint_decode_long; + else + goto malformed; break; case E_VARINT_SIGNED: - c->decode = (option == E_INT) - ? cram_varint_decode_sint - : cram_varint_decode_slong; + if (option == E_INT || option == E_SINT) + c->decode = cram_varint_decode_sint; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_varint_decode_slong; + else + goto malformed; break; default: - return NULL; + goto malformed; } c->free = cram_varint_decode_free; @@ -798,14 +804,17 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); if (cp - data != size) { - fprintf(stderr, "Malformed varint header stream\n"); - free(c); - return NULL; + goto malformed; } c->u.varint.type = option; return c; + + malformed: + hts_log_error("Malformed varint header stream"); + free(c); + return NULL; } int cram_varint_encode_int(cram_slice *slice, cram_codec *c, @@ -978,12 +987,17 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, return NULL; c->codec = codec; - if (codec == E_CONST_BYTE) + if (codec == E_CONST_BYTE && option == E_BYTE) c->decode = cram_const_decode_byte; - else if (option == E_INT) + else if (codec == E_CONST_INT && (option == E_INT || option == E_SINT)) c->decode = cram_const_decode_int; - else + else if (codec == E_CONST_INT && (option == E_LONG || option == E_SLONG)) c->decode = cram_const_decode_long; + else { + hts_log_error("Malformed const header stream"); + free(c); + return NULL; + } c->free = cram_const_decode_free; c->size = cram_const_decode_size; c->get_block = NULL; From 9cefb46453ad471e933b8212d4f45920524d3357 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 26 Jan 2026 12:29:31 +0000 Subject: [PATCH 03/10] Add more checks for valid CRAM ref id and mate ref id * Ensure ref_id is valid when reading CRAM RI data series In multi-ref containers, the reference IDs are stored in the RI data series. These need to be checked to ensure they are in the expected range. * Add check to ensure mate ref id is valid when decoding cram --- cram/cram_decode.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index d91b9bc53..a685dad8a 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022-2025 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2026 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -2565,6 +2565,11 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, ->decode(s, c->comp_hdr->codecs[DS_RI], blk, (char *)&cr->ref_id, &out_sz); if (r) goto block_err; + if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) { + hts_log_error("Requested unknown reference ID %d", + cr->ref_id); + goto block_err; + } if ((fd->required_fields & (SAM_SEQ|SAM_TLEN)) && cr->ref_id >= 0 && cr->ref_id != last_ref_id) { @@ -2742,6 +2747,12 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, ->decode(s, c->comp_hdr->codecs[DS_NS], blk, (char *)&cr->mate_ref_id, &out_sz); if (r) goto block_err; + + if (cr->mate_ref_id < -1 || cr->mate_ref_id >= bfd->nref) { + hts_log_error("Requested unknown mate reference ID %d", + cr->mate_ref_id); + goto block_err; + } } // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown. From 4a5ef25eb1fb3d64438103316fffe423b2c3f5f4 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 27 Jan 2026 16:22:52 +0000 Subject: [PATCH 04/10] Catch cram records that start before the current reference segment This error can be caught early when reading the AP data series. Records that span beyond the end of the reference are dealt with later. --- cram/cram_decode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index a685dad8a..1f8a48c48 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2664,6 +2664,9 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->apos += s->last_apos; } s->last_apos= cr->apos; + + if (s->hdr->ref_seq_id >= 0 && cr->apos < s->hdr->ref_seq_start) + goto block_err; } else { cr->apos = c->ref_seq_start; } From 22ec5230ef95769ab009420da69568c7e530af28 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 28 Jan 2026 21:42:49 +0000 Subject: [PATCH 05/10] Fix checks for CRAM features accessing data beyond ref_end * Fix out-by-one in check for CRAM BB series data * Fix out-by-one in check for CRAM BA series data * Reject cram reads that end past slice reference extents Copies a check added in efe502d98 and applies it to the case where matching bases are added from the last feature up to the reported read length, and the ending reference position is beyond the reference length listed in the header. --- cram/cram_decode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 1f8a48c48..099c1a61d 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1501,7 +1501,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, break; } else { if (decode_md) { - if (ref_pos + x > s->ref_end) + if (ref_pos + x >= s->ref_end) goto beyond_slice; char r = s->ref[ref_pos+x-s->ref_start +1]; BLOCK_APPEND_CHAR(s->aux_blk, r); @@ -1572,7 +1572,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, md_dist = -1; } else { if (decode_md) { - if (ref_pos > s->ref_end) + if (ref_pos >= s->ref_end) goto beyond_slice; BLOCK_APPEND_CHAR(s->aux_blk, s->ref[ref_pos-s->ref_start +1]); @@ -1686,6 +1686,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // May miss MD/NM cases where both seq/ref are N, but this is a // malformed cram file anyway. if (rlen > 0) { + if (ref_pos + rlen > s->ref_end) + goto beyond_slice; + if (seq_pos-1 + rlen < cr->len) memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], rlen); From 8bcc9907be0f945ddc31796d64f078fa05456acd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 28 Jan 2026 16:39:13 +0000 Subject: [PATCH 06/10] Improve check for cram features beyond the end of the sequence The check needs to deal with: * Records where no seq / qual was stored (which have cr->len == 0) * Features which write to seq / qual, which cannot be placed after the end of the read * Features like deletions and ref skips that do not write to seq or qual, and may be placed immediately after the end of the read --- cram/cram_decode.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 099c1a61d..51b3d9e68 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1185,15 +1185,28 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (r) return r; pos += prev_pos; + // Misplaced feature detection - before start is easy if (pos <= 0) { hts_log_error("Feature position %d before start of read", pos); return -1; } - if (pos > seq_pos) { - if (pos > cr->len+1) + // After end is more complicated as the sequence may be absent, + // and operations like deletions could occur after the end + // of the stored sequence. First quickly find out if the feature is + // on or after the last base. + if (cr->len != 0 && pos > cr->len) { + // Now check carefully to ensure it's allowed. + int32_t valid_end = (op == 'N' || op == 'P' || op == 'H' || op == 'D') + ? cr->len+1 + : cr->len; + if (pos > valid_end) { + hts_log_error("Feature position %d after end of read", pos); return -1; + } + } + if (pos > seq_pos) { if (s->ref && cr->ref_id >= 0) { if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) { static int whinged = 0; From 2a45eb129d703ad27f9fabc8169f0e94d3c69fa3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 2 Feb 2026 09:41:06 +0000 Subject: [PATCH 07/10] Reject cram files with negative-length D, H, P or N features --- cram/cram_decode.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 51b3d9e68..726e3815b 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1403,6 +1403,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; if (decode_md || decode_nm) { if (ref_pos + i32 > s->ref_end) goto beyond_slice; @@ -1638,6 +1640,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; cig_op = BAM_CHARD_CLIP; cig_len += i32; } @@ -1654,6 +1658,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; cig_op = BAM_CPAD; cig_len += i32; } @@ -1670,6 +1676,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; cig_op = BAM_CREF_SKIP; cig_len += i32; ref_pos += i32; From d799b54c6401879187bba4741be83ff590ac73e3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 27 Jan 2026 12:05:47 +0000 Subject: [PATCH 08/10] Improve handling of records where sequence was not present In SAM, these have sequence "*". In CRAM, this is indicated by setting cram_record::len to 0, although strangely some dummy sequence data needs to be stored in the file for these records so that feature lengths can be returned (e.g. for CIGAR matches). In such cases, the data read needs to be discarded, and nothing should be written to the seq or qual strings. Quality data probably shouldn't be present either for these, but if it is, it also needs to be discarded. This commit fixes some cases where this was not being handled correctly. * The code for copying over unchanged parts of the reference needs to skip the actual copy when seq is not stored. * The soft clip handler has fall-backs for when the IN or SC data stream is missing. These need to be prevented from writing to seq if it isn't present. * When processing the QS data stream, suppress storing the quality values when cr->len == 0. * If CRAM_FLAG_PRESERVE_QUAL_SCORES is false, don't try to add default quality values if cr->len == 0, and also don't try to check that the first quality value is 255 (which is used for the case where seq was present but the quality values were not). --- cram/cram_decode.c | 69 +++++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 28 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 726e3815b..cdc67ce55 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1221,13 +1221,16 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (ref_pos + rlen > s->ref_end) goto beyond_slice; - memcpy(&seq[seq_pos-1], - &s->ref[ref_pos - s->ref_start +1], rlen); - if ((pos - seq_pos) - rlen > 0) - memset(&seq[seq_pos-1+rlen], 'N', - (pos - seq_pos) - rlen); + if (cr->len) { + memcpy(&seq[seq_pos-1], + &s->ref[ref_pos - s->ref_start +1], rlen); + if ((pos - seq_pos) - rlen > 0) + memset(&seq[seq_pos-1+rlen], 'N', + (pos - seq_pos) - rlen); + } } else { - memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1); + if (cr->len) + memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1); } if (md_dist >= 0) md_dist += pos - seq_pos; @@ -1238,7 +1241,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, const char *refp = s->ref + ref_pos - s->ref_start + 1; const int frag_len = pos - seq_pos; - int do_cpy = 1; if (decode_md || decode_nm) { char *N = memchr(refp, 'N', frag_len); if (N) { @@ -1253,14 +1255,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } else { md_dist++; } - seq[seq_pos-1+i] = base; } - do_cpy = 0; } else { md_dist += frag_len; } } - if (do_cpy) + if (cr->len) memcpy(&seq[seq_pos-1], refp, frag_len); } } @@ -1299,24 +1299,33 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, switch (CRAM_MAJOR_VERS(fd->version)) { case 1: if (ds & CRAM_IN) { - r |= codecs[DS_IN] - ? codecs[DS_IN]->decode(s, codecs[DS_IN], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) - : (seq[pos-1] = 'N', out_sz2 = 1, 0); + if (codecs[DS_IN]) { + r |= codecs[DS_IN]->decode(s, codecs[DS_IN], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2); + } else { + if (cr->len) + seq[pos-1] = 'N'; + out_sz2 = 1; + } have_sc = 1; } break; case 2: default: if (ds & CRAM_SC) { - r |= codecs[DS_SC] - ? codecs[DS_SC]->decode(s, codecs[DS_SC], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) - : (seq[pos-1] = 'N', out_sz2 = 1, 0); + + if (codecs[DS_SC]) { + r |= codecs[DS_SC]->decode(s, codecs[DS_SC], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2); + } else { + if (cr->len) + seq[pos-1] = 'N'; + out_sz2 = 1; + } have_sc = 1; } break; @@ -1549,10 +1558,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_QQ) { if (!codecs[DS_QQ]) return -1; if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) + && cr->len > 0 && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk, - (char *)&qual[pos-1], &len); + cr->len ? (char *)&qual[pos-1] : NULL, + &len); if (r) return r; } @@ -1599,11 +1610,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_QS) { if (!codecs[DS_QS]) return -1; - if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) + if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && cr->len > 0 && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + cr->len ? (char *)&qual[pos-1] : NULL, + &out_sz); } #ifdef USE_X cig_op = BAM_CBASE_MISMATCH; @@ -1620,11 +1632,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'Q': { // Quality score; QS if (ds & CRAM_QS) { if (!codecs[DS_QS]) return -1; - if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && - (unsigned char)*qual == 255) + if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && cr->len > 0 + && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + cr->len ? (char *)&qual[pos-1] : NULL, + &out_sz); //printf(" %d: QS = %d (ret %d)\n", f, qc, r); } break; From e64e68da567d2309509d059ace016d5d7fc7514f Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 29 Jan 2026 17:47:01 +0000 Subject: [PATCH 09/10] Fix codecs that did not correctly handle out == NULL Setting out to NULL indicates that the data should be discarded. This is mainly used in the case of records where the sequence or quality values were not present, but some dummy data is needed in order to reconstruct the CIGAR and MD records. * Make const and xrle char decoders work with out == NULL * Add missed case for NULL out passed to xpack char decoder --- cram/cram_codecs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index e6af53fab..f4ef18556 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -933,6 +933,9 @@ int cram_const_decode_byte(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int i, n; + if (!out) + return 0; + for (i = 0, n = *out_size; i < n; i++) out[i] = c->u.xconst.val; @@ -1418,7 +1421,7 @@ int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, cha if (out) memcpy(out, b->data + b->byte, *out_size); b->byte += *out_size; - } else { + } else if (out) { memset(out, c->u.xpack.rmap[0], *out_size); } @@ -2125,7 +2128,8 @@ int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char cram_xrle_decode_expand_char(slice, c); cram_block *b = slice->block_by_id[512 + c->codec_id]; - memcpy(out, b->data + b->idx, n); + if (out) + memcpy(out, b->data + b->idx, n); b->idx += n; return 0; From 654a04a6843e359f224d99762ba919cb5348a981 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Feb 2026 15:18:09 +0000 Subject: [PATCH 10/10] Reject CRAM blocks with an unknown compression method. --- cram/cram_io.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cram/cram_io.c b/cram/cram_io.c index 042ee6bc0..4e074a6a4 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1421,6 +1421,10 @@ cram_block *cram_read_block(cram_fd *fd) { //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } + if (b->method > TOK3) { + hts_log_error("Unknown block compression method %d", (int) b->method); + free(b); return NULL; + } c = b->method; crc = crc32(crc, &c, 1); if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } c = b->content_type; crc = crc32(crc, &c, 1);