diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index d57020c73..f4ef18556 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2021,2023, 2025 Genome Research Ltd. +Copyright (c) 2012-2021,2023, 2025, 2026 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -776,17 +776,23 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, // does not change. switch(codec) { case E_VARINT_UNSIGNED: - c->decode = (option == E_INT) - ? cram_varint_decode_int - : cram_varint_decode_long; + if (option == E_INT || option == E_SINT) + c->decode = cram_varint_decode_int; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_varint_decode_long; + else + goto malformed; break; case E_VARINT_SIGNED: - c->decode = (option == E_INT) - ? cram_varint_decode_sint - : cram_varint_decode_slong; + if (option == E_INT || option == E_SINT) + c->decode = cram_varint_decode_sint; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_varint_decode_slong; + else + goto malformed; break; default: - return NULL; + goto malformed; } c->free = cram_varint_decode_free; @@ -798,14 +804,17 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); if (cp - data != size) { - fprintf(stderr, "Malformed varint header stream\n"); - free(c); - return NULL; + goto malformed; } c->u.varint.type = option; return c; + + malformed: + hts_log_error("Malformed varint header stream"); + free(c); + return NULL; } int cram_varint_encode_int(cram_slice *slice, cram_codec *c, @@ -924,6 +933,9 @@ int cram_const_decode_byte(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { int i, n; + if (!out) + return 0; + for (i = 0, n = *out_size; i < n; i++) out[i] = c->u.xconst.val; @@ -978,12 +990,17 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, return NULL; c->codec = codec; - if (codec == E_CONST_BYTE) + if (codec == E_CONST_BYTE && option == E_BYTE) c->decode = cram_const_decode_byte; - else if (option == E_INT) + else if (codec == E_CONST_INT && (option == E_INT || option == E_SINT)) c->decode = cram_const_decode_int; - else + else if (codec == E_CONST_INT && (option == E_LONG || option == E_SLONG)) c->decode = cram_const_decode_long; + else { + hts_log_error("Malformed const header stream"); + free(c); + return NULL; + } c->free = cram_const_decode_free; c->size = cram_const_decode_size; c->get_block = NULL; @@ -1404,7 +1421,7 @@ int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, cha if (out) memcpy(out, b->data + b->byte, *out_size); b->byte += *out_size; - } else { + } else if (out) { memset(out, c->u.xpack.rmap[0], *out_size); } @@ -2111,7 +2128,8 @@ int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char cram_xrle_decode_expand_char(slice, c); cram_block *b = slice->block_by_id[512 + c->codec_id]; - memcpy(out, b->data + b->idx, n); + if (out) + memcpy(out, b->data + b->idx, n); b->idx += n; return 0; @@ -3568,7 +3586,7 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - char *cp, ch; + uint8_t *cp; cram_block *b = NULL; b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); @@ -3578,31 +3596,29 @@ static int cram_byte_array_stop_decode_char(cram_slice *slice, cram_codec *c, if (b->idx >= b->uncomp_size) return -1; - cp = (char *)b->data + b->idx; + ssize_t term = b->uncomp_size - b->idx; + cp = b->data + b->idx; if (out) { // memccpy equivalent but without copying the terminating byte - ssize_t term = MIN(*out_size, b->uncomp_size - b->idx); - while ((ch = *cp) != (char)c->u.byte_array_stop.stop) { - if (term-- < 0) - break; - *out++ = ch; - cp++; + if (term > *out_size) + term = *out_size; + while (--term >= 0 && *cp != c->u.byte_array_stop.stop) { + *out++ = *cp++; } - // Attempted overrun on input or output - if (ch != (char)c->u.byte_array_stop.stop) - return -1; } else { // Consume input, but produce no output - while ((ch = *cp) != (char)c->u.byte_array_stop.stop) { - if (cp - (char *)b->data >= b->uncomp_size) - return -1; + while (--term >= 0 && *cp != c->u.byte_array_stop.stop) { cp++; } } - *out_size = cp - (char *)(b->data + b->idx); - b->idx = cp - (char *)b->data + 1; + // Attempted overrun on input or output + if (cp >= b->data + b->uncomp_size || *cp != c->u.byte_array_stop.stop) + return -1; + + *out_size = cp - (b->data + b->idx); + b->idx = cp - b->data + 1; return 0; } diff --git a/cram/cram_decode.c b/cram/cram_decode.c index d91b9bc53..cdc67ce55 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022-2025 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2026 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -1185,15 +1185,28 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (r) return r; pos += prev_pos; + // Misplaced feature detection - before start is easy if (pos <= 0) { hts_log_error("Feature position %d before start of read", pos); return -1; } - if (pos > seq_pos) { - if (pos > cr->len+1) + // After end is more complicated as the sequence may be absent, + // and operations like deletions could occur after the end + // of the stored sequence. First quickly find out if the feature is + // on or after the last base. + if (cr->len != 0 && pos > cr->len) { + // Now check carefully to ensure it's allowed. + int32_t valid_end = (op == 'N' || op == 'P' || op == 'H' || op == 'D') + ? cr->len+1 + : cr->len; + if (pos > valid_end) { + hts_log_error("Feature position %d after end of read", pos); return -1; + } + } + if (pos > seq_pos) { if (s->ref && cr->ref_id >= 0) { if (ref_pos + pos - seq_pos > bfd->ref[cr->ref_id].len) { static int whinged = 0; @@ -1208,13 +1221,16 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (ref_pos + rlen > s->ref_end) goto beyond_slice; - memcpy(&seq[seq_pos-1], - &s->ref[ref_pos - s->ref_start +1], rlen); - if ((pos - seq_pos) - rlen > 0) - memset(&seq[seq_pos-1+rlen], 'N', - (pos - seq_pos) - rlen); + if (cr->len) { + memcpy(&seq[seq_pos-1], + &s->ref[ref_pos - s->ref_start +1], rlen); + if ((pos - seq_pos) - rlen > 0) + memset(&seq[seq_pos-1+rlen], 'N', + (pos - seq_pos) - rlen); + } } else { - memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1); + if (cr->len) + memset(&seq[seq_pos-1], 'N', cr->len - seq_pos + 1); } if (md_dist >= 0) md_dist += pos - seq_pos; @@ -1225,7 +1241,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, const char *refp = s->ref + ref_pos - s->ref_start + 1; const int frag_len = pos - seq_pos; - int do_cpy = 1; if (decode_md || decode_nm) { char *N = memchr(refp, 'N', frag_len); if (N) { @@ -1240,14 +1255,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } else { md_dist++; } - seq[seq_pos-1+i] = base; } - do_cpy = 0; } else { md_dist += frag_len; } } - if (do_cpy) + if (cr->len) memcpy(&seq[seq_pos-1], refp, frag_len); } } @@ -1286,24 +1299,33 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, switch (CRAM_MAJOR_VERS(fd->version)) { case 1: if (ds & CRAM_IN) { - r |= codecs[DS_IN] - ? codecs[DS_IN]->decode(s, codecs[DS_IN], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) - : (seq[pos-1] = 'N', out_sz2 = 1, 0); + if (codecs[DS_IN]) { + r |= codecs[DS_IN]->decode(s, codecs[DS_IN], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2); + } else { + if (cr->len) + seq[pos-1] = 'N'; + out_sz2 = 1; + } have_sc = 1; } break; case 2: default: if (ds & CRAM_SC) { - r |= codecs[DS_SC] - ? codecs[DS_SC]->decode(s, codecs[DS_SC], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) - : (seq[pos-1] = 'N', out_sz2 = 1, 0); + + if (codecs[DS_SC]) { + r |= codecs[DS_SC]->decode(s, codecs[DS_SC], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2); + } else { + if (cr->len) + seq[pos-1] = 'N'; + out_sz2 = 1; + } have_sc = 1; } break; @@ -1390,6 +1412,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; if (decode_md || decode_nm) { if (ref_pos + i32 > s->ref_end) goto beyond_slice; @@ -1501,7 +1525,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, break; } else { if (decode_md) { - if (ref_pos + x > s->ref_end) + if (ref_pos + x >= s->ref_end) goto beyond_slice; char r = s->ref[ref_pos+x-s->ref_start +1]; BLOCK_APPEND_CHAR(s->aux_blk, r); @@ -1534,10 +1558,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_QQ) { if (!codecs[DS_QQ]) return -1; if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) + && cr->len > 0 && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk, - (char *)&qual[pos-1], &len); + cr->len ? (char *)&qual[pos-1] : NULL, + &len); if (r) return r; } @@ -1572,7 +1598,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, md_dist = -1; } else { if (decode_md) { - if (ref_pos > s->ref_end) + if (ref_pos >= s->ref_end) goto beyond_slice; BLOCK_APPEND_CHAR(s->aux_blk, s->ref[ref_pos-s->ref_start +1]); @@ -1584,11 +1610,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_QS) { if (!codecs[DS_QS]) return -1; - if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) + if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && cr->len > 0 && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + cr->len ? (char *)&qual[pos-1] : NULL, + &out_sz); } #ifdef USE_X cig_op = BAM_CBASE_MISMATCH; @@ -1605,11 +1632,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'Q': { // Quality score; QS if (ds & CRAM_QS) { if (!codecs[DS_QS]) return -1; - if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && - (unsigned char)*qual == 255) + if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && cr->len > 0 + && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + cr->len ? (char *)&qual[pos-1] : NULL, + &out_sz); //printf(" %d: QS = %d (ret %d)\n", f, qc, r); } break; @@ -1625,6 +1653,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; cig_op = BAM_CHARD_CLIP; cig_len += i32; } @@ -1641,6 +1671,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; cig_op = BAM_CPAD; cig_len += i32; } @@ -1657,6 +1689,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk, (char *)&i32, &out_sz); if (r) return r; + if (i32 < 0) + goto beyond_slice; cig_op = BAM_CREF_SKIP; cig_len += i32; ref_pos += i32; @@ -1686,6 +1720,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // May miss MD/NM cases where both seq/ref are N, but this is a // malformed cram file anyway. if (rlen > 0) { + if (ref_pos + rlen > s->ref_end) + goto beyond_slice; + if (seq_pos-1 + rlen < cr->len) memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], rlen); @@ -2565,6 +2602,11 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, ->decode(s, c->comp_hdr->codecs[DS_RI], blk, (char *)&cr->ref_id, &out_sz); if (r) goto block_err; + if (cr->ref_id < -1 || cr->ref_id >= bfd->nref) { + hts_log_error("Requested unknown reference ID %d", + cr->ref_id); + goto block_err; + } if ((fd->required_fields & (SAM_SEQ|SAM_TLEN)) && cr->ref_id >= 0 && cr->ref_id != last_ref_id) { @@ -2659,6 +2701,9 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->apos += s->last_apos; } s->last_apos= cr->apos; + + if (s->hdr->ref_seq_id >= 0 && cr->apos < s->hdr->ref_seq_start) + goto block_err; } else { cr->apos = c->ref_seq_start; } @@ -2742,6 +2787,12 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, ->decode(s, c->comp_hdr->codecs[DS_NS], blk, (char *)&cr->mate_ref_id, &out_sz); if (r) goto block_err; + + if (cr->mate_ref_id < -1 || cr->mate_ref_id >= bfd->nref) { + hts_log_error("Requested unknown mate reference ID %d", + cr->mate_ref_id); + goto block_err; + } } // Skip as mate_ref of "*" is legit. It doesn't mean unmapped, just unknown. diff --git a/cram/cram_io.c b/cram/cram_io.c index 042ee6bc0..4e074a6a4 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1421,6 +1421,10 @@ cram_block *cram_read_block(cram_fd *fd) { //fprintf(stderr, "Block at %d\n", (int)ftell(fd->fp)); if (-1 == (b->method = hgetc(fd->fp))) { free(b); return NULL; } + if (b->method > TOK3) { + hts_log_error("Unknown block compression method %d", (int) b->method); + free(b); return NULL; + } c = b->method; crc = crc32(crc, &c, 1); if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } c = b->content_type; crc = crc32(crc, &c, 1);