From 7f7e830cf106dbdb6390fbb53fbd5af88eec82f2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 26 Mar 2026 12:48:57 +0000 Subject: [PATCH 1/3] Fix CRAM v4.0 CRAM_FLAG_EXPLICIT_TLEN usage When we have paired reads where the detected template size doesn't match the recorded size, with CRAM 3.x we turn the reads into "detached" mode as that enables the ability to store ISIZE explicitly. However doing so removes many other benefits of deduplication, such as the read name and POS vs MATE_POS fields. With CRAM v4.0 if it's only the insert size which differs, which can be a common issue, we have an additional CRAM_FLAG_EXPLICIT_TLEN flag. Unfortunately may be set in the cram_flags later on, so we much check a local variable too. The impact of this was we build up the frequency statistics for DS_TS (template size) and then incorrectly remove the read-1 from it as we believe we're going into detached mode. We no longer remove the other read's size so the statistics are correct. This turned up in test data due to -20 / +20 becoming -20 only and hence encoding as a constant value. Bizarrely this only happens when specifying a reference while also asking it to generate consensus as a reference. The two together leave the memory copy of ref LN value as zero, which triggered incorrect cram container header data and in turn triggers the isize validity check to happen, forcing explicit-tlen storing. --- cram/cram_encode.c | 3 ++- test/test.pl | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 7ec3c003a..b946f4c02 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3927,7 +3927,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (p->cram_flags & CRAM_FLAG_STATS_ADDED) { cram_stats_del(c->stats[DS_NP], p->mate_pos); cram_stats_del(c->stats[DS_MF], p->mate_flags); - if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)) + if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) + && !explicit_tlen) cram_stats_del(c->stats[DS_TS], p->tlen); cram_stats_del(c->stats[DS_NS], p->mate_ref_id); } diff --git a/test/test.pl b/test/test.pl index 8a68a658c..eaa65ea30 100755 --- a/test/test.pl +++ b/test/test.pl @@ -894,6 +894,16 @@ sub test_view testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; testv $opts, "./compare_sam.pl $ersam $ersam2"; + # Embed_ref=2 with CRAM v4 uses explicit_len if it has to instead of + # breaking pairs with detached mode. + # Oddly this bug was only triggered when also specifying a reference. + $ersam = "xx#pair.sam"; + $ercram = "xx#pair.tmp.cram"; + $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -o version=4.0 -o embed_ref=2 -t xx.fa -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { passed($opts, "embed_ref=2 tests"); } else { From 37f55227c89247d893e82a501e86e5de64511d2f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 26 Mar 2026 12:59:17 +0000 Subject: [PATCH 2/3] Fix c->ref_end for embedded references. Firstly, this was over-sizing as it used allocated memory rather than observed maximum extents. Secondly the combination of specifying a reference while also asking it not to use that reference and to use the consensus instead left ref LN_length variable as zero. We no longer shrink c->ref_end back down in this case. --- cram/cram_encode.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index b946f4c02..41c926407 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1502,7 +1502,10 @@ int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos, // Ensure ref and hist are large enough. static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, - hts_pos_t ref_start, hts_pos_t *ref_end) { + hts_pos_t ref_start, hts_pos_t *ref_end, + hts_pos_t *max_pos) { + if (*max_pos < pos) + *max_pos = pos; if (pos < ref_start) return -1; if (pos < *ref_end) @@ -1546,7 +1549,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, // Returns 1 on success, <0 on failure static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], hts_pos_t ref_start, hts_pos_t *ref_end, - const uint8_t *MD) { + hts_pos_t *max_pos, const uint8_t *MD) { uint8_t *seq = bam_get_seq(b); uint32_t *cigar = bam_get_cigar(b); uint32_t ncigar = b->core.n_cigar; @@ -1558,7 +1561,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], // No sequence means extend based on CIGAR instead if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end, - ref_start, ref_end) < 0) + ref_start, ref_end, max_pos) < 0) return -1; int iseq = 0, next_op; @@ -1573,7 +1576,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow); if (overflow || extend_ref(ref, hist, iref+ref_start + len, - ref_start, ref_end) < 0) + ref_start, ref_end, max_pos) < 0) return -1; while (iseq < b->core.l_qseq && len) { // rewrite to have internal loops? @@ -1605,7 +1608,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], MD++; while (isalpha(*MD)) { if (extend_ref(ref, hist, iref+ref_start, ref_start, - ref_end) < 0) + ref_end, max_pos) < 0) return -1; if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, &iseq, &cig_ind, &cig_op, @@ -1621,7 +1624,8 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], } } else { // substitution - if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0) + if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end, + max_pos) < 0) return -1; if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, &iseq, &cig_ind, &cig_op, @@ -1650,12 +1654,14 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], // Returns >=0 on success, // -1 on failure (eg inconsistent data) static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], - hts_pos_t ref_start, hts_pos_t *ref_end) { + hts_pos_t ref_start, hts_pos_t *ref_end, + hts_pos_t *max_pos) { const uint8_t *MD = bam_aux_get(b, "MD"); int ret = 0; if (MD && *MD == 'Z') { // We can use MD to directly compute the reference - int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1); + int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, + max_pos, MD+1); if (ret > 0) return ret; @@ -1683,7 +1689,7 @@ static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4}; if (extend_ref(ref, hist, iref+ref_start + len, - ref_start, ref_end) < 0) + ref_start, ref_end, max_pos) < 0) return -1; if (iseq + len <= b->core.l_qseq) { // Nullify failed MD:Z if appropriate @@ -1726,7 +1732,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { // user told us to do embed_ref=2. char *ref = NULL; uint32_t (*hist)[5] = NULL; - hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0; + hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0, max_pos = 0; if (ref_start < 0) return -1; // cannot build consensus from unmapped data @@ -1734,7 +1740,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { if (extend_ref(&ref, &hist, c->bams[r1 + s->hdr->num_records-1]->core.pos + c->bams[r1 + s->hdr->num_records-1]->core.l_qseq, - ref_start, &ref_end) < 0) + ref_start, &ref_end, &max_pos) < 0) return -1; // Add each bam file to the reference/consensus arrays @@ -1746,7 +1752,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { goto err; } last_pos = c->bams[r1]->core.pos; - if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0) + if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end, + &max_pos) < 0) goto err; } @@ -1768,8 +1775,9 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { // ref file. c->ref = ref; c->ref_start = ref_start+1; - c->ref_end = ref_end+1; + c->ref_end = max_pos+1; c->ref_free = 1; + return 0; err: @@ -2012,8 +2020,10 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { pthread_mutex_unlock(&fd->ref_lock); } - if (c->ref_end > fd->refs->ref_id[c->ref_id]->LN_length) - c->ref_end = fd->refs->ref_id[c->ref_id]->LN_length; + hts_pos_t rlen = MAX(fd->refs->ref_id[c->ref_id]->LN_length, + fd->refs->ref_id[c->ref_id]->length); + if (c->ref_end > rlen && rlen) + c->ref_end = rlen; } // Iterate through records creating the cram blocks for some From 5bd8e0ad31a5ab537a20a96909caa628f239fd23 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 2 Apr 2026 09:02:26 +0100 Subject: [PATCH 3/3] Rename arguments for extend_ref(). The container ref_start and ref_end are absolute coordinates in the reference covered by the alignments. However extend_ref had ref_end as the maximum reference allocated, meaning we had to track the actual end point in another variable (max_pos). Renamed max_pos to ref_end and now track the allocation in ref_end_alloc. --- cram/cram_encode.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 41c926407..735784e89 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1503,19 +1503,19 @@ int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos, // Ensure ref and hist are large enough. static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, hts_pos_t ref_start, hts_pos_t *ref_end, - hts_pos_t *max_pos) { - if (*max_pos < pos) - *max_pos = pos; + hts_pos_t *ref_end_alloc) { + if (*ref_end < pos) + *ref_end = pos; if (pos < ref_start) return -1; - if (pos < *ref_end) + if (pos < *ref_end_alloc) return 0; // realloc if (pos - ref_start > UINT_MAX) return -2; // protect overflow in new_end calculation - hts_pos_t old_end = *ref_end ? *ref_end : ref_start; + hts_pos_t old_end = *ref_end_alloc ? *ref_end_alloc : ref_start; hts_pos_t new_end = ref_start + 1000 + (pos-ref_start)*1.5; // Refuse to work on excessively large blocks. @@ -1534,7 +1534,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, if (!tmp5) return -1; *hist = tmp5; - *ref_end = new_end; + *ref_end_alloc = new_end; // initialise old_end -= ref_start; @@ -1549,7 +1549,7 @@ static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, // Returns 1 on success, <0 on failure static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], hts_pos_t ref_start, hts_pos_t *ref_end, - hts_pos_t *max_pos, const uint8_t *MD) { + hts_pos_t *ref_end_alloc, const uint8_t *MD) { uint8_t *seq = bam_get_seq(b); uint32_t *cigar = bam_get_cigar(b); uint32_t ncigar = b->core.n_cigar; @@ -1561,7 +1561,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], // No sequence means extend based on CIGAR instead if (!b->core.l_qseq && extend_ref(ref, hist, rseq_end, - ref_start, ref_end, max_pos) < 0) + ref_start, ref_end, ref_end_alloc) < 0) return -1; int iseq = 0, next_op; @@ -1576,7 +1576,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow); if (overflow || extend_ref(ref, hist, iref+ref_start + len, - ref_start, ref_end, max_pos) < 0) + ref_start, ref_end, ref_end_alloc) < 0) return -1; while (iseq < b->core.l_qseq && len) { // rewrite to have internal loops? @@ -1608,7 +1608,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], MD++; while (isalpha(*MD)) { if (extend_ref(ref, hist, iref+ref_start, ref_start, - ref_end, max_pos) < 0) + ref_end, ref_end_alloc) < 0) return -1; if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, &iseq, &cig_ind, &cig_op, @@ -1625,7 +1625,7 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], } else { // substitution if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end, - max_pos) < 0) + ref_end_alloc) < 0) return -1; if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, &iseq, &cig_ind, &cig_op, @@ -1655,13 +1655,13 @@ static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], // -1 on failure (eg inconsistent data) static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], hts_pos_t ref_start, hts_pos_t *ref_end, - hts_pos_t *max_pos) { + hts_pos_t *ref_end_alloc) { const uint8_t *MD = bam_aux_get(b, "MD"); int ret = 0; if (MD && *MD == 'Z') { // We can use MD to directly compute the reference int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, - max_pos, MD+1); + ref_end_alloc, MD+1); if (ret > 0) return ret; @@ -1689,7 +1689,7 @@ static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4}; if (extend_ref(ref, hist, iref+ref_start + len, - ref_start, ref_end, max_pos) < 0) + ref_start, ref_end, ref_end_alloc) < 0) return -1; if (iseq + len <= b->core.l_qseq) { // Nullify failed MD:Z if appropriate @@ -1732,7 +1732,8 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { // user told us to do embed_ref=2. char *ref = NULL; uint32_t (*hist)[5] = NULL; - hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0, max_pos = 0; + hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0, + ref_end_alloc = 0; if (ref_start < 0) return -1; // cannot build consensus from unmapped data @@ -1740,7 +1741,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { if (extend_ref(&ref, &hist, c->bams[r1 + s->hdr->num_records-1]->core.pos + c->bams[r1 + s->hdr->num_records-1]->core.l_qseq, - ref_start, &ref_end, &max_pos) < 0) + ref_start, &ref_end, &ref_end_alloc) < 0) return -1; // Add each bam file to the reference/consensus arrays @@ -1753,7 +1754,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { } last_pos = c->bams[r1]->core.pos; if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end, - &max_pos) < 0) + &ref_end_alloc) < 0) goto err; } @@ -1775,7 +1776,7 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { // ref file. c->ref = ref; c->ref_start = ref_start+1; - c->ref_end = max_pos+1; + c->ref_end = ref_end+1; c->ref_free = 1; return 0;