Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
112 commits
Select commit Hold shift + click to select a range
bd57e00
set up project structure
jonperdomo Mar 4, 2025
b58db8f
work on train model
jonperdomo Mar 14, 2025
cb9bac1
work on annotations
jonperdomo Mar 16, 2025
05c0991
add fragile sites
jonperdomo Mar 17, 2025
f8619f6
work on annotations
jonperdomo Mar 17, 2025
3c131d5
update annotations
jonperdomo Mar 17, 2025
e4ce34f
cytoband annotations
jonperdomo Mar 28, 2025
bcc296f
update annotations
jonperdomo Mar 28, 2025
f9a0776
work on features
jonperdomo Mar 29, 2025
b1f5cec
work on features
jonperdomo Mar 29, 2025
309a140
update df
jonperdomo Apr 1, 2025
4ee6b81
fix annotations
jonperdomo Apr 1, 2025
4bb1f54
remove test code
jonperdomo Apr 1, 2025
e4e4728
model training
jonperdomo Apr 2, 2025
c3169da
test multiple models
jonperdomo Apr 3, 2025
8744fe0
caller model update
jonperdomo Apr 3, 2025
c08b339
cross validation
jonperdomo Apr 3, 2025
98a1777
update training model
jonperdomo Apr 4, 2025
5279105
add copy number state and read alignment offset features
jonperdomo Apr 15, 2025
1fbb273
create extract features module
jonperdomo Apr 15, 2025
017b920
implement predictions
jonperdomo Apr 15, 2025
80d8362
feature corr analysis
jonperdomo Apr 15, 2025
6078b96
add id column
jonperdomo Apr 15, 2025
a46794b
add hg002 hg19 to training
jonperdomo Apr 16, 2025
4ddde04
key fix
jonperdomo Apr 16, 2025
430d7f9
add hg19 filtering
jonperdomo Apr 17, 2025
d0c86ec
update plot
jonperdomo Apr 28, 2025
32eccd6
normalize features and add annovar annotations
jonperdomo May 1, 2025
7419bfc
fix segdup scores
jonperdomo May 1, 2025
990dfad
remove test code
jonperdomo May 1, 2025
23f98e6
remove unused code
jonperdomo May 1, 2025
27e6def
normalize coverage based features
jonperdomo May 4, 2025
ca4c9e9
fix scaling
jonperdomo May 5, 2025
cd4f099
improve large sv scores
jonperdomo Jul 10, 2025
ae4c11c
feature engineering
jonperdomo Jul 12, 2025
d9715e3
feature updates
jonperdomo Jul 13, 2025
257ed57
update features
jonperdomo Jul 14, 2025
1b0c25a
scale read depth and cluster size
jonperdomo Jul 14, 2025
01b327d
add threshold parameter and update cross validation plot
jonperdomo Sep 3, 2025
be6cf9b
work on leave out model training
jonperdomo Feb 15, 2026
94a00d6
parameter optimization for precision
jonperdomo Feb 15, 2026
58673c5
full model train
jonperdomo Feb 18, 2026
a057fcc
update prediction
jonperdomo Feb 18, 2026
1004ea6
comment previous code
jonperdomo Feb 23, 2026
fe3e27c
remove interaction terms
jonperdomo Feb 23, 2026
65b740d
shap analysis
jonperdomo Feb 27, 2026
618b3b2
update features
jonperdomo Feb 28, 2026
36a329f
work on feature importance
jonperdomo Feb 28, 2026
3dec63d
fix shap plot
jonperdomo Feb 28, 2026
6281306
update predict
jonperdomo Feb 28, 2026
fd2d2dd
normalize read depth with mean coverage
jonperdomo Mar 2, 2026
a84be5f
less than 10kb model
jonperdomo Mar 2, 2026
ffcc591
add filepath config file and prediction threshold sweep
jonperdomo Mar 5, 2026
b25ae22
optimal thresholds by sv type
jonperdomo Mar 8, 2026
884c89b
add sv length cutoff parameter
jonperdomo Mar 10, 2026
0378958
conda project restructure
jonperdomo Apr 12, 2026
e141707
work on unit tests
jonperdomo Apr 12, 2026
9d65823
add test data
jonperdomo Apr 12, 2026
57f1b18
handle gzip
jonperdomo Apr 12, 2026
30bed9b
add tests
jonperdomo Apr 12, 2026
618db2f
add unit tests workflow
jonperdomo Apr 12, 2026
c868f2c
Add unit tests badge to README
jonperdomo Apr 12, 2026
2996178
update test name
jonperdomo Apr 12, 2026
70ee53d
update gh action
jonperdomo Apr 12, 2026
9d6687f
update environment
jonperdomo Apr 12, 2026
3cc62cf
try mamba
jonperdomo Apr 12, 2026
e9d8134
mamba test
jonperdomo Apr 12, 2026
8cbc50b
test mamba
jonperdomo Apr 12, 2026
0779f97
wokr on conda package
jonperdomo Apr 13, 2026
64f80e0
improve installation
jonperdomo Apr 13, 2026
e599dd5
fix chr format mismatch
jonperdomo Apr 13, 2026
f66bfb4
fix warnings
jonperdomo Apr 16, 2026
b4fc5d9
restore plot
jonperdomo Apr 16, 2026
760f64c
save evaluation plots
jonperdomo Apr 20, 2026
ba4cba5
update readme
jonperdomo Apr 20, 2026
ca19b4a
update gitignore
jonperdomo May 10, 2026
fb9875c
update bed filepaths
jonperdomo May 10, 2026
fe46b3f
add pytest
jonperdomo May 10, 2026
7b10db4
Update unit tests badge in README.md
jonperdomo May 10, 2026
cb3aab3
update README
jonperdomo May 11, 2026
acdb5b6
Revise README content and improve clarity
jonperdomo May 11, 2026
dff54d0
clean up comments and logs
jonperdomo May 11, 2026
8086f93
Potential fix for pull request finding
jonperdomo May 11, 2026
8f20a2e
Potential fix for pull request finding
jonperdomo May 11, 2026
4fc28c9
Potential fix for pull request finding
jonperdomo May 11, 2026
4ad957f
Potential fix for pull request finding
jonperdomo May 11, 2026
df81ab2
Fix score intermediate BED path and cleanup behavior
Copilot May 11, 2026
ac82bf5
Lazy-load optional training dependencies in train_full_model
Copilot May 11, 2026
118f4d9
Potential fix for pull request finding
jonperdomo May 11, 2026
0f34095
Potential fix for pull request finding
jonperdomo May 11, 2026
9be53dc
Use safe subprocess invocation for ANNOVAR DB download
Copilot May 11, 2026
df9503c
Change branch for workflow trigger from 'initial-commit' to 'main'
jonperdomo May 11, 2026
874318c
update setup.py
jonperdomo May 11, 2026
8c0d491
updates
jonperdomo May 18, 2026
a349429
bin sizes
jonperdomo May 18, 2026
b854bd0
update
jonperdomo May 18, 2026
81f39eb
test
jonperdomo May 18, 2026
f112520
update stratification
jonperdomo May 18, 2026
9d624a1
stratify size bin
jonperdomo May 18, 2026
2e1b9ac
update logging
jonperdomo May 18, 2026
be48fac
error fix
jonperdomo May 18, 2026
dbd69b5
update
jonperdomo May 25, 2026
c8ab8d6
gmm
jonperdomo May 25, 2026
a2262a5
update
jonperdomo May 25, 2026
30d4e8c
update
jonperdomo May 25, 2026
d4ef302
update
jonperdomo May 25, 2026
26d5cae
gmm
jonperdomo May 25, 2026
71d92b0
update
jonperdomo May 25, 2026
60b313f
updates
jonperdomo May 25, 2026
b7f6162
update
jonperdomo Jun 1, 2026
cdb2f89
update contextscore model version
jonperdomo Jun 1, 2026
dcb47d0
Merge branch 'main' into revision
jonperdomo Jun 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,6 @@
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
"python.testing.pytestEnabled": true,
"python.terminal.activateEnvironment": false
}
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ These are the required ANNOVAR components for ContextScore:

## User Workflow
```bash
contextscore --input input.vcf --output scored.vcf --sample-coverage 30 --buildver {hg38,hg19} --threshold 0.2 \
contextscore --input input.vcf --output scored.vcf --sample-coverage 30 --buildver {hg38,hg19} \
--annovar /path/to/annovar --annovar-db /path/to/humandb
```

Expand Down
2 changes: 1 addition & 1 deletion conda/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ requirements:
- scikit-learn =1.6.1 # For consistency with model training environment
- joblib
- bedtools
- contextscore-models
- contextscore-models ==1.0.0

about:
home: https://github.com/WGLab/ContextScore
Expand Down
131 changes: 77 additions & 54 deletions contextscore/extract_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ def normalize_chrom_label(chrom):
return chrom_str.upper()

def extract_features(input_bed, annovar_path, db_path, outdiranno, buildversion='hg38', sample_coverage=None):
# ...existing code...
"""Extract the features from the BED file, columns are in the first row:
chrom, start, end, sv_type, sv_length, genotype, read_depth, hmm_llh, aln_type, cluster_size

Expand Down Expand Up @@ -170,6 +171,12 @@ def extract_features(input_bed, annovar_path, db_path, outdiranno, buildversion=

# Normalize SV length to a positive magnitude.
bed_df['sv_length'] = bed_df['sv_length'].abs()
# Add SV length interval features (4 non-type-specific intervals, one-hot encoded)
svlen = bed_df['sv_length']
bed_df['svlen_50_500'] = ((svlen >= 50) & (svlen < 500)).astype(int)
bed_df['svlen_500_5000'] = ((svlen >= 500) & (svlen < 5000)).astype(int)
bed_df['svlen_5000_50000'] = ((svlen >= 5000) & (svlen < 50000)).astype(int)
bed_df['svlen_50000_plus'] = (svlen >= 50000).astype(int)

# Drop the genotype column and cn_state columns (due to redundancy).
bed_df.drop(columns=['genotype', 'cn_state'], inplace=True)
Expand Down Expand Up @@ -240,62 +247,78 @@ def extract_features(input_bed, annovar_path, db_path, outdiranno, buildversion=
# Drop the cn_state column from the data.
bed_df = bed_df.drop(columns=['cn_state'], errors='ignore')

# Add distance to nearest other SV call, clustered false positives often appear near real SVs.
logging.info('Computing distance to nearest other SV call (same chromosome)...')
bed_df['dist_to_nearest_sv'] = np.nan
for chrom, idx in bed_df.groupby('chrom', sort=False).groups.items():
chrom_df = bed_df.loc[idx, ['start', 'end']].sort_values(['start', 'end'])
n = chrom_df.shape[0]

if n <= 1:
# Compute robust local-neighborhood SV features by chromosome + SV type.
# Nearest distance excludes overlaps and is stabilized with log and length-relative transforms.
logging.info('Computing robust nearest-SV and local SV-density features (same SV type)...')
bed_df['dist_nearest_nonoverlap_same_type'] = np.nan
bed_df['dist_nearest_nonoverlap_same_type_log1p'] = np.nan
bed_df['dist_nearest_nonoverlap_same_type_rel_log1p'] = np.nan
bed_df['local_same_type_count_1kb'] = 0
bed_df['local_same_type_count_10kb'] = 0
bed_df['local_same_type_count_100kb'] = 0

for (_, _), idx in bed_df.groupby(['chrom', 'sv_type_str'], sort=False).groups.items():
idx_list = list(idx)
group_df = bed_df.loc[idx_list, ['start', 'end', 'sv_length']].copy()
n = group_df.shape[0]

if n == 0:
continue

starts = chrom_df['start'].to_numpy(dtype=np.int64)
ends = chrom_df['end'].to_numpy(dtype=np.int64)

# Previous interval summary.
prev_max_end = np.maximum.accumulate(ends)
prev_max_end_excl = np.empty(n, dtype=np.int64)
prev_max_end_excl[0] = np.iinfo(np.int64).min
prev_max_end_excl[1:] = prev_max_end[:-1]

# Next interval summary.
next_start_excl = np.empty(n, dtype=np.int64)
next_start_excl[:-1] = starts[1:]
next_start_excl[-1] = np.iinfo(np.int64).max

# Overlap checks with prior/next intervals.
overlap_prev = prev_max_end_excl > starts
overlap_next = ends > next_start_excl
overlap_any = overlap_prev | overlap_next

# Gap to closest left/right neighbor (touching intervals yield 0).
left_gap = starts - prev_max_end_excl
right_gap = next_start_excl - ends

# No-left/no-right sentinels.
left_gap[0] = np.iinfo(np.int64).max
right_gap[-1] = np.iinfo(np.int64).max

nearest = np.minimum(left_gap, right_gap).astype(np.float64)
nearest[overlap_any] = 0.0

# Any remaining sentinel values are undefined (should only happen in degenerate cases).
sentinel = float(np.iinfo(np.int64).max)
nearest[nearest >= sentinel] = np.nan

bed_df.loc[chrom_df.index, 'dist_to_nearest_sv'] = nearest

logging.info('Distance to nearest SV calculated. Coverage: %.1f%%', (bed_df['dist_to_nearest_sv'].notna().sum() / len(bed_df) * 100))

# Print statistics about the distance to nearest SV feature.
logging.info('Distance to nearest SV - mean: %.2f, median: %.2f, std: %.2f', bed_df['dist_to_nearest_sv'].mean(), bed_df['dist_to_nearest_sv'].median(), bed_df['dist_to_nearest_sv'].std())

# Normalize by SV size
bed_df['dist_nearest_sv_per_kb'] = np.where(
bed_df['sv_length'] > 0,
bed_df['dist_to_nearest_sv'] / (bed_df['sv_length'] / 1000.0),
bed_df['dist_to_nearest_sv']
starts = pd.to_numeric(group_df['start'], errors='coerce').to_numpy(dtype=np.int64)
ends = pd.to_numeric(group_df['end'], errors='coerce').to_numpy(dtype=np.int64)
lengths = pd.to_numeric(group_df['sv_length'], errors='coerce').to_numpy(dtype=np.float64)

# Ensure interval ordering is valid even for malformed coordinates.
left = np.minimum(starts, ends)
right = np.maximum(starts, ends)

nearest = np.full(n, np.nan, dtype=np.float64)
if n > 1:
sorted_ends = np.sort(right)
sorted_starts = np.sort(left)

left_pos = np.searchsorted(sorted_ends, left, side='right') - 1
right_pos = np.searchsorted(sorted_starts, right, side='left')

left_gap = np.full(n, np.inf, dtype=np.float64)
valid_left = left_pos >= 0
left_gap[valid_left] = (left[valid_left] - sorted_ends[left_pos[valid_left]]).astype(np.float64)

right_gap = np.full(n, np.inf, dtype=np.float64)
valid_right = right_pos < n
right_gap[valid_right] = (sorted_starts[right_pos[valid_right]] - right[valid_right]).astype(np.float64)

nearest = np.minimum(left_gap, right_gap)
nearest[np.isinf(nearest)] = np.nan

# Length-relative normalization keeps comparability across 100bp to 100kb+ SVs.
length_scale = np.maximum(np.abs(lengths), 100.0)
nearest_rel = nearest / length_scale

bed_df.loc[idx_list, 'dist_nearest_nonoverlap_same_type'] = nearest
bed_df.loc[idx_list, 'dist_nearest_nonoverlap_same_type_log1p'] = np.log1p(nearest)
bed_df.loc[idx_list, 'dist_nearest_nonoverlap_same_type_rel_log1p'] = np.log1p(nearest_rel)

# Local center-based same-type SV density counts.
centers = ((left + right) // 2).astype(np.int64)
sorted_centers = np.sort(centers)
for window_bp, col_name in [
(1000, 'local_same_type_count_1kb'),
(10000, 'local_same_type_count_10kb'),
(100000, 'local_same_type_count_100kb'),
]:
lo = np.searchsorted(sorted_centers, centers - window_bp, side='left')
hi = np.searchsorted(sorted_centers, centers + window_bp, side='right')
counts = (hi - lo - 1).astype(np.int32) # Exclude self.
bed_df.loc[idx_list, col_name] = counts

coverage_pct = bed_df['dist_nearest_nonoverlap_same_type'].notna().mean() * 100
logging.info(
'Nearest same-type non-overlap distance computed. Coverage: %.2f%%, mean(log1p): %.3f, mean(rel_log1p): %.3f',
coverage_pct,
bed_df['dist_nearest_nonoverlap_same_type_log1p'].mean(skipna=True),
bed_df['dist_nearest_nonoverlap_same_type_rel_log1p'].mean(skipna=True),
)

# Return the features dataframe.
Expand Down
Loading
Loading