freimannk · kerimoff · Nov 17, 2025 · Nov 17, 2025 · Jan 30, 2026 · Feb 3, 2026
diff --git a/.gitignore b/.gitignore
@@ -4,4 +4,20 @@ results*
 .nextflow*
 work
 singularity_img/
-vcfs/
+vcfs/
+client-state/
+sessions/
+projects_settings/
+results/
+input/exon_testdata
+input/ge
+input/GTEX_blood_ge
+input/lc_testdata
+input/tx_testdata
+input/txrev_testdata
+sbatch_runs/*
+input/*
+temp_debug
+.venv/
+EDA/*
+test_data_legacy/*
diff --git a/bin/generate_leafcutter_plot_data.R b/bin/generate_leafcutter_plot_data.R
@@ -21,8 +21,8 @@ option_list <- list(
               help="The selected qtl_group in the study", metavar = "type"),
   make_option(c("-v", "--vcf_file"), type="character", default=NULL,
               help="TPM quantile TSV file with phenotype_id column", metavar = "type"),
-  make_option(c("-b", "--bigwig_path"), type="character", default=NULL,
-              help="Path to the bigwig files", metavar = "type"),
+  make_option(c("-c", "--coverage_parquet"), type="character", default=NULL,
+              help="Path to the coverage parquet file", metavar = "type"),
   make_option(c("-m", "--mane_transcript_gene_map"), type="character", default=NULL,
               help="Path to the MANE transcripts of genes map file", metavar = "type"),
   make_option(c("-g", "--gtf_file"), type="character", default=NULL,
@@ -146,7 +146,7 @@ sample_meta_path = opt$s
 phenotype_meta_path = opt$p
 qtl_group_in = opt$q
 vcf_file_path = opt$v
-bigwig_files_path = opt$b
+coverage_parquet_path = opt$c
 mane_transcript_gene_map_file = opt$m
 tpm_matrix_path = opt$tpm_matrix
 gtf_file_path = opt$g
@@ -162,7 +162,7 @@ message("######### susie_file_path    : ", susie_file_path)
 message("######### sample_meta_path   : ", sample_meta_path)
 message("######### phenotype_meta_path: ", phenotype_meta_path)
 message("######### vcf_file_path      : ", vcf_file_path)
-message("######### bigwig_files_path  : ", bigwig_files_path)
+message("######### coverage_parquet   : ", coverage_parquet_path)
 message("######### mane_map_file_path : ", mane_transcript_gene_map_file)
 message("######### gtf_file_path      : ", gtf_file_path)
 message("######### scaling_fct_path   : ", scaling_factors_path)
@@ -277,21 +277,13 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   var_genotype <- sample_meta_clean %>% 
     left_join(var_genotype, by = "genotype_id")
 
-  bigwig_files <- list.files(bigwig_files_path, full.names = T)
-
-  track_data_study <- data.frame(file_name = (gsub(pattern = paste0(bigwig_files_path, "/"), replacement = "", x = bigwig_files, fixed = T)),
-                                 bigWig = bigwig_files)
-
-  track_data_study <- track_data_study %>% 
-    dplyr::mutate(sample_id = gsub(pattern = ".bigwig", replacement = "", x = file_name)) %>% 
-    dplyr::filter(sample_id %in% sample_meta_clean$sample_id)
-
-  track_data_study <-track_data_study %>% 
-    dplyr::left_join(var_genotype %>% dplyr::select(sample_id, qtl_group, DS), by = c("sample_id")) %>% 
+  track_data_study <- var_genotype %>% 
+    dplyr::select(sample_id, qtl_group, DS) %>% 
     dplyr::left_join(scaling_factor_data) %>% 
     dplyr::rename(scaling_factor = scaling_factors) %>% 
     dplyr::mutate(track_id = qtl_group) %>% 
     dplyr::mutate(colour_group = as.character(DS)) %>% 
+    dplyr::mutate(bigWig = "") %>% 
     dplyr::select(sample_id, scaling_factor, bigWig, track_id, colour_group, qtl_group) %>% 
     dplyr::filter(qtl_group==qtl_group_in)
 
@@ -354,7 +346,8 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   coverage_data_list = tryCatch(wiggleplotr::extractCoverageData(exons = exons_to_plot, 
                                                                  cdss = exon_cdss_to_plot, # Does var will be default NULL if not stated?
                                                                  plot_fraction = 0.2,
-                                                                 track_data = track_data_study), 
+                                                                 track_data = track_data_study,
+                                                                 coverage_ranges_pq = coverage_parquet_path), 
                                 error = function(e) {
                                   message(" ## Problem with generating coverage_data wiggleplotr")
                                   message(e)
@@ -408,6 +401,10 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   track_data_study_box <- track_data_study_box %>%  
     dplyr::left_join(tpm_exp_df_oi, by = c("sample_id", "intron_id"))
 
+  # Remove samples present in expression matrix but missing from metadata-filtered sample set
+  track_data_study_box <- track_data_study_box %>%
+    dplyr::filter(!is.na(snp_id))
+
   nom_cc_sumstats_variant_phenotype_id <- read_and_filter_parquet( 
     file_list = ss_oi$nominal_cc_path[[1]],
     variant_to_match = ss_oi$variant,

diff --git a/bin/generate_plot_exon_data.R b/bin/generate_plot_exon_data.R
@@ -13,8 +13,8 @@ option_list <- list(
               help="The selected qtl_group in the study", metavar = "type"),
   make_option(c("-v", "--vcf_file"), type="character", default=NULL,
               help="TPM quantile TSV file with phenotype_id column", metavar = "type"),
-  make_option(c("-b", "--bigwig_path"), type="character", default=NULL,
-              help="Path to the bigwig files", metavar = "type"),
+  make_option(c("-c", "--coverage_parquet"), type="character", default=NULL,
+              help="Path to the coverage parquet file", metavar = "type"),
   make_option(c("-m", "--mane_transcript_gene_map"), type="character", default=NULL,
               help="Path to the MANE transcripts of genes map file", metavar = "type"),
   make_option(c("-g", "--gtf_file"), type="character", default=NULL,
@@ -134,7 +134,7 @@ sample_meta_path = opt$s
 phenotype_meta_path = opt$p
 qtl_group_in = opt$q
 vcf_file_path = opt$v
-bigwig_files_path = opt$b
+coverage_parquet_path = opt$c
 mane_transcript_gene_map_file = opt$m
 gtf_file_path = opt$g
 norm_usage_matrix_path = opt$u
@@ -151,7 +151,7 @@ message("######### susie_file_path    : ", susie_file_path)
 message("######### sample_meta_path   : ", sample_meta_path)
 message("######### phenotype_meta_path: ", phenotype_meta_path)
 message("######### vcf_file_path      : ", vcf_file_path)
-message("######### bigwig_files_path  : ", bigwig_files_path)
+message("######### coverage_parquet   : ", coverage_parquet_path)
 message("######### mane_map_file_path : ", mane_transcript_gene_map_file)
 message("######### gtf_file_path      : ", gtf_file_path)
 message("######### tpm_matrix         : ", tpm_matrix_path)
@@ -284,21 +284,13 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   var_genotype <- sample_meta_clean %>% 
     left_join(var_genotype, by = "genotype_id")
 
-  bigwig_files <- list.files(bigwig_files_path, full.names = T)
-
-  track_data_study <- data.frame(file_name = (gsub(pattern = paste0(bigwig_files_path, "/"), replacement = "", x = bigwig_files, fixed = T)),
-                                 bigWig = bigwig_files)
-
-  track_data_study <- track_data_study %>% 
-    dplyr::mutate(sample_id = gsub(pattern = ".bigwig", replacement = "", x = file_name)) %>% 
-    dplyr::filter(sample_id %in% sample_meta_clean$sample_id)
-
-  track_data_study <-track_data_study %>% 
-    dplyr::left_join(var_genotype %>% dplyr::select(sample_id, qtl_group, DS), by = c("sample_id")) %>% 
+  track_data_study <- var_genotype %>% 
+    dplyr::select(sample_id, qtl_group, DS) %>% 
     dplyr::left_join(scaling_factor_data) %>% 
     dplyr::rename(scaling_factor = scaling_factors) %>% 
     dplyr::mutate(track_id = qtl_group) %>% 
     dplyr::mutate(colour_group = as.character(DS)) %>% 
+    dplyr::mutate(bigWig = "") %>% 
     dplyr::select(sample_id, scaling_factor, bigWig, track_id, colour_group, qtl_group) %>% 
     dplyr::filter(qtl_group==qtl_group_in)
 
@@ -368,7 +360,8 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   coverage_data_list = tryCatch(wiggleplotr::extractCoverageData(exons = exons_to_plot, 
                                                                  cdss = exon_cdss_to_plot, 
                                                                  plot_fraction = 0.2,
-                                                                 track_data = track_data_study), 
+                                                                 track_data = track_data_study, 
+                                                                 coverage_ranges_pq = coverage_parquet_path), 
                                 error = function(e) {
                                   message(" ## Problem with generating coverage_data wiggleplotr")
                                   message(e)
@@ -414,6 +407,10 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   track_data_study_box <- track_data_study_box %>%  
     dplyr::left_join(tpm_exp_df_oi, by = c("sample_id", "tx_id"))
 
+  # Remove samples present in expression matrix but missing from metadata-filtered sample set
+  track_data_study_box <- track_data_study_box %>%
+    dplyr::filter(!is.na(snp_id))
+
   message(" ## Reading nominal summary stats with seqminer")
 
   # Keep only 1 rsid per variant per molecular_trait_id

diff --git a/bin/generate_plot_ge_data.R b/bin/generate_plot_ge_data.R
@@ -18,8 +18,8 @@ option_list <- list(
               help="The selected qtl_group in the study", metavar = "type"),
   make_option(c("-v", "--vcf_file"), type="character", default=NULL,
               help="TPM quantile TSV file with phenotype_id column", metavar = "type"),
-  make_option(c("-b", "--bigwig_path"), type="character", default=NULL,
-              help="Path to the bigwig files", metavar = "type"),
+  make_option(c("-c", "--coverage_parquet"), type="character", default=NULL,
+              help="Path to the coverage parquet file", metavar = "type"),
   make_option(c("-m", "--mane_transcript_gene_map"), type="character", default=NULL,
               help="Path to the MANE transcripts of genes map file", metavar = "type"),
   make_option(c("-g", "--gtf_file"), type="character", default=NULL,
@@ -78,7 +78,7 @@ prepareTranscriptStructureForPlotting <- function(exon_ranges, cds_ranges, trans
 filter_trait_matrix <- function(unique_trait_ids, trait_matrix_pq_file) {
   trait_dataset <- open_dataset(trait_matrix_pq_file)
   filtered_traits_dataset <- trait_dataset %>%
-    filter(phenotype_id %in% unique_trait_ids)
+    dplyr::filter(phenotype_id %in% unique_trait_ids)
   traits_df <- collect(filtered_traits_dataset)
   return(traits_df)
 }
@@ -109,7 +109,7 @@ read_and_filter_parquet <- function(file_list, variant_to_match, phenotype_id) {
     dataset <- open_dataset(file_name)
 
     filtered_data <- dataset %>%
-      filter(variant == variant_to_match & gene_id == phenotype_id) %>%
+      dplyr::filter(variant == variant_to_match & gene_id == phenotype_id) %>%
       collect()
 
     if (nrow(filtered_data) > 0) {
@@ -126,7 +126,7 @@ susie_file_path = opt$f
 sample_meta_path = opt$s
 qtl_group_in = opt$q
 vcf_file_path = opt$v
-bigwig_files_path = opt$b
+coverage_parquet_path = opt$c
 mane_transcript_gene_map_file = opt$m
 gtf_file_path = opt$g
 norm_usage_matrix_path = opt$u
@@ -135,13 +135,27 @@ scaling_factors_path = opt$div_scaling_factors
 vcf_sample_bad_symbol = opt$vcf_sample_bad_symbol
 vcf_sample_replacement_symbol = opt$vcf_sample_replacement_symbol
 
+# base_path <- "~/alasoo_lab/coverage_plot_fixer/work_dir/"
+# 
+# qtl_group_in <- "LCL"
+# susie_file_path <- paste0(base_path, "QTD000764_LCL_ge_1_1.parquet")
+# sample_meta_path <- paste0(base_path, "MAGE_McCoy_2023_QC_all_samples.tsv")
+# vcf_file_path <- paste0(base_path, "MAGE_with_SV_20220422_3202.filtered_chr5_96894474-96991959_test_samples.vcf.gz")
+# mane_transcript_gene_map_file <- paste0(base_path, "MANE_transcript_gene_map.txt")
+# gtf_file_path <- paste0(base_path, "Homo_sapiens.GRCh38.105.gtf")
+# scaling_factors_path <- paste0(base_path, "MAGE.LCL.scaling_factors.tsv.gz")
+# norm_usage_matrix_path <- paste0(base_path, "MAGE.parquet")
+# tpm_matrix_path <- paste0(base_path, "MAGE_TPM.parquet")
+# coverage_parquet_path <- paste0(base_path, "MAGE_bigwig_single.parquet")
+
+
 message("######### Options: ######### ")
 message("######### Working Directory  : ", getwd())
 message("######### qtl_group          : ", qtl_group_in)
 message("######### susie_file_path    : ", susie_file_path)
 message("######### sample_meta_path   : ", sample_meta_path)
 message("######### vcf_file_path      : ", vcf_file_path)
-message("######### bigwig_files_path  : ", bigwig_files_path)
+message("######### coverage_parquet   : ", coverage_parquet_path)
 message("######### mane_map_file_path : ", mane_transcript_gene_map_file)
 message("######### gtf_file_path      : ", gtf_file_path)
 message("######### scaling_fct_path   : ", scaling_factors_path)
@@ -197,10 +211,6 @@ norm_exp_df <- filter_trait_matrix(trait_ids, norm_usage_matrix_path)
 tpm_exp_df <- filter_trait_matrix(trait_ids, tpm_matrix_path)
 
 
-
-###norm_exp_df <- readr::read_tsv(norm_usage_matrix_path)
-
-
 message(" ## Reading scaling_factors file")
 scaling_factor_data <- readr::read_tsv(scaling_factors_path, col_types = "cd") 
 
@@ -262,25 +272,16 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   var_genotype <- sample_meta_clean %>% 
     left_join(var_genotype, by = "genotype_id")
 
-  bigwig_files <- list.files(bigwig_files_path, full.names = T)
-
-  track_data_study <- data.frame(file_name = (gsub(pattern = paste0(bigwig_files_path, "/"), replacement = "", x = bigwig_files, fixed = T)),
-                                 bigWig = bigwig_files)
-
-  track_data_study <- track_data_study %>% 
-    dplyr::mutate(sample_id = gsub(pattern = ".bigwig", replacement = "", x = file_name)) %>% 
-    dplyr::filter(sample_id %in% sample_meta_clean$sample_id)
-
-  track_data_study <-track_data_study %>% 
-    dplyr::left_join(var_genotype %>% dplyr::select(sample_id, qtl_group, DS), by = c("sample_id")) %>% 
+  track_data_study <- var_genotype %>% 
+    dplyr::select(sample_id, qtl_group, DS) %>% 
     dplyr::left_join(scaling_factor_data) %>% 
     dplyr::rename(scaling_factor = scaling_factors) %>% 
     dplyr::mutate(track_id = qtl_group) %>% 
     dplyr::mutate(colour_group = as.character(DS)) %>% 
+    dplyr::mutate(bigWig = "") %>% 
     dplyr::select(sample_id, scaling_factor, bigWig, track_id, colour_group, qtl_group) %>% 
     dplyr::filter(qtl_group==qtl_group_in)
 
-
   start_time <- time_here(prev_time = start_time, message_text = " >> prepared track_data_study in: ")
 
   nom_exon_cc_sumstats_all <- read_and_filter_parquet( 
@@ -331,7 +332,8 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   coverage_data_list = tryCatch(wiggleplotr::extractCoverageData(exons = exons_to_plot, 
                                                                  cdss = exon_cdss_to_plot, 
                                                                  plot_fraction = 0.2,
-                                                                 track_data = track_data_study), 
+                                                                 track_data = track_data_study, 
+                                                                 coverage_ranges_pq = coverage_parquet_path), 
                                 error = function(e) {
                                   message(" ## Problem with generating coverage_data wiggleplotr")
                                   message(e)
@@ -369,6 +371,10 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
     dplyr::left_join(track_data_study_box, by = "sample_id") 
   track_data_study_box <- track_data_study_box %>%  
     dplyr::left_join(tpm_exp_df_oi, by = c("sample_id", "tx_id"))
+
+  # Remove samples present in expression matrix but missing from metadata-filtered sample set
+  track_data_study_box <- track_data_study_box %>%
+    dplyr::filter(!is.na(snp_id))
 
   message(" ## Filter nominal summstats")
 
@@ -425,4 +431,4 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
   nom_exon_cc_sumstats_filt <- nom_exon_cc_sumstats_filt %>%
     mutate(rsid = stringr::str_trim(rsid))
   arrow::write_parquet(nom_exon_cc_sumstats_filt, file.path(output_path, paste0("nom_exon_cc_", signal_name, ".parquet")))
-}
+}