Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
18 changes: 17 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,20 @@ results*
.nextflow*
work
singularity_img/
vcfs/
vcfs/
client-state/
sessions/
projects_settings/
results/
input/exon_testdata
input/ge
input/GTEX_blood_ge
input/lc_testdata
input/tx_testdata
input/txrev_testdata
sbatch_runs/*
input/*
temp_debug
.venv/
EDA/*
test_data_legacy/*
29 changes: 13 additions & 16 deletions bin/generate_leafcutter_plot_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ option_list <- list(
help="The selected qtl_group in the study", metavar = "type"),
make_option(c("-v", "--vcf_file"), type="character", default=NULL,
help="TPM quantile TSV file with phenotype_id column", metavar = "type"),
make_option(c("-b", "--bigwig_path"), type="character", default=NULL,
help="Path to the bigwig files", metavar = "type"),
make_option(c("-c", "--coverage_parquet"), type="character", default=NULL,
help="Path to the coverage parquet file", metavar = "type"),
make_option(c("-m", "--mane_transcript_gene_map"), type="character", default=NULL,
help="Path to the MANE transcripts of genes map file", metavar = "type"),
make_option(c("-g", "--gtf_file"), type="character", default=NULL,
Expand Down Expand Up @@ -146,7 +146,7 @@ sample_meta_path = opt$s
phenotype_meta_path = opt$p
qtl_group_in = opt$q
vcf_file_path = opt$v
bigwig_files_path = opt$b
coverage_parquet_path = opt$c
mane_transcript_gene_map_file = opt$m
tpm_matrix_path = opt$tpm_matrix
gtf_file_path = opt$g
Expand All @@ -162,7 +162,7 @@ message("######### susie_file_path : ", susie_file_path)
message("######### sample_meta_path : ", sample_meta_path)
message("######### phenotype_meta_path: ", phenotype_meta_path)
message("######### vcf_file_path : ", vcf_file_path)
message("######### bigwig_files_path : ", bigwig_files_path)
message("######### coverage_parquet : ", coverage_parquet_path)
message("######### mane_map_file_path : ", mane_transcript_gene_map_file)
message("######### gtf_file_path : ", gtf_file_path)
message("######### scaling_fct_path : ", scaling_factors_path)
Expand Down Expand Up @@ -277,21 +277,13 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
var_genotype <- sample_meta_clean %>%
left_join(var_genotype, by = "genotype_id")

bigwig_files <- list.files(bigwig_files_path, full.names = T)

track_data_study <- data.frame(file_name = (gsub(pattern = paste0(bigwig_files_path, "/"), replacement = "", x = bigwig_files, fixed = T)),
bigWig = bigwig_files)

track_data_study <- track_data_study %>%
dplyr::mutate(sample_id = gsub(pattern = ".bigwig", replacement = "", x = file_name)) %>%
dplyr::filter(sample_id %in% sample_meta_clean$sample_id)

track_data_study <-track_data_study %>%
dplyr::left_join(var_genotype %>% dplyr::select(sample_id, qtl_group, DS), by = c("sample_id")) %>%
track_data_study <- var_genotype %>%
dplyr::select(sample_id, qtl_group, DS) %>%
dplyr::left_join(scaling_factor_data) %>%
dplyr::rename(scaling_factor = scaling_factors) %>%
dplyr::mutate(track_id = qtl_group) %>%
dplyr::mutate(colour_group = as.character(DS)) %>%
dplyr::mutate(bigWig = "") %>%
dplyr::select(sample_id, scaling_factor, bigWig, track_id, colour_group, qtl_group) %>%
dplyr::filter(qtl_group==qtl_group_in)

Expand Down Expand Up @@ -354,7 +346,8 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
coverage_data_list = tryCatch(wiggleplotr::extractCoverageData(exons = exons_to_plot,
cdss = exon_cdss_to_plot, # Does var will be default NULL if not stated?
plot_fraction = 0.2,
track_data = track_data_study),
track_data = track_data_study,
coverage_ranges_pq = coverage_parquet_path),
error = function(e) {
message(" ## Problem with generating coverage_data wiggleplotr")
message(e)
Expand Down Expand Up @@ -408,6 +401,10 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
track_data_study_box <- track_data_study_box %>%
dplyr::left_join(tpm_exp_df_oi, by = c("sample_id", "intron_id"))

# Remove samples present in expression matrix but missing from metadata-filtered sample set
track_data_study_box <- track_data_study_box %>%
dplyr::filter(!is.na(snp_id))

nom_cc_sumstats_variant_phenotype_id <- read_and_filter_parquet(
file_list = ss_oi$nominal_cc_path[[1]],
variant_to_match = ss_oi$variant,
Expand Down
29 changes: 13 additions & 16 deletions bin/generate_plot_exon_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ option_list <- list(
help="The selected qtl_group in the study", metavar = "type"),
make_option(c("-v", "--vcf_file"), type="character", default=NULL,
help="TPM quantile TSV file with phenotype_id column", metavar = "type"),
make_option(c("-b", "--bigwig_path"), type="character", default=NULL,
help="Path to the bigwig files", metavar = "type"),
make_option(c("-c", "--coverage_parquet"), type="character", default=NULL,
help="Path to the coverage parquet file", metavar = "type"),
make_option(c("-m", "--mane_transcript_gene_map"), type="character", default=NULL,
help="Path to the MANE transcripts of genes map file", metavar = "type"),
make_option(c("-g", "--gtf_file"), type="character", default=NULL,
Expand Down Expand Up @@ -134,7 +134,7 @@ sample_meta_path = opt$s
phenotype_meta_path = opt$p
qtl_group_in = opt$q
vcf_file_path = opt$v
bigwig_files_path = opt$b
coverage_parquet_path = opt$c
mane_transcript_gene_map_file = opt$m
gtf_file_path = opt$g
norm_usage_matrix_path = opt$u
Expand All @@ -151,7 +151,7 @@ message("######### susie_file_path : ", susie_file_path)
message("######### sample_meta_path : ", sample_meta_path)
message("######### phenotype_meta_path: ", phenotype_meta_path)
message("######### vcf_file_path : ", vcf_file_path)
message("######### bigwig_files_path : ", bigwig_files_path)
message("######### coverage_parquet : ", coverage_parquet_path)
message("######### mane_map_file_path : ", mane_transcript_gene_map_file)
message("######### gtf_file_path : ", gtf_file_path)
message("######### tpm_matrix : ", tpm_matrix_path)
Expand Down Expand Up @@ -284,21 +284,13 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
var_genotype <- sample_meta_clean %>%
left_join(var_genotype, by = "genotype_id")

bigwig_files <- list.files(bigwig_files_path, full.names = T)

track_data_study <- data.frame(file_name = (gsub(pattern = paste0(bigwig_files_path, "/"), replacement = "", x = bigwig_files, fixed = T)),
bigWig = bigwig_files)

track_data_study <- track_data_study %>%
dplyr::mutate(sample_id = gsub(pattern = ".bigwig", replacement = "", x = file_name)) %>%
dplyr::filter(sample_id %in% sample_meta_clean$sample_id)

track_data_study <-track_data_study %>%
dplyr::left_join(var_genotype %>% dplyr::select(sample_id, qtl_group, DS), by = c("sample_id")) %>%
track_data_study <- var_genotype %>%
dplyr::select(sample_id, qtl_group, DS) %>%
dplyr::left_join(scaling_factor_data) %>%
dplyr::rename(scaling_factor = scaling_factors) %>%
dplyr::mutate(track_id = qtl_group) %>%
dplyr::mutate(colour_group = as.character(DS)) %>%
dplyr::mutate(bigWig = "") %>%
dplyr::select(sample_id, scaling_factor, bigWig, track_id, colour_group, qtl_group) %>%
dplyr::filter(qtl_group==qtl_group_in)

Expand Down Expand Up @@ -368,7 +360,8 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
coverage_data_list = tryCatch(wiggleplotr::extractCoverageData(exons = exons_to_plot,
cdss = exon_cdss_to_plot,
plot_fraction = 0.2,
track_data = track_data_study),
track_data = track_data_study,
coverage_ranges_pq = coverage_parquet_path),
error = function(e) {
message(" ## Problem with generating coverage_data wiggleplotr")
message(e)
Expand Down Expand Up @@ -414,6 +407,10 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
track_data_study_box <- track_data_study_box %>%
dplyr::left_join(tpm_exp_df_oi, by = c("sample_id", "tx_id"))

# Remove samples present in expression matrix but missing from metadata-filtered sample set
track_data_study_box <- track_data_study_box %>%
dplyr::filter(!is.na(snp_id))

message(" ## Reading nominal summary stats with seqminer")

# Keep only 1 rsid per variant per molecular_trait_id
Expand Down
54 changes: 30 additions & 24 deletions bin/generate_plot_ge_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ option_list <- list(
help="The selected qtl_group in the study", metavar = "type"),
make_option(c("-v", "--vcf_file"), type="character", default=NULL,
help="TPM quantile TSV file with phenotype_id column", metavar = "type"),
make_option(c("-b", "--bigwig_path"), type="character", default=NULL,
help="Path to the bigwig files", metavar = "type"),
make_option(c("-c", "--coverage_parquet"), type="character", default=NULL,
help="Path to the coverage parquet file", metavar = "type"),
make_option(c("-m", "--mane_transcript_gene_map"), type="character", default=NULL,
help="Path to the MANE transcripts of genes map file", metavar = "type"),
make_option(c("-g", "--gtf_file"), type="character", default=NULL,
Expand Down Expand Up @@ -78,7 +78,7 @@ prepareTranscriptStructureForPlotting <- function(exon_ranges, cds_ranges, trans
filter_trait_matrix <- function(unique_trait_ids, trait_matrix_pq_file) {
trait_dataset <- open_dataset(trait_matrix_pq_file)
filtered_traits_dataset <- trait_dataset %>%
filter(phenotype_id %in% unique_trait_ids)
dplyr::filter(phenotype_id %in% unique_trait_ids)
traits_df <- collect(filtered_traits_dataset)
return(traits_df)
}
Expand Down Expand Up @@ -109,7 +109,7 @@ read_and_filter_parquet <- function(file_list, variant_to_match, phenotype_id) {
dataset <- open_dataset(file_name)

filtered_data <- dataset %>%
filter(variant == variant_to_match & gene_id == phenotype_id) %>%
dplyr::filter(variant == variant_to_match & gene_id == phenotype_id) %>%
collect()

if (nrow(filtered_data) > 0) {
Expand All @@ -126,7 +126,7 @@ susie_file_path = opt$f
sample_meta_path = opt$s
qtl_group_in = opt$q
vcf_file_path = opt$v
bigwig_files_path = opt$b
coverage_parquet_path = opt$c
mane_transcript_gene_map_file = opt$m
gtf_file_path = opt$g
norm_usage_matrix_path = opt$u
Expand All @@ -135,13 +135,27 @@ scaling_factors_path = opt$div_scaling_factors
vcf_sample_bad_symbol = opt$vcf_sample_bad_symbol
vcf_sample_replacement_symbol = opt$vcf_sample_replacement_symbol

# base_path <- "~/alasoo_lab/coverage_plot_fixer/work_dir/"
#
# qtl_group_in <- "LCL"
# susie_file_path <- paste0(base_path, "QTD000764_LCL_ge_1_1.parquet")
# sample_meta_path <- paste0(base_path, "MAGE_McCoy_2023_QC_all_samples.tsv")
# vcf_file_path <- paste0(base_path, "MAGE_with_SV_20220422_3202.filtered_chr5_96894474-96991959_test_samples.vcf.gz")
# mane_transcript_gene_map_file <- paste0(base_path, "MANE_transcript_gene_map.txt")
# gtf_file_path <- paste0(base_path, "Homo_sapiens.GRCh38.105.gtf")
# scaling_factors_path <- paste0(base_path, "MAGE.LCL.scaling_factors.tsv.gz")
# norm_usage_matrix_path <- paste0(base_path, "MAGE.parquet")
# tpm_matrix_path <- paste0(base_path, "MAGE_TPM.parquet")
# coverage_parquet_path <- paste0(base_path, "MAGE_bigwig_single.parquet")


message("######### Options: ######### ")
message("######### Working Directory : ", getwd())
message("######### qtl_group : ", qtl_group_in)
message("######### susie_file_path : ", susie_file_path)
message("######### sample_meta_path : ", sample_meta_path)
message("######### vcf_file_path : ", vcf_file_path)
message("######### bigwig_files_path : ", bigwig_files_path)
message("######### coverage_parquet : ", coverage_parquet_path)
message("######### mane_map_file_path : ", mane_transcript_gene_map_file)
message("######### gtf_file_path : ", gtf_file_path)
message("######### scaling_fct_path : ", scaling_factors_path)
Expand Down Expand Up @@ -197,10 +211,6 @@ norm_exp_df <- filter_trait_matrix(trait_ids, norm_usage_matrix_path)
tpm_exp_df <- filter_trait_matrix(trait_ids, tpm_matrix_path)



###norm_exp_df <- readr::read_tsv(norm_usage_matrix_path)


message(" ## Reading scaling_factors file")
scaling_factor_data <- readr::read_tsv(scaling_factors_path, col_types = "cd")

Expand Down Expand Up @@ -262,25 +272,16 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
var_genotype <- sample_meta_clean %>%
left_join(var_genotype, by = "genotype_id")

bigwig_files <- list.files(bigwig_files_path, full.names = T)

track_data_study <- data.frame(file_name = (gsub(pattern = paste0(bigwig_files_path, "/"), replacement = "", x = bigwig_files, fixed = T)),
bigWig = bigwig_files)

track_data_study <- track_data_study %>%
dplyr::mutate(sample_id = gsub(pattern = ".bigwig", replacement = "", x = file_name)) %>%
dplyr::filter(sample_id %in% sample_meta_clean$sample_id)

track_data_study <-track_data_study %>%
dplyr::left_join(var_genotype %>% dplyr::select(sample_id, qtl_group, DS), by = c("sample_id")) %>%
track_data_study <- var_genotype %>%
dplyr::select(sample_id, qtl_group, DS) %>%
dplyr::left_join(scaling_factor_data) %>%
dplyr::rename(scaling_factor = scaling_factors) %>%
dplyr::mutate(track_id = qtl_group) %>%
dplyr::mutate(colour_group = as.character(DS)) %>%
dplyr::mutate(bigWig = "") %>%
dplyr::select(sample_id, scaling_factor, bigWig, track_id, colour_group, qtl_group) %>%
dplyr::filter(qtl_group==qtl_group_in)


start_time <- time_here(prev_time = start_time, message_text = " >> prepared track_data_study in: ")

nom_exon_cc_sumstats_all <- read_and_filter_parquet(
Expand Down Expand Up @@ -331,7 +332,8 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
coverage_data_list = tryCatch(wiggleplotr::extractCoverageData(exons = exons_to_plot,
cdss = exon_cdss_to_plot,
plot_fraction = 0.2,
track_data = track_data_study),
track_data = track_data_study,
coverage_ranges_pq = coverage_parquet_path),
error = function(e) {
message(" ## Problem with generating coverage_data wiggleplotr")
message(e)
Expand Down Expand Up @@ -369,6 +371,10 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
dplyr::left_join(track_data_study_box, by = "sample_id")
track_data_study_box <- track_data_study_box %>%
dplyr::left_join(tpm_exp_df_oi, by = c("sample_id", "tx_id"))

# Remove samples present in expression matrix but missing from metadata-filtered sample set
track_data_study_box <- track_data_study_box %>%
dplyr::filter(!is.na(snp_id))

message(" ## Filter nominal summstats")

Expand Down Expand Up @@ -425,4 +431,4 @@ for (index in 1:nrow(highest_pip_vars_per_cs)) {
nom_exon_cc_sumstats_filt <- nom_exon_cc_sumstats_filt %>%
mutate(rsid = stringr::str_trim(rsid))
arrow::write_parquet(nom_exon_cc_sumstats_filt, file.path(output_path, paste0("nom_exon_cc_", signal_name, ".parquet")))
}
}
Loading