From a144ae698f52f700fe9dcb168e9d56e50e08c3cd Mon Sep 17 00:00:00 2001 From: azmtag Date: Sun, 10 Mar 2024 13:45:00 +0100 Subject: [PATCH 1/4] NRPSPredictor_utils: handle SVM_entry getting incorrect input --- src/nerpa_pipeline/NRPSPredictor_utils/json_handler.py | 4 +++- src/nerpa_pipeline/NRPSPredictor_utils/main.py | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/nerpa_pipeline/NRPSPredictor_utils/json_handler.py b/src/nerpa_pipeline/NRPSPredictor_utils/json_handler.py index 806ab90..7dd2081 100644 --- a/src/nerpa_pipeline/NRPSPredictor_utils/json_handler.py +++ b/src/nerpa_pipeline/NRPSPredictor_utils/json_handler.py @@ -68,7 +68,7 @@ def __init__(self, domain_prediction): for stachelhaus_match in prediction_data['stachelhaus_matches']], default=0) self.uncertain = stachelhaus_match_count < 7 # not so sure about this - else: # older version of antismash + elif 'NRPSPredictor2' in domain_prediction: # older version of antismash prediction_data = domain_prediction['NRPSPredictor2'] self.angstrom_code = prediction_data['angstrom_code'] @@ -78,6 +78,8 @@ def __init__(self, domain_prediction): self.small_cluster_pred = prediction_data['small_cluster_pred'] self.single_amino_pred = prediction_data['single_amino_pred'] self.uncertain = prediction_data['uncertain'] + else: + raise RuntimeError('Neither "nrpys" nor "NRPSPredictor2" in domain prediction.') def __str__(self): return '\t'.join([self.angstrom_code, diff --git a/src/nerpa_pipeline/NRPSPredictor_utils/main.py b/src/nerpa_pipeline/NRPSPredictor_utils/main.py index d2a5673..fb656cf 100755 --- a/src/nerpa_pipeline/NRPSPredictor_utils/main.py +++ b/src/nerpa_pipeline/NRPSPredictor_utils/main.py @@ -99,9 +99,12 @@ def main(args): is_root_outdir = True if (args.output_dir is not None and len(args.inputs) > 1) else False processed_output_dirs = [] for input_path in args.inputs: - processed_output_dirs.append(json_handler.handle_single_input( - Path(input_path), args.output_dir, is_root_outdir, args.naming_style, - known_codes, scoring_mode=args.mode, verbose=args.verbose)) + try: + processed_output_dirs.append(json_handler.handle_single_input( + Path(input_path), args.output_dir, is_root_outdir, args.naming_style, + known_codes, scoring_mode=args.mode, verbose=args.verbose)) + except RuntimeError as e: + info(f'ERROR: Unable to parse the input at "{input_path}": {e}') return processed_output_dirs From 88d35439c5786ef4580348a196eaa66d9888fefd Mon Sep 17 00:00:00 2001 From: azmtag Date: Sun, 10 Mar 2024 21:25:23 +0100 Subject: [PATCH 2/4] pipeline: handle exceptions while creating predictions --- .../predictions_preprocessor.py | 84 ++++++++++--------- 1 file changed, 45 insertions(+), 39 deletions(-) diff --git a/src/nerpa_pipeline/predictions_preprocessor.py b/src/nerpa_pipeline/predictions_preprocessor.py index 08ee132..52e8318 100644 --- a/src/nerpa_pipeline/predictions_preprocessor.py +++ b/src/nerpa_pipeline/predictions_preprocessor.py @@ -77,45 +77,51 @@ def create_predictions_by_antiSMASH_out(antiSMASH_outs, outdir, log): predictions_info_file = os.path.join(outdir, "predictions.info") predictions_info_list = [] for dirname in antiSMASH_outs: - if dirname[-1] == '\n': - dirname = dirname[:-1] - - orf_pos = handle_helper.get_orf_position(dirname) - orf_ori = handle_helper.get_orf_orientation(dirname) - orf_domains = handle_helper.get_orf_domain_list(dirname) - - print("====PARTS BEFORE: ") - parts = handle_helper.get_parts(dirname) - handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) - - #print("====SPLIT BY DIST:") - parts = splitter.split_by_dist(parts, orf_pos) - #handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) - - #print("====SPLIT BY SINGLE ORF WITH Starter-TE") - parts = splitter.split_by_one_orf_Starter_TE(parts, orf_ori, orf_domains) - #handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) - - #print("====REMOVE SINGLE DOMAINs ORFS") - parts = splitter.split_by_single_domain_orf(parts, orf_ori, orf_domains) - #handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) - - print("====SPLIT AND REORDER") - parts = splitter.split_and_reorder(parts, orf_ori, orf_pos, orf_domains) - handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) - - - nrpspred_dir = os.path.join(dirname, "nrpspks_predictions_txt") - if os.path.isdir(nrpspred_dir): - for filename in os.listdir(nrpspred_dir): - if filename.endswith('nrpspredictor2_codes.txt'): - base_antiSMASHout_name = os.path.basename(dirname) - base_pred_name = os.path.basename(filename) - #predictions_info_list.append(os.path.join(dir_for_predictions, base_antiSMASHout_name + "_" + base_pred_name)) - #shutil.copyfile(os.path.join(nrpspred_dir, filename), os.path.join(dir_for_predictions, base_antiSMASHout_name + "_" + base_pred_name)) - gen_predictions(parts, os.path.join(nrpspred_dir, filename), - os.path.join(dir_for_predictions, base_antiSMASHout_name + "_" + base_pred_name)[:-4], - 0, predictions_info_list, dirname) + try: + if dirname[-1] == '\n': + dirname = dirname[:-1] + + orf_pos = handle_helper.get_orf_position(dirname) + orf_ori = handle_helper.get_orf_orientation(dirname) + orf_domains = handle_helper.get_orf_domain_list(dirname) + + print("====PARTS BEFORE: ") + parts = handle_helper.get_parts(dirname) + handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) + + #print("====SPLIT BY DIST:") + parts = splitter.split_by_dist(parts, orf_pos) + #handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) + + #print("====SPLIT BY SINGLE ORF WITH Starter-TE") + parts = splitter.split_by_one_orf_Starter_TE(parts, orf_ori, orf_domains) + #handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) + + #print("====REMOVE SINGLE DOMAINs ORFS") + parts = splitter.split_by_single_domain_orf(parts, orf_ori, orf_domains) + #handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) + + print("====SPLIT AND REORDER") + parts = splitter.split_and_reorder(parts, orf_ori, orf_pos, orf_domains) + handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) + + + nrpspred_dir = os.path.join(dirname, "nrpspks_predictions_txt") + if os.path.isdir(nrpspred_dir): + for filename in os.listdir(nrpspred_dir): + if filename.endswith('nrpspredictor2_codes.txt'): + base_antiSMASHout_name = os.path.basename(dirname) + base_pred_name = os.path.basename(filename) + #predictions_info_list.append(os.path.join(dir_for_predictions, base_antiSMASHout_name + "_" + base_pred_name)) + #shutil.copyfile(os.path.join(nrpspred_dir, filename), os.path.join(dir_for_predictions, base_antiSMASHout_name + "_" + base_pred_name)) + gen_predictions(parts, os.path.join(nrpspred_dir, filename), + os.path.join(dir_for_predictions, base_antiSMASHout_name + "_" + base_pred_name)[:-4], + 0, predictions_info_list, dirname) + except KeyboardInterrupt as e: + raise e + except Exception as e: + print(f'Error: {type(e).__name__}: {e}') + print(f'Skipping {dirname}') f = open(predictions_info_file, 'w') for line in predictions_info_list: From d4f23dea425b0e1af1867c1ba0c5be6c310c0ccf Mon Sep 17 00:00:00 2001 From: azmtag Date: Sun, 10 Mar 2024 21:27:03 +0100 Subject: [PATCH 3/4] revert me: add an ad hoc limit for max number of predictions per BGC. Skip BGCs exceeding this limit. --- src/nerpa_pipeline/predictions_preprocessor.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/nerpa_pipeline/predictions_preprocessor.py b/src/nerpa_pipeline/predictions_preprocessor.py index 52e8318..6898749 100644 --- a/src/nerpa_pipeline/predictions_preprocessor.py +++ b/src/nerpa_pipeline/predictions_preprocessor.py @@ -105,6 +105,9 @@ def create_predictions_by_antiSMASH_out(antiSMASH_outs, outdir, log): parts = splitter.split_and_reorder(parts, orf_ori, orf_pos, orf_domains) handle_helper.debug_print_parts(dirname, parts, orf_domains, orf_ori, orf_pos) + if len(parts) > 100: + raise RuntimeError(f'Too many parts: {len(parts)}') + nrpspred_dir = os.path.join(dirname, "nrpspks_predictions_txt") if os.path.isdir(nrpspred_dir): From 099c0db645ed299edefc22204ac19c2dbd429581 Mon Sep 17 00:00:00 2001 From: azmtag Date: Thu, 27 Feb 2025 11:58:08 +0100 Subject: [PATCH 4/4] revert me: catch unmanaged exceptions in antismash preprocessing and continue --- src/nerpa_pipeline/NRPSPredictor_utils/main.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/nerpa_pipeline/NRPSPredictor_utils/main.py b/src/nerpa_pipeline/NRPSPredictor_utils/main.py index fb656cf..fbd93ad 100755 --- a/src/nerpa_pipeline/NRPSPredictor_utils/main.py +++ b/src/nerpa_pipeline/NRPSPredictor_utils/main.py @@ -103,8 +103,12 @@ def main(args): processed_output_dirs.append(json_handler.handle_single_input( Path(input_path), args.output_dir, is_root_outdir, args.naming_style, known_codes, scoring_mode=args.mode, verbose=args.verbose)) + except KeyboardInterrupt as e: + raise e except RuntimeError as e: info(f'ERROR: Unable to parse the input at "{input_path}": {e}') + except Exception as e: + info(f'ERROR: Unmanaged Exception while parsing the input at "{input_path}": {e}') return processed_output_dirs