pf_python_modelbuilding/app_flask.py at main · USEPA/pf_python_modelbuilding · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870

"""
Flask webservice to build QSAR models with a variety of modeling strategies (RF, SVM, DNN, XGB...more to come?)
Run with Python 3.9 to avoid problems with parallelizing RF (bug in older versions of joblib backing sklearn)
@author: TMARTI02 (Todd Martin) - RF, base webservice code, predictions for new chemicals and reports
@author: GSincl01 (Gabriel Sinclair) - SVM (based on work by CRupakhe), XGB, refactored webservice code
@author: cramslan (Christian Ramsland) - DNN
Repository created 05/21/2021
"""

from flask import request, abort, Flask, send_file, jsonify

import json
import logging
import pickle
import gzip

from model_ws_db_utilities import ModelPredictor, ModelInitializer

# why not make the following methods part of a Utility class then call methods from instance of it?
from model_ws_utilities import get_model_info, call_build_model_with_preselected_descriptors, models, \
    call_build_embedding_ga, call_build_embedding_importance, call_build_embedding_lasso, call_cross_validate, \
    call_do_predictions, instantiateModelForPrediction, get_model_details, call_generate_plot

from applicability_domain import applicability_domain_utilities as adu

from sklearn2pmml import sklearn2pmml

from dotenv import load_dotenv
load_dotenv('personal.env')
from util import predict_constants as pc

import util.get_model_file as gmf
import io

from report_creator_dict import ReportCreator

custom_level_styles = {
    'debug': {'color': 'cyan'},
    'info': {'color': 'yellow'},
    'warning': {'color': 'red', 'bold': True},
    'error': {'color': 'white', 'background': 'red'},
}
from logging import INFO, DEBUG, ERROR
import coloredlogs
level = INFO
coloredlogs.install(level=level, milliseconds=True, level_styles=custom_level_styles,
                    fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s (%(filename)s:%(lineno)d)')

app = Flask(__name__)
log = logging.getLogger('werkzeug')
log.setLevel(logging.DEBUG)


def get_version():
    try:
        from build_info import BUILD_TIMESTAMP, BUILD_NUMBER
    except ImportError:
        BUILD_TIMESTAMP = None
        BUILD_NUMBER = None

    return dict(name="predictor_models",
                title="EPA/Models",
                version="1.0.0",
                compiled=BUILD_TIMESTAMP,
                build_id=BUILD_NUMBER)


def get_metadata():
    return dict(
        version=get_version()
    )


@app.route('/hello/<name>', methods=['GET'])
def say_hello(name):
    """
    API endpoint that returns a greeting for the given name.
    The name is extracted from the URL path parameter.
    """
    return "Hello, " + name


@app.route('/api/predictor_models/<string:qsar_method>/info', methods=['GET'])
def method_info(qsar_method):
    """Returns a short, generic description of the QSAR method"""
    return get_model_info(qsar_method), 200


@app.route('/api/predictor_models/<string:qsar_method>/train', methods=['POST'])
def train(qsar_method):
    """Trains a model for the specified QSAR method on provided data"""

    logging.debug('enter train')

    obj = request.form
    training_tsv = obj.get('training_tsv')  # Retrieves the training data as a TSV
    prediction_tsv = obj.get('prediction_tsv')  # Retrieves the training data as a TSV

    if obj.get('use_pmml'):
        use_pmml = obj.get('use_pmml', '').lower() == 'true'
    else:
        abort(400, 'missing use_pmml')

    # TODO we might want to have option to not use standardization at all- not needed for RF or XGB (only need for kNN)- standardization causes interoperability problems when loading pmml

    if obj.get('include_standardization_in_pmml'):
        include_standardization_in_pmml = obj.get('include_standardization_in_pmml', '').lower() == 'true'
    else:
        abort(400, 'missing include_standardization_in_pmml')

    if training_tsv is None:
        training_tsv = request.files.get('training_tsv').read().decode('UTF-8')
    # Can't train a model without data
    if training_tsv is None:
        abort(400, 'missing training tsv')

    model_id = obj.get('model_id')  # Retrieves the model number to use for persistent storage

    if obj.get('remove_log_p'):  # Sets boolean remove_log_p from string
        remove_log_p = obj.get('remove_log_p', '').lower() == 'true'
    else:
        remove_log_p = False

    if obj.get('num_jobs'):
        n_jobs = int(obj.get('num_jobs'))
    else:
        n_jobs = 8

    embedding = get_embedding(obj)
    logging.debug("embedding = ***\t", embedding, '\t***')

    if embedding and embedding == 'error':
        abort(400, 'non blank embedding and dont have tab character')

    model = call_build_model_with_preselected_descriptors(qsar_method=qsar_method,
                                                          training_tsv=training_tsv,
                                                          prediction_tsv=prediction_tsv,
                                                          remove_log_p=remove_log_p,
                                                          use_pmml_pipeline=use_pmml,
                                                          include_standardization_in_pmml=include_standardization_in_pmml,
                                                          descriptor_names_tsv=embedding,
                                                          n_jobs=n_jobs, filterColumnsInBothSets=True)

    if model is None:
        abort(500, 'unknown model training error')

    # Sets status 200 OK
    status = 200

    # If model number provided for storage, stores the model and sets status 201 CREATED instead
    if model_id.strip():
        models[model_id] = model
        status = 201

    # Returns model bytes
    if use_pmml:
        pmml_file = 'model.pmml'
        sklearn2pmml(model.model_obj,
                     pmml_file)  # write pmml to harddrive temporarily- TODO will this cause problems in docker???
        with open(pmml_file, 'r') as file:
            return bytes(file.read(), 'utf-8'), status  # return pmml as string, todo compress it?
    else:
        return pickle.dumps(model), status


@app.route('/api/predictor_models/prediction_applicability_domain', methods=['POST'])
def prediction_applicability_domain():
    """Generates applicability domain values"""

    obj = request.form

    training_tsv = obj.get('training_tsv')  # Retrieves the training data as a TSV
    test_tsv = obj.get('test_tsv')  # Retrieves the training data as a TSV

    applicability_domain = obj.get('applicability_domain')

    if training_tsv is None:
        training_tsv = request.files.get('training_tsv').read().decode('UTF-8')

    if test_tsv is None:
        test_tsv = request.files.get('test_tsv').read().decode('UTF-8')

    if obj.get('remove_log_p'):  # Sets boolean remove_log_p from string
        remove_log_p = obj.get('remove_log_p', '').lower() == 'true'
    else:
        remove_log_p = False

    # Can't train a model without data
    if training_tsv is None:
        abort(400, 'missing training tsv')

    # Need test set to run AD on:
    if test_tsv is None:
        abort(400, 'missing test tsv')

    embedding = get_embedding(obj)

    logging.debug("embedding = ***\t", embedding, '\t***')

    if embedding and embedding == 'error':
        abort(400, 'non blank embedding and dont have tab character')

    output, ad_cutoff = adu.generate_applicability_domain_with_preselected_descriptors(training_tsv=training_tsv,
                                                                            test_tsv=test_tsv,
                                                                            remove_log_p=remove_log_p,
                                                                            embedding=embedding,
                                                                            applicability_domain=applicability_domain,
                                                                            filterColumnsInBothSets=True)
    result = output.to_json(orient='records', lines=True)
    # print(result)
    return result


def get_embedding(obj):
    embedding_tsv = obj.get('embedding_tsv')

    if embedding_tsv is None:
        embedding_tsv_obj = request.files.get('embedding_tsv')  # try  reading from file
        if embedding_tsv_obj is not None:
            embedding_tsv = embedding_tsv_obj.read().decode('UTF-8')

    if embedding_tsv is None:
        return None

    if len(embedding_tsv) == 0:
        embedding = None
    else:
        embedding = []
        if "\t" in embedding_tsv:
            embedding = embedding_tsv.split("\t")
        else:
            return 'error'

    return embedding


@app.route('/api/predictor_models/<string:qsar_method>/embedding', methods=['POST'])
def train_embedding_ga(qsar_method):
    """Post method that trains GA embedding for the specified QSAR method on provided data"""

    logging.debug('Enter train_embedding (method to make GA based embedding)')

    obj = request.form

    training_tsv = obj.get('training_tsv')  # Retrieves the training data as a TSV
    if training_tsv is None:
        training_tsv = request.files.get('training_tsv').read().decode('UTF-8')
    if training_tsv is None:
        abort(400, 'missing training tsv')

    prediction_tsv = obj.get('prediction_tsv')  # Retrieves the training data as a TSV
    if prediction_tsv is None:
        logging.debug('prediction_tsv is none!')
        prediction_tsv = request.files.get('prediction_tsv').read().decode('UTF-8')
    if prediction_tsv is None:
        abort(400, 'missing prediction tsv')

    if obj.get('remove_log_p'):  # Sets boolean remove_log_p from string
        remove_log_p = obj.get('remove_log_p', '').lower() == 'true'
    else:
        remove_log_p = False

    if obj.get('use_wards'):  # Sets boolean remove_log_p from string
        use_wards = obj.get('use_wards', '').lower() == 'true'
    else:
        use_wards = False

    num_generations = int(obj.get('num_generations'))
    num_optimizers = int(obj.get('num_optimizers'))
    num_jobs = int(obj.get('num_jobs'))

    max_length = int(obj.get('max_length'))
    threshold = int(obj.get('threshold'))
    descriptor_coefficient = float(obj.get('descriptor_coefficient'))
    n_threads = int(obj.get('n_threads'))

    embedding, timeMin = call_build_embedding_ga(qsar_method=qsar_method,
                                                 training_tsv=training_tsv,
                                                 prediction_tsv=prediction_tsv,
                                                 remove_log_p=remove_log_p,
                                                 num_generations=num_generations,
                                                 num_optimizers=num_optimizers,
                                                 num_jobs=num_jobs, n_threads=n_threads,
                                                 descriptor_coefficient=descriptor_coefficient,
                                                 max_length=max_length,
                                                 threshold=threshold,
                                                 use_wards=use_wards,
                                                 run_rfe=False)

    result_obj = {}
    result_obj['embedding'] = embedding
    result_obj['timeMin'] = timeMin
    result_str = json.dumps(result_obj)

    logging.debug('result_str=' + result_str)
    return result_str


@app.route('/api/predictor_models/<string:qsar_method>/embedding_importance', methods=['POST'])
def train_embedding_importance(qsar_method):
    """Post method that trains importance based embedding for the specified QSAR method on provided data"""

    logging.debug('Enter train_embedding_importance')

    obj = request.form

    training_tsv = obj.get('training_tsv')  # Retrieves the training data as a TSV
    if training_tsv is None:
        training_tsv = request.files.get('training_tsv').read().decode('UTF-8')
    if training_tsv is None:
        abort(400, 'missing training tsv')

    prediction_tsv = obj.get('prediction_tsv')  # Retrieves the training data as a TSV
    if prediction_tsv is None:
        logging.debug('prediction_tsv is none!')
        prediction_tsv = request.files.get('prediction_tsv').read().decode('UTF-8')
    if prediction_tsv is None:
        abort(400, 'missing prediction tsv')

    if obj.get('remove_log_p'):  # Sets boolean remove_log_p from string
        remove_log_p = obj.get('remove_log_p', '').lower() == 'true'
    else:
        remove_log_p = False

    if obj.get('use_wards'):  # Sets boolean remove_log_p from string
        use_wards = obj.get('use_wards', '').lower() == 'true'
    else:
        use_wards = False

    if obj.get('run_rfe'):  # Sets boolean remove_log_p from string
        run_rfe = obj.get('run_rfe', '').lower() == 'true'
    else:
        run_rfe = False

    if obj.get('use_permutative'):  # Sets boolean remove_log_p from string
        use_permutative = obj.get('use_permutative', '').lower() == 'true'
    else:
        use_permutative = False

    # TODO add importance_type for when not using permutative importance

    num_generations = int(obj.get('num_generations'))
    fraction_of_max_importance = float(obj.get('fraction_of_max_importance'))
    min_descriptor_count = int(obj.get('min_descriptor_count'))
    max_descriptor_count = int(obj.get('max_descriptor_count'))
    n_threads = int(obj.get('n_threads'))

    embedding, timeMin = call_build_embedding_importance(qsar_method=qsar_method,
                                                         training_tsv=training_tsv,
                                                         prediction_tsv=prediction_tsv,
                                                         remove_log_p_descriptors=remove_log_p,
                                                         n_threads=n_threads,
                                                         num_generations=num_generations,
                                                         use_permutative=use_permutative,
                                                         run_rfe=run_rfe,
                                                         fraction_of_max_importance=fraction_of_max_importance,
                                                         min_descriptor_count=min_descriptor_count,
                                                         max_descriptor_count=max_descriptor_count,
                                                         use_wards=use_wards)

    result_obj = {}
    result_obj['embedding'] = embedding
    result_obj['timeMin'] = timeMin
    result_str = json.dumps(result_obj)

    logging.debug('result_str=' + result_str)
    return result_str


@app.route('/api/predictor_models/<string:qsar_method>/embedding_lasso', methods=['POST'])
def train_embedding_lasso(qsar_method):
    """Post method that trains importance based embedding for the specified QSAR method on provided data"""

    logging.debug('Enter train_embedding_importance')

    obj = request.form

    training_tsv = obj.get('training_tsv')  # Retrieves the training data as a TSV
    if training_tsv is None:
        training_tsv = request.files.get('training_tsv').read().decode('UTF-8')
    if training_tsv is None:
        abort(400, 'missing training tsv')

    prediction_tsv = obj.get('prediction_tsv')  # Retrieves the training data as a TSV
    if prediction_tsv is None:
        print('prediction_tsv is none!')
        prediction_tsv = request.files.get('prediction_tsv').read().decode('UTF-8')
    if prediction_tsv is None:
        abort(400, 'missing prediction tsv')

    if obj.get('remove_log_p'):  # Sets boolean remove_log_p from string
        remove_log_p = obj.get('remove_log_p', '').lower() == 'true'
    else:
        remove_log_p = False

    if obj.get('run_rfe'):  # Sets boolean remove_log_p from string
        run_rfe = obj.get('run_rfe', '').lower() == 'true'
    else:
        run_rfe = False

    n_threads = int(obj.get('n_threads'))

    embedding, timeMin = call_build_embedding_lasso(qsar_method=qsar_method,
                                                    training_tsv=training_tsv,
                                                    prediction_tsv=prediction_tsv,
                                                    remove_log_p_descriptors=remove_log_p,
                                                    n_threads=n_threads,
                                                    run_rfe=run_rfe)

    result_obj = {}
    result_obj['embedding'] = embedding
    result_obj['timeMin'] = timeMin
    result_str = json.dumps(result_obj)

    print('result_str=' + result_str)
    return result_str


@app.route('/api/predictor_models/<string:qsar_method>/cross_validate', methods=['POST'])
def cross_validate_fold(qsar_method):
    """Trains a model for the specified QSAR method on provided data"""
    print('\n********************************************************************************************************')
    print('run_cross_validate_fold')

    obj = request.form

    if obj.get('use_pmml'):
        use_pmml = obj.get('use_pmml', '').lower() == 'true'
    else:
        abort(400, 'missing use_pmml')

    training_tsv = obj.get('training_tsv')  # Retrieves the training data as a TSV
    if training_tsv is None:
        training_tsv = request.files.get('training_tsv').read().decode('UTF-8')
    if training_tsv is None:
        abort(400, 'missing training tsv')

    prediction_tsv = obj.get('prediction_tsv')  # Retrieves the training data as a TSV
    if prediction_tsv is None:
        print('prediction_tsv is none!')
        prediction_tsv = request.files.get('prediction_tsv').read().decode('UTF-8')
    if prediction_tsv is None:
        abort(400, 'missing prediction tsv')

    # print('prediction_tsv',prediction_tsv)

    if obj.get('remove_log_p'):  # Sets boolean remove_log_p from string
        remove_log_p = obj.get('remove_log_p', '').lower() == 'true'
    else:
        remove_log_p = False

    if obj.get('num_jobs'):
        n_jobs = int(obj.get('num_jobs'))
    else:
        n_jobs = 8

    embedding = get_embedding(obj)

    if embedding and embedding == 'error':
        abort(400, 'non blank embedding and dont have tab character')

    print("embedding = ***\t", embedding, '\t***')

    hyperparameters = obj.get('hyperparameters')
    hyperparameters = json.loads(hyperparameters)  # convert to dictionary

    return call_cross_validate(qsar_method=qsar_method,
                               cv_training_tsv=training_tsv, cv_prediction_tsv=prediction_tsv,
                               descriptor_names_tsv=embedding,
                               use_pmml_pipeline=use_pmml,
                               remove_log_p=remove_log_p,
                               hyperparameters=hyperparameters, n_jobs=n_jobs)


@app.route('/api/predictor_models/predictDB', methods=['POST', 'GET'])
def predictDB():
    """Automates prediction and AD for single smiles using model in database
    """

    if request.method == 'POST':
        obj = request.form
    elif request.method == 'GET':
        obj = request.args
    smiles = obj.get('smiles')  # Retrieves the model number to use
    model_id = obj.get('model_id')
    report_format = obj.get('report_format', 'json').lower()

    if report_format not in ['json', 'html']:
        report_format = 'json'

    mp = ModelPredictor()
    modelResultsJson = mp.predictFromDB(model_id, smiles)

    if "invalid" in modelResultsJson.lower():
        return modelResultsJson, 400

    if report_format == "html":
        rc = ReportCreator()
        html = rc.create_html_report_from_json(modelResultsJson)
        return html, 200
    else:
        return modelResultsJson, 200

    return mp.predictFromDB(model_id, smiles, report_format), 200


@app.route('/api/predictor_models/predict_identifier', methods=['POST', 'GET'])
def predict_identifier():
    """Automates prediction and AD for single identifier using model in database
    """

    if request.method == 'POST':
        obj = request.form
    elif request.method == 'GET':
        obj = request.args

    identifier = obj.get('identifier')  # Retrieves the model number to use

    model_id = obj.get('model_id')
    report_format = obj.get('report_format', 'json').lower()

    if report_format not in ['json', 'html']:
        report_format = 'json'

    from API_Utilities import SearchAPI
    import os
    serverAPIs = os.getenv("CIM_API_SERVER", "https://cim-dev.sciencedataexperts.com")

    chemicals, code = SearchAPI.call_resolver_get(serverAPIs, identifier)

    # print(chemicals, code)

    if code != 200:
        return jsonify(error="not_found", message=f"Could not find {identifier}"), 404

    if len(chemicals) > 0:
        smiles = chemicals[0]["chemical"]["smiles"]
    else:
        return jsonify(error="not_found", message=f"Could not find {identifier}"), 404

    mp = ModelPredictor()
    modelResultsJson = mp.predictFromDB(model_id, smiles)

    if "invalid" in modelResultsJson.lower():
        return modelResultsJson, 400

    if report_format == "html":
        rc = ReportCreator()
        html = rc.create_html_report_from_json(modelResultsJson)
        return html, 200
    else:
        return modelResultsJson, 200

    return mp.predictFromDB(model_id, smiles, report_format), 200


def _read_text_form_or_file(field_name: str):
    # Prefer file upload
    f = request.files.get(field_name)
    if f:
        name = getattr(f, "filename", "")
        data = f.read()
        if name.endswith(".gz") or (len(data) >= 2 and data[:2] == b"\x1f\x8b"):
            data = gzip.decompress(data)
        return data.decode("utf-8")

    # Fallback to form field
    val = request.form.get(field_name)
    return val


@app.route('/api/predictor_models/predict', methods=['POST'])
def predict():
    """input: model_id and prediction_tsv
       output: predictions json (list of 'id', 'exp', 'pred')
    """

    obj = request.form
    model_id = obj.get('model_id')

    prediction_tsv = _read_text_form_or_file("prediction_tsv")

    if prediction_tsv is None:
        abort(400, 'missing prediction tsv')
    if model_id is None:
        abort(400, 'missing model id')

    if model_id in models:
        model = models[model_id]
    else:
        abort(400, 'Need to init model or use predictDB API call instead')

    if model is None:
        abort(404, 'no stored model with id ' + model_id)

    return call_do_predictions(prediction_tsv, model), 200


@app.route('/api/predictor_models/plot', methods=['POST'])
def generate_plot():
    """Makes predictions for a stored model on provided data"""
    obj = request.form
    model_id = obj.get('model_id')  # Retrieves the model number to use

    model_name = obj.get('model_name')  # Retrieves the model number to use

    plot_type = obj.get('plot_type')

    training_tsv = obj.get('training_tsv')  # Retrieves the prediction data as a TSV
    if training_tsv is None:
        training_tsv = request.files.get('prediction_tsv').read().decode('UTF-8')

    prediction_tsv = obj.get('prediction_tsv')  # Retrieves the prediction data as a TSV
    if prediction_tsv is None:
        prediction_tsv = request.files.get('prediction_tsv').read().decode('UTF-8')

    # Can't make predictions without data
    if prediction_tsv is None:
        abort(400, 'missing prediction tsv')
    # Can't make predictions without a model
    if model_id is None:
        abort(400, 'missing model id')

    if models[model_id] is not None:
        # Gets stored model using model number
        model = models[model_id]
    else:
        abort(400, 'Need to init model first')

    # 404 NOT FOUND if no model stored under provided number
    if model is None:
        abort(404, 'no stored model with id ' + model_id)

    # Calls the appropriate prediction method and returns the results
    return call_generate_plot(training_tsv, prediction_tsv, model, model_name, plot_type), 200


@app.route('/api/predictor_models/initPMML', methods=['POST'])
def initPMML():
    """Loads a model and stores it under the provided number"""

    form_obj = request.get_json()
    model_id = form_obj.get('model_id')  # Retrieves the model number to use for persistent storage

    # Can't store a model unless number is specified
    if model_id is None:
        abort(400, 'missing model id')

    if model_id in models:
        print('already have model in memory')
        model = models[model_id]
        return model.get_model_description(), 201

    # Retrieves the model file from the request files
    model_file = form_obj['model']

    print('use_sklearn2mml in form_obj:', form_obj.get('use_sklearn2pmml'))

    if form_obj.get('use_sklearn2pmml') is None:
        abort(400, 'missing use_sklearn2pmml')

    if isinstance(form_obj.get('use_sklearn2pmml'), str):
        use_sklearn2pmml = form_obj.get('use_sklearn2pmml', '').lower() == 'true'
    else:
        use_sklearn2pmml = form_obj.get('use_sklearn2pmml')

    print('use_sklearn2mml variable', form_obj.get('use_sklearn2pmml'))

    model = None

    # print (files_obj)

    if model_file is None:
        print('Missing model bytes')
        # Can't store a model if none provided
        abort(400, 'missing model bytes')

    print('have model file, type = ', type(model_file))
    pmml_file_path = 'model_api.pmml'

    # model_file.save(pmml_file_path, buffer_size=16384)  # save to hard drive so can load it

    f = open(pmml_file_path, "w")
    f.write(model_file)
    f.close()

    print('wrote pmmlfile to harddrive')

    if isinstance(form_obj['is_binary'], bool):
        is_binary = form_obj['is_binary']
    else:
        is_binary = form_obj['is_binary'].lower == 'true'

    model = instantiateModelForPrediction(qsar_method=form_obj['qsar_method'],
                                          is_binary=is_binary, pmml_file_path=pmml_file_path,
                                          use_sklearn2pmml=use_sklearn2pmml)  # init from app should take care of this when doing from java
    model.set_details(details=form_obj)

    # Stores model under provided number
    models[model_id] = model

    print('After init model_description =', model.get_model_description())

    # 400 BAD REQUEST if something is wrong with the loaded bytes
    if model is None:
        print('Model is none')
        abort(400, 'unknown model initialization error')

    # Return storage ID and 201 CREATED
    return model.get_model_description(), 201


@app.route('/api/predictor_models/initPickle', methods=['POST'])
def initPickle():
    """Loads a model and stores it under the provided number"""
    print('enter initPickle')

    form_obj = request.form
    # print('form_obj',form_obj)
    files_obj = request.files  # Retrieves the files attached to the request

    model_id = form_obj.get('model_id')  # Retrieves the model number to use for persistent storage

    # Can't store a model unless number is specified
    if model_id is None:
        abort(400, 'missing model id')

    # Retrieves the model file from the request files
    model_file = files_obj['model']

    # print (files_obj)

    if model_file is not None:

        print('have model file, type = ', type(model_file))

        # print('is_categorical', is_categorical)
        model = pickle.loads(model_file.read())

        if not hasattr(model, "is_binary"):
            print('model.is_binary is none, setting to false')
            model.is_binary = False

        # Stores model under provided number
        models[model_id] = model

        print('After init model_description =', model.get_model_description())
        return model.get_model_description(), 201

    else:
        # Can't store a model if none provided
        abort(400, 'missing model bytes')


@app.get(pc.URL_LOCAL_FILE_API)
def get_file():

    # Validate and parse query params
    type_id_str = request.args.get("type_id")
    model_id_str = request.args.get("model_id")

    if not type_id_str or not model_id_str:
        return jsonify(error="Missing required query params: typeId and modelId"), 400

    try:
        type_id = int(type_id_str)
        model_id = int(model_id_str)
    except ValueError:
        return jsonify(error="typeId and modelId must be integers"), 400

    # Open a session and fetch file data
    try:
        raw_bytes, file_name, mime_type = gmf.fetch_model_file(model_id=model_id, type_id=type_id)

    except FileNotFoundError as e:
        return jsonify(error=str(e)), 404
    except ValueError as e:
        return jsonify(error=str(e)), 400
    except Exception as e:
        return jsonify(error=f"Database error: {e}"), 500

    # New rule: if typeId == 2 then download, else inline
    as_attachment = (type_id == 2)

    # Stream the file
    bio = io.BytesIO(raw_bytes)
    bio.seek(0)
    return send_file(
        bio,
        mimetype=mime_type,
        download_name=str(file_name),
        as_attachment=as_attachment,
        max_age=0,
        etag=False,
        conditional=False,
    )


@app.route('/api/predictor_models/<string:model_id>', methods=['GET'])
def details(model_id):
    """Returns a detailed description of the QSAR model with version and parameter information (also inits the model if needed)"""

    mi = ModelInitializer()
    model = mi.init_model(model_id)

    # 404 NOT FOUND if no model stored under provided number
    if model is None:
        abort(404, 'no stored model with id ' + model_id)

    # Retrieves details from specified model
    model_details = get_model_details(model)
    if model_details is None:
        # 404 NOT FOUND if model has no detail information
        abort(404, 'no details for stored model with id ' + model_id)

    # Return description and 200 OK
    return model_details, 200


@app.route('/api/predictor_models/models', methods=['GET'])
def available_models():
    """Returns a detailed description of the QSAR model with version and parameter information"""

    # model = mwu.models[model_id]
    mi = ModelInitializer()
    models = mi.get_available_models()

    # Return description and 200 OK
    return models, 200


@app.route('/api/predictor_models/reg_coeff/<string:model_id>', methods=['GET'])
def model_coeffs(model_id):
    """Returns a detailed description of the QSAR model with version and parameter information"""

    mi = ModelInitializer()
    model = mi.init_model(model_id)

    # 404 NOT FOUND if no model stored under provided number
    if model is None:
        abort(404, 'no stored model with id ' + model_id)

    if hasattr(model, 'getOriginalRegressionCoefficients') and callable(getattr(model, 'getOriginalRegressionCoefficients')):
        coeff_dict = model.getOriginalRegressionCoefficients()
        return coeff_dict, 200
    else:
        return "Cant return coefficients for " + model.qsar_method


@app.route('/api/predictor_models/<string:model_id>/object', methods=['GET'])
def model_obj(model_id):
    """Returns model object"""

    model = models[model_id]

    # 404 NOT FOUND if no model stored under provided number
    if model is None:
        abort(404, 'no stored model with id ' + model_id)

    if model.model_obj is None:
        # 404 NOT FOUND if model has no detail information
        abort(404, 'no model object for id ' + model_id)

    # Return model_obj
    return model.model_obj, 200


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5004, debug=True)