SBMLShowcase/BioModels/parse_biomodels.py at master · OpenSourceBrain/SBMLShowcase · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
#!/usr/bin/env python3

import os
import pickle
import re
import sys
import urllib

import matplotlib
import pyneuroml.sbml  # for validate_sbml_files
import pyneuroml.sedml  # for validate_sedml_files

sys.path.append("..")
import utils

md_description = """
Download and run validation tests on all the curated models from BioModels https://www.ebi.ac.uk/biomodels.
The final step is to run the model in tellurium,
only models specified in SBML with a matching SEDML file are run in tellurium.
Errors or validation failures are reported at each step.
Outputs to the Markdown Table below.

'valid-sbml-units' enforces strict unit checking, 'broken-ref' indicates that the SEDML file contained
a broken source='model.xml' reference which was corrected to the name of the model's provided SBML file.
"""

matplotlib.use("agg")  # prevent matplotlib from trying to open a window
API_URL: str = "https://www.ebi.ac.uk/biomodels"

out_format = "json"
max_count = 0  # 0 for unlimited

# local temporary storage of the model files
# this is independent of caching, and still happens when caching is turned off
# this allows the model to be executed and the files manually examined etc
tmp_dir = "tmplocalfiles"

# suppress stdout/err output from validation functions to make progress counter readable
suppress_stdout = True
suppress_stderr = True

# whether to replace "model.xml" in the sedml file with the name of the actual sbml file
fix_broken_ref = True

# skip tests that cause the script to be killed due to lack of RAM
# needs at least 8GB
skip = {}


def download_file(model_id, filename, output_file, cache):
    """
    request the given file and save it to disk
    """

    qfilename = urllib.parse.quote_plus(filename)

    response = cache.do_request(
        f"{API_URL}/model/download/{model_id}?filename={qfilename}"
    ).content

    with open(output_file, "wb") as fout:
        fout.write(response)


def replace_model_xml(sedml_path, sbml_filename):
    """
    if the SEDML refers to a generic "model.xml" file
    and the SBML file is not called this
    replace the SEDML reference with the actual SBML filename

    method used assumes 'source="model.xml"' will only
    occur in the SBML file reference
    which was true at time of testing on current BioModels release

    returns True if the SBML reference already seemed valid
    """

    if sbml_filename == "model.xml":
        return True

    with open(sedml_path, encoding="utf-8") as f:
        data = f.read()

    if 'source="model.xml"' not in data:
        return True

    data = data.replace('source="model.xml"', f'source="{sbml_filename}"')

    with open(f"{sedml_path}", "w", encoding="utf-8") as fout:
        fout.write(data)

    return False


def validate_sbml_file(model_id, mtab, info, cache, sup):
    """
    tasks relating to validating the SBML file
    return None to indicate aborting any further tests on this model
    otherwise return the SBML filename
    """

    # handle only single SBML files
    if not info["format"]["name"] == "SBML":
        mtab["valid_sbml"] = [
            "NonSBML",
            f"{info['format']['name']}:{info['files']['main']}",
        ]
        return None

    if len(info["files"]["main"]) > 1:
        mtab["valid_sbml"] = ["MultipleSBMLs", f"{info['files']['main']}"]
        return None

    if len(info["files"]["main"]) < 1:
        mtab["valid_sbml"] = ["NoSBMLs", f"{info['files']['main']}"]
        return None

    # download the sbml file
    sbml_file = info["files"]["main"][0]["name"]
    try:
        download_file(model_id, sbml_file, sbml_file, cache)
    except Exception as e:
        mtab["valid_sbml"] = ["DownloadFail", f"{sbml_file} {e}"]
        return None

    # validate the sbml file
    sup.suppress()  # suppress validation warning/error messages
    valid_sbml = pyneuroml.sbml.validate_sbml_files([sbml_file], strict_units=False)
    valid_sbml_units = pyneuroml.sbml.validate_sbml_files(
        [sbml_file], strict_units=True
    )
    sup.restore()

    mtab["valid_sbml"] = [
        "pass" if valid_sbml else "FAIL",
        f"[{sbml_file}]({API_URL}/{model_id}#Files)",
    ]
    mtab["valid_sbml_units"] = "pass" if valid_sbml_units else "FAIL"

    return sbml_file


def validate_sedml_file(model_id, mtab, info, cache, sup, sbml_file):
    """
    tasks relating to validating the SEDML file
    return None to indicate aborting any further tests on this model
    otherwise return the SEDML filename
    """

    # must have a SEDML file as well in order to be executed
    if "additional" not in info["files"]:
        mtab["valid_sedml"] = "NoSEDML"
        return None

    sedml_file = []
    for file_info in info["files"]["additional"]:
        pattern = "SED[-]?ML"
        target = f"{file_info['name']}|{file_info['description']}".upper()
        if re.search(pattern, target):
            sedml_file.append(file_info["name"])

    # require exactly one SEDML file
    if len(sedml_file) == 0:
        mtab["valid_sedml"] = "NoSEDML"
        return None

    if len(sedml_file) > 1:
        mtab["valid_sedml"] = ["MultipleSEDMLs", f"{sedml_file}"]
        return None

    # download sedml file
    sedml_file = sedml_file[0]
    try:
        download_file(model_id, sedml_file, sedml_file, cache)
    except Exception as e:
        mtab["valid_sedml"] = ["DownloadFail", f"{e}"]
        return None

    # if the sedml file contains a generic 'source="model.xml"' replace it with the sbml filename
    if fix_broken_ref:
        broken_ref = replace_model_xml(sedml_file, sbml_file)
        mtab["broken_ref"] = "pass" if broken_ref else "FAIL"
    else:
        mtab["broken_ref"] = "NA"

    sup.suppress()
    valid_sedml = pyneuroml.sedml.validate_sedml_files([sedml_file])
    sup.restore()
    mtab["valid_sedml"] = [
        "pass" if valid_sedml else "FAIL",
        f"[{sedml_file}]({API_URL}/{model_id}#Files)",
    ]

    return sedml_file


def main():
    """
    download the BioModel model files, run various validation steps
    report the results as a markdown table README file with a summary row at the top
    """

    # caching is used to prevent the need to download the same responses from the remote server multiple times during testing
    # mode="off" to disable caching, "store" to wipe and store fresh results, "reuse" to use the stored cache
    cache = utils.RequestCache(mode="auto", direc="cache")

    # accumulate results in columns defined by keys which correspond to the local variable names to be used below
    # to allow automated loading into the columns
    column_labels = "Model     |valid-sbml|valid-sbml-units|valid-sedml|broken-ref|tellurium|tellurium-remote|copasi-remote"
    column_keys = "model_desc|valid_sbml|valid_sbml_units|valid_sedml|broken_ref|tellurium_outcome|tellurium_remote_outcome|copasi_remote_outcome"
    mtab = utils.MarkdownTable(column_labels, column_keys)

    # allow stdout/stderr from validation tests to be suppressed to improve progress count visibility
    sup = utils.SuppressOutput(stdout=suppress_stdout, stderr=suppress_stderr)

    # get list of all available models
    model_ids = cache.do_request(
        f"{API_URL}/model/identifiers?format={out_format}"
    ).json()["models"]
    count = 0
    starting_dir = os.getcwd()

    for model_id in model_ids:
        pickle_name = f"{model_id}_mtab.p"
        pickle_path = os.path.join(starting_dir, tmp_dir, model_id, pickle_name)
        if os.path.exists(pickle_path) and use_pickles:
            print(f"\r{model_id} {count}/{len(model_ids)}       ", end="")
            print(f"loading {pickle_path}...")
            mtab_dict = pickle.load(open(pickle_path, "rb"))
            mtab.new_row()
            mtab = mtab_dict["mtab_row"]
            continue
        # allow testing on a small sample of models
        if max_count > 0 and count >= max_count:
            break
        count += 1
        print(f"\r{model_id} {count}/{len(model_ids)}       ", end="")

        # only process curated models
        # BIOMD ids should be the curated models
        if "BIOMD" not in model_id:
            continue

        # skip if on the list to be skipped
        if count in skip or model_id in skip:
            continue

        # from this point the model will create an output row even if not all tests are run
        mtab.new_row()  # append empty placeholder row
        info = cache.do_request(f"{API_URL}/{model_id}?format={out_format}").json()

        if len(info["name"]) > 36:
            model_summary = (
                f"[{model_id}]({API_URL}/{model_id})<br/><sup>{info['name'][:30]}</sup>"
            )
            model_details = f"<sup>{info['name']}</sup>"
            mtab["model_desc"] = mtab.make_fold(model_summary, model_details)
        else:
            mtab["model_desc"] = (
                f"[{model_id}]({API_URL}/{model_id})<br/><sup>{info['name']}</sup>"
            )

        # make temporary downloads of the sbml and sedml files
        model_dir = os.path.join(starting_dir, tmp_dir, model_id)
        os.makedirs(model_dir, exist_ok=True)
        os.chdir(model_dir)

        # sbml file validation tasks, includes downloading a local copy
        sbml_file = validate_sbml_file(model_id, mtab, info, cache, sup)
        if not sbml_file:
            continue  # no further tests possible

        sedml_file = validate_sedml_file(model_id, mtab, info, cache, sup, sbml_file)
        if not sedml_file:
            continue  # no further tests possible

        # run the validation functions on the sbml and sedml files
        print(f"\ntesting {sbml_file}...")
        sup.suppress()
        mtab["tellurium_outcome"] = utils.test_engine("tellurium", sedml_file)
        sup.restore()

        engine_keys = ["copasi", "tellurium"]
        test_folder = "tests"
        d1_plots_remote_dir = os.path.join(test_folder, "d1_plots_remote")
        results_remote = utils.run_biosimulators_remotely(
            engine_keys,
            sedml_file_name=sedml_file,
            sbml_file_name=sbml_file,
            d1_plots_remote_dir=d1_plots_remote_dir,
            test_folder=test_folder,
        )

        for e in engine_keys:
            # only if log_yml key is present
            if "log_yml" in results_remote[e]:
                results_remote_processed = utils.process_log_yml_dict(
                    results_remote[e]["log_yml"]
                )
            else:
                results_remote_processed = {
                    "status": "ERROR",
                    "error_message": "log_yml key not found",
                    "exception_type": "KeyError",
                }
            mtab_remote_outcome_key = f"{e}_remote_outcome"

            info_submission = f"Download: {results_remote[e]['download']}<br><br>Logs: {results_remote[e]['logs']}<br><br>View: {results_remote[e]['view']}<br><br>HTTP response: {str(results_remote[e]['response'])}"
            error_message_string = f'Error message: {results_remote_processed["error_message"]}<br><br>Exception type: {results_remote_processed["exception_type"]}'

            if results_remote_processed["error_message"] != "":
                info_submission = info_submission + f"<br><br>{error_message_string}"

            mtab[mtab_remote_outcome_key] = [
                results_remote_processed["status"],
                info_submission,
            ]

        # stop matplotlib plots from building up
        matplotlib.pyplot.close()

        mtab_dict = {"mtab_row": mtab, "results_remote": results_remote}
        pickle.dump(mtab_dict, open(pickle_name, "wb"))

    print()  # end progress counter, go to next line of stdout

    # show total cases processed
    mtab.add_summary("model_desc", f"n={mtab.n_rows()}")

    # count occurrences of each cell value, convert to final form
    for key in [
        "valid_sbml",
        "valid_sbml_units",
        "valid_sedml",
        "broken_ref",
        "tellurium_outcome",
        "tellurium_remote_outcome",
        "copasi_remote_outcome",
    ]:
        mtab.simple_summary(key)
        mtab.transform_column(key)

    # convert engine error messages to foldable readable form
    # calculate error category counts for summary row
    # mtab.process_engine_outcomes('tellurium','tellurium_outcome')

    # write out to file
    os.chdir(starting_dir)
    with open("README.md", "w", encoding="utf-8") as fout:
        fout.write(md_description)
        mtab.write(fout)


if __name__ == "__main__":
    use_pickles = True
    main()