From c0fb207b8bd23abd097c7b68b5cbc40e38a47808 Mon Sep 17 00:00:00 2001 From: shirazos7 Date: Tue, 14 Jan 2025 01:17:16 +0100 Subject: [PATCH 1/3] extracting the years of references --- src/zbmath_rest2oai/getAsXml.py | 83 +++++++++++++++++++++++---------- 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/src/zbmath_rest2oai/getAsXml.py b/src/zbmath_rest2oai/getAsXml.py index 21148790..083cfd5d 100644 --- a/src/zbmath_rest2oai/getAsXml.py +++ b/src/zbmath_rest2oai/getAsXml.py @@ -1,6 +1,6 @@ import re import sys - +import os import requests from dict2xml import Converter @@ -47,7 +47,8 @@ def apply_zbmath_api_fixes(result, prefix): replace('0001-01-01T00:00:00Z', '0001-01-01T00:00:00')) if result.get('id'): - result['id'] = prefix + str(result['id']) + # Remove the prefix from the ID + result['id'] = str(result['id']).replace(prefix, "") old_states = result.get('states') if old_states is None: return @@ -77,32 +78,45 @@ def extract_tags(result): def add_references_to_software(api_uri, dict_res): list_articles_ids_to_soft = [] list_articles_ids_and_alter_ids_to_soft = [] + list_articles_ids_and_years = [] if "software" in api_uri: - if api_uri.startswith("https://api.zbmath.org/v1/software/_all?start_after=")==False: - soft_id=api_uri.split("/")[-1] + if not (api_uri. + startswith("https://api.zbmath.org/v1/software/_all?start_after=")): + soft_id = api_uri.split("/")[-1] + def api_doc_endpoint(page): - return requests.get("https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format(page,soft_id)) - page=0 + return requests.get( + "https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format( + page, soft_id + ) + ) + + page = 0 while True: data = api_doc_endpoint(page).json() if data is None or "result" not in data or not data["result"]: break - - list_ids=[] - list_ids_and_alter = [] for entry in data["result"]: - list_ids.append(entry["id"]) + list_ids = [] list_links = [] + year = "Unknown" + + if "year" in entry: + year = entry["year"] + + list_ids.append(entry["id"]) for alt_dic in entry["links"]: if alt_dic["type"] == "doi": list_links.append(alt_dic["identifier"]) elif alt_dic["type"] == "arxiv": list_links.append(alt_dic["identifier"]) - list_ids_and_alter.append(";".join([str(entry["id"])]+list_links)) + list_articles_ids_and_years.append(year) - list_articles_ids_to_soft.extend(list_ids) - list_articles_ids_and_alter_ids_to_soft.extend(list_ids_and_alter) + list_articles_ids_to_soft.append(entry["id"]) + list_articles_ids_and_alter_ids_to_soft.append( + ";".join([str(entry["id"])] + list_links) + ) page+=1 @@ -110,10 +124,16 @@ def api_doc_endpoint(page): dict_res["references"] = list_articles_ids_to_soft # Wrap it in a list to make it iterable for your existing loop dict_res["references_alt"] = list_articles_ids_and_alter_ids_to_soft + dict_res["references_year_alt"] = list_articles_ids_and_years dict_res = [dict_res] return dict_res -def final_xml2(api_source, prefix): +def save_xml_to_file(xml_content, file_path): + os.makedirs(os.path.dirname(file_path), exist_ok=True) + with open(file_path, "w", encoding="utf-8") as file: + file.write(xml_content) + +def final_xml2(api_source, prefix ,output_file_path=None): headers = {'Accept': 'application/json'} r = requests.get(api_source, headers=headers, timeout=(10, 60)) if r.status_code == 404: @@ -137,17 +157,26 @@ def final_xml2(api_source, prefix): result = result[0] apply_zbmath_api_fixes(result, prefix) identifier = result["id"] - dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build( - result, - closed_tags_for=[[], '', [None], None])) - tags[identifier] = extract_tags(result) - elif isinstance(result, dict): - apply_zbmath_api_fixes(result, prefix) - identifier = result["id"] - dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build( - result, - closed_tags_for=[[], '', [None], None])) - tags[identifier] = extract_tags(result) + if "references_alt" in result: + result["references_alt"] = result["references_alt"] + if "references_year_alt" in result: + result["references_year_alt"] = result["references_year_alt"] + + xml_converter = Converter(wrap="root") + xml_output = _illegal_xml_chars_RE.sub("", + xml_converter.build(result, closed_tags_for=[[], '', [None], None])) + + if identifier not in dict_math_entities: + dict_math_entities[identifier] = xml_output + else: + raise Exception(f"Duplicate identifier detected: {identifier}") + + tags[identifier] = extract_tags(result) + + if output_file_path: + combined_xml_content = "\n".join(dict_math_entities.values()) + save_xml_to_file(combined_xml_content, output_file_path) + return [dict_math_entities, r.elapsed.total_seconds(), tags] @@ -156,4 +185,8 @@ def final_xml2(api_source, prefix): prefix="oai:zbmath.org:" else: prefix="oai:swmath.org:" + output_file_path = (os.path + .join('../../test/data/software/plain_with_references.xml')) + + result = final_xml2(sys.argv[1], prefix, output_file_path) print(final_xml2(sys.argv[1], prefix)) From 030851bed61b9734a3185f22e3602f08dddf794e Mon Sep 17 00:00:00 2001 From: shirazos7 Date: Tue, 14 Jan 2025 12:20:31 +0100 Subject: [PATCH 2/3] extracting the years of references --- src/zbmath_rest2oai/getAsXml.py | 90 ++++++++++++--------------------- 1 file changed, 33 insertions(+), 57 deletions(-) diff --git a/src/zbmath_rest2oai/getAsXml.py b/src/zbmath_rest2oai/getAsXml.py index 083cfd5d..50555320 100644 --- a/src/zbmath_rest2oai/getAsXml.py +++ b/src/zbmath_rest2oai/getAsXml.py @@ -1,6 +1,6 @@ import re import sys -import os + import requests from dict2xml import Converter @@ -47,8 +47,7 @@ def apply_zbmath_api_fixes(result, prefix): replace('0001-01-01T00:00:00Z', '0001-01-01T00:00:00')) if result.get('id'): - # Remove the prefix from the ID - result['id'] = str(result['id']).replace(prefix, "") + result['id'] = prefix + str(result['id']) old_states = result.get('states') if old_states is None: return @@ -78,62 +77,52 @@ def extract_tags(result): def add_references_to_software(api_uri, dict_res): list_articles_ids_to_soft = [] list_articles_ids_and_alter_ids_to_soft = [] - list_articles_ids_and_years = [] + list_references_year_alt = [] # New list to store years + if "software" in api_uri: - if not (api_uri. - startswith("https://api.zbmath.org/v1/software/_all?start_after=")): + if api_uri.startswith("https://api.zbmath.org/v1/software/_all?start_after=") == False: soft_id = api_uri.split("/")[-1] def api_doc_endpoint(page): - return requests.get( - "https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format( - page, soft_id - ) - ) + return requests.get("https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format(page, soft_id)) page = 0 while True: data = api_doc_endpoint(page).json() if data is None or "result" not in data or not data["result"]: break - for entry in data["result"]: - list_ids = [] - list_links = [] - year = "Unknown" - - if "year" in entry: - year = entry["year"] + list_ids = [] + list_ids_and_alter = [] + for entry in data["result"]: list_ids.append(entry["id"]) + list_links = [] for alt_dic in entry["links"]: if alt_dic["type"] == "doi": list_links.append(alt_dic["identifier"]) elif alt_dic["type"] == "arxiv": list_links.append(alt_dic["identifier"]) - list_articles_ids_and_years.append(year) + list_ids_and_alter.append(";".join([str(entry["id"])] + list_links)) - list_articles_ids_to_soft.append(entry["id"]) - list_articles_ids_and_alter_ids_to_soft.append( - ";".join([str(entry["id"])] + list_links) - ) + # Extract the year from the datestamp + if "datestamp" in entry: + year = entry["datestamp"][:4] # Extract the first 4 characters (year) + list_references_year_alt.append(year) - page+=1 + list_articles_ids_to_soft.extend(list_ids) + list_articles_ids_and_alter_ids_to_soft.extend(list_ids_and_alter) + + page += 1 if isinstance(dict_res, dict): dict_res["references"] = list_articles_ids_to_soft - # Wrap it in a list to make it iterable for your existing loop dict_res["references_alt"] = list_articles_ids_and_alter_ids_to_soft - dict_res["references_year_alt"] = list_articles_ids_and_years + dict_res["references_year_alt"] = list_references_year_alt # Add the years to the dict dict_res = [dict_res] return dict_res -def save_xml_to_file(xml_content, file_path): - os.makedirs(os.path.dirname(file_path), exist_ok=True) - with open(file_path, "w", encoding="utf-8") as file: - file.write(xml_content) - -def final_xml2(api_source, prefix ,output_file_path=None): +def final_xml2(api_source, prefix): headers = {'Accept': 'application/json'} r = requests.get(api_source, headers=headers, timeout=(10, 60)) if r.status_code == 404: @@ -157,26 +146,17 @@ def final_xml2(api_source, prefix ,output_file_path=None): result = result[0] apply_zbmath_api_fixes(result, prefix) identifier = result["id"] - if "references_alt" in result: - result["references_alt"] = result["references_alt"] - if "references_year_alt" in result: - result["references_year_alt"] = result["references_year_alt"] - - xml_converter = Converter(wrap="root") - xml_output = _illegal_xml_chars_RE.sub("", - xml_converter.build(result, closed_tags_for=[[], '', [None], None])) - - if identifier not in dict_math_entities: - dict_math_entities[identifier] = xml_output - else: - raise Exception(f"Duplicate identifier detected: {identifier}") - - tags[identifier] = extract_tags(result) - - if output_file_path: - combined_xml_content = "\n".join(dict_math_entities.values()) - save_xml_to_file(combined_xml_content, output_file_path) - + dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build( + result, + closed_tags_for=[[], '', [None], None])) + tags[identifier] = extract_tags(result) + elif isinstance(result, dict): + apply_zbmath_api_fixes(result, prefix) + identifier = result["id"] + dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build( + result, + closed_tags_for=[[], '', [None], None])) + tags[identifier] = extract_tags(result) return [dict_math_entities, r.elapsed.total_seconds(), tags] @@ -185,8 +165,4 @@ def final_xml2(api_source, prefix ,output_file_path=None): prefix="oai:zbmath.org:" else: prefix="oai:swmath.org:" - output_file_path = (os.path - .join('../../test/data/software/plain_with_references.xml')) - - result = final_xml2(sys.argv[1], prefix, output_file_path) - print(final_xml2(sys.argv[1], prefix)) + print(final_xml2(sys.argv[1], prefix)) \ No newline at end of file From 4109b6aaeb8b95eda02f5ea1eb81093d2bed9060 Mon Sep 17 00:00:00 2001 From: shirazos7 Date: Tue, 14 Jan 2025 12:25:41 +0100 Subject: [PATCH 3/3] adding the year of references --- test/data/software/plain_with_references.xml | 111 +++++++++++++++++++ 1 file changed, 111 insertions(+) diff --git a/test/data/software/plain_with_references.xml b/test/data/software/plain_with_references.xml index df1ff406..0964d626 100644 --- a/test/data/software/plain_with_references.xml +++ b/test/data/software/plain_with_references.xml @@ -250,6 +250,117 @@ 6666873;10.1016/j.jcp.2016.06.039 5538352;10.1007/978-3-540-71992-2_17 2234457 + 2005 + 2012 + 2009 + 2007 + 2023 + 2018 + 2007 + 2024 + 2008 + 2008 + 2005 + 2015 + 2015 + 2021 + 2008 + 2011 + 2010 + 2011 + 2007 + 2007 + 2008 + 2008 + 2017 + 2010 + 2019 + 2010 + 2019 + 2023 + 2007 + 2016 + 2011 + 2010 + 2013 + 2016 + 2017 + 2024 + 2019 + 2019 + 2023 + 2011 + 2008 + 2016 + 2023 + 2011 + 2010 + 2009 + 2019 + 2016 + 2015 + 2019 + 2018 + 2014 + 2019 + 2015 + 2018 + 2015 + 2014 + 2013 + 2007 + 2012 + 2021 + 2021 + 2020 + 2010 + 2022 + 2008 + 2007 + 2018 + 2014 + 2009 + 2007 + 2023 + 2011 + 2018 + 2011 + 2010 + 2024 + 2022 + 2010 + 2010 + 2021 + 2010 + 2022 + 2022 + 2022 + 2023 + 2011 + 2021 + 2021 + 2024 + 2014 + 2024 + 2022 + 2010 + 2020 + 2023 + 2016 + 2011 + 2020 + 2020 + 2019 + 2020 + 2024 + 2018 + 2021 + 2016 + 2024 + 2018 + 2016 + 2009 + 2005 4013 MUMPS