From c0fb207b8bd23abd097c7b68b5cbc40e38a47808 Mon Sep 17 00:00:00 2001
From: shirazos7 <shiraz@zbmath.org>
Date: Tue, 14 Jan 2025 01:17:16 +0100
Subject: [PATCH 1/3] extracting the years of references

---
 src/zbmath_rest2oai/getAsXml.py | 83 +++++++++++++++++++++++----------
 1 file changed, 58 insertions(+), 25 deletions(-)

diff --git a/src/zbmath_rest2oai/getAsXml.py b/src/zbmath_rest2oai/getAsXml.py
index 21148790..083cfd5d 100644
--- a/src/zbmath_rest2oai/getAsXml.py
+++ b/src/zbmath_rest2oai/getAsXml.py
@@ -1,6 +1,6 @@
 import re
 import sys
-
+import os
 import requests
 from dict2xml import Converter
 
@@ -47,7 +47,8 @@ def apply_zbmath_api_fixes(result, prefix):
                                replace('0001-01-01T00:00:00Z', '0001-01-01T00:00:00'))
 
     if result.get('id'):
-        result['id'] = prefix + str(result['id'])
+        # Remove the prefix from the ID
+        result['id'] = str(result['id']).replace(prefix, "")
     old_states = result.get('states')
     if old_states is None:
         return
@@ -77,32 +78,45 @@ def extract_tags(result):
 def add_references_to_software(api_uri, dict_res):
     list_articles_ids_to_soft = []
     list_articles_ids_and_alter_ids_to_soft = []
+    list_articles_ids_and_years = []
     if "software" in api_uri:
-        if api_uri.startswith("https://api.zbmath.org/v1/software/_all?start_after=")==False:
-            soft_id=api_uri.split("/")[-1]
+        if not (api_uri.
+                startswith("https://api.zbmath.org/v1/software/_all?start_after=")):
+            soft_id = api_uri.split("/")[-1]
+
             def api_doc_endpoint(page):
-                return requests.get("https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format(page,soft_id))
-            page=0
+                return requests.get(
+                    "https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format(
+                        page, soft_id
+                    )
+                )
+
+            page = 0
             while True:
                 data = api_doc_endpoint(page).json()
                 if data is None or "result" not in data or not data["result"]:
                     break
-
-                list_ids=[]
-                list_ids_and_alter = []
                 for entry in data["result"]:
-                    list_ids.append(entry["id"])
+                    list_ids = []
                     list_links = []
+                    year = "Unknown"
+
+                    if "year" in entry:
+                        year = entry["year"]
+
+                    list_ids.append(entry["id"])
                     for alt_dic in entry["links"]:
                         if alt_dic["type"] == "doi":
                             list_links.append(alt_dic["identifier"])
                         elif alt_dic["type"] == "arxiv":
                             list_links.append(alt_dic["identifier"])
 
-                    list_ids_and_alter.append(";".join([str(entry["id"])]+list_links))
+                    list_articles_ids_and_years.append(year)
 
-                list_articles_ids_to_soft.extend(list_ids)
-                list_articles_ids_and_alter_ids_to_soft.extend(list_ids_and_alter)
+                    list_articles_ids_to_soft.append(entry["id"])
+                    list_articles_ids_and_alter_ids_to_soft.append(
+                        ";".join([str(entry["id"])] + list_links)
+                    )
 
                 page+=1
 
@@ -110,10 +124,16 @@ def api_doc_endpoint(page):
             dict_res["references"] = list_articles_ids_to_soft
             # Wrap it in a list to make it iterable for your existing loop
             dict_res["references_alt"] = list_articles_ids_and_alter_ids_to_soft
+            dict_res["references_year_alt"] = list_articles_ids_and_years
             dict_res = [dict_res]
 
     return dict_res
-def final_xml2(api_source, prefix):
+def save_xml_to_file(xml_content, file_path):
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    with open(file_path, "w", encoding="utf-8") as file:
+        file.write(xml_content)
+
+def final_xml2(api_source, prefix ,output_file_path=None):
     headers = {'Accept': 'application/json'}
     r = requests.get(api_source, headers=headers, timeout=(10, 60))
     if r.status_code == 404:
@@ -137,17 +157,26 @@ def final_xml2(api_source, prefix):
             result = result[0]
             apply_zbmath_api_fixes(result, prefix)
             identifier = result["id"]
-            dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build(
-                result,
-                closed_tags_for=[[], '', [None], None]))
-            tags[identifier] = extract_tags(result)
-        elif isinstance(result, dict):  
-            apply_zbmath_api_fixes(result, prefix)
-            identifier = result["id"]
-            dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build(
-                result,
-                closed_tags_for=[[], '', [None], None]))
-            tags[identifier] = extract_tags(result)
+        if "references_alt" in result:
+            result["references_alt"] = result["references_alt"]
+        if "references_year_alt" in result:
+            result["references_year_alt"] = result["references_year_alt"]
+
+        xml_converter = Converter(wrap="root")
+        xml_output = _illegal_xml_chars_RE.sub("",
+                                               xml_converter.build(result, closed_tags_for=[[], '', [None], None]))
+
+        if identifier not in dict_math_entities:
+            dict_math_entities[identifier] = xml_output
+        else:
+            raise Exception(f"Duplicate identifier detected: {identifier}")
+
+        tags[identifier] = extract_tags(result)
+
+        if output_file_path:
+            combined_xml_content = "\n".join(dict_math_entities.values())
+            save_xml_to_file(combined_xml_content, output_file_path)
+
     return [dict_math_entities, r.elapsed.total_seconds(), tags]
 
 
@@ -156,4 +185,8 @@ def final_xml2(api_source, prefix):
         prefix="oai:zbmath.org:"
     else:
         prefix="oai:swmath.org:"
+        output_file_path = (os.path
+                            .join('../../test/data/software/plain_with_references.xml'))
+
+        result = final_xml2(sys.argv[1], prefix, output_file_path)
     print(final_xml2(sys.argv[1], prefix))

From 030851bed61b9734a3185f22e3602f08dddf794e Mon Sep 17 00:00:00 2001
From: shirazos7 <shiraz@zbmath.org>
Date: Tue, 14 Jan 2025 12:20:31 +0100
Subject: [PATCH 2/3] extract the years of references from the datestamp field

---
 src/zbmath_rest2oai/getAsXml.py | 90 ++++++++++++---------------------
 1 file changed, 33 insertions(+), 57 deletions(-)

diff --git a/src/zbmath_rest2oai/getAsXml.py b/src/zbmath_rest2oai/getAsXml.py
index 083cfd5d..50555320 100644
--- a/src/zbmath_rest2oai/getAsXml.py
+++ b/src/zbmath_rest2oai/getAsXml.py
@@ -1,6 +1,6 @@
 import re
 import sys
-import os
+
 import requests
 from dict2xml import Converter
 
@@ -47,8 +47,7 @@ def apply_zbmath_api_fixes(result, prefix):
                                replace('0001-01-01T00:00:00Z', '0001-01-01T00:00:00'))
 
     if result.get('id'):
-        # Remove the prefix from the ID
-        result['id'] = str(result['id']).replace(prefix, "")
+        result['id'] = prefix + str(result['id'])
     old_states = result.get('states')
     if old_states is None:
         return
@@ -78,62 +77,52 @@ def extract_tags(result):
 def add_references_to_software(api_uri, dict_res):
     list_articles_ids_to_soft = []
     list_articles_ids_and_alter_ids_to_soft = []
-    list_articles_ids_and_years = []
+    list_references_year_alt = []  # New list to store years
+
     if "software" in api_uri:
-        if not (api_uri.
-                startswith("https://api.zbmath.org/v1/software/_all?start_after=")):
+        if api_uri.startswith("https://api.zbmath.org/v1/software/_all?start_after=") == False:
             soft_id = api_uri.split("/")[-1]
 
             def api_doc_endpoint(page):
-                return requests.get(
-                    "https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format(
-                        page, soft_id
-                    )
-                )
+                return requests.get("https://api.zbmath.org/v1/document/_structured_search?page={}&results_per_page=100&software%20id={}".format(page, soft_id))
 
             page = 0
             while True:
                 data = api_doc_endpoint(page).json()
                 if data is None or "result" not in data or not data["result"]:
                     break
-                for entry in data["result"]:
-                    list_ids = []
-                    list_links = []
-                    year = "Unknown"
-
-                    if "year" in entry:
-                        year = entry["year"]
 
+                list_ids = []
+                list_ids_and_alter = []
+                for entry in data["result"]:
                     list_ids.append(entry["id"])
+                    list_links = []
                     for alt_dic in entry["links"]:
                         if alt_dic["type"] == "doi":
                             list_links.append(alt_dic["identifier"])
                         elif alt_dic["type"] == "arxiv":
                             list_links.append(alt_dic["identifier"])
 
-                    list_articles_ids_and_years.append(year)
+                    list_ids_and_alter.append(";".join([str(entry["id"])] + list_links))
 
-                    list_articles_ids_to_soft.append(entry["id"])
-                    list_articles_ids_and_alter_ids_to_soft.append(
-                        ";".join([str(entry["id"])] + list_links)
-                    )
+                    # Extract the year from the datestamp
+                    if "datestamp" in entry:
+                        year = entry["datestamp"][:4]  # Extract the first 4 characters (year)
+                        list_references_year_alt.append(year)
 
-                page+=1
+                list_articles_ids_to_soft.extend(list_ids)
+                list_articles_ids_and_alter_ids_to_soft.extend(list_ids_and_alter)
+
+                page += 1
 
         if isinstance(dict_res, dict):
             dict_res["references"] = list_articles_ids_to_soft
-            # Wrap it in a list to make it iterable for your existing loop
             dict_res["references_alt"] = list_articles_ids_and_alter_ids_to_soft
-            dict_res["references_year_alt"] = list_articles_ids_and_years
+            dict_res["references_year_alt"] = list_references_year_alt  # Add the years to the dict
             dict_res = [dict_res]
 
     return dict_res
-def save_xml_to_file(xml_content, file_path):
-    os.makedirs(os.path.dirname(file_path), exist_ok=True)
-    with open(file_path, "w", encoding="utf-8") as file:
-        file.write(xml_content)
-
-def final_xml2(api_source, prefix ,output_file_path=None):
+def final_xml2(api_source, prefix):
     headers = {'Accept': 'application/json'}
     r = requests.get(api_source, headers=headers, timeout=(10, 60))
     if r.status_code == 404:
@@ -157,26 +146,17 @@ def final_xml2(api_source, prefix ,output_file_path=None):
             result = result[0]
             apply_zbmath_api_fixes(result, prefix)
             identifier = result["id"]
-        if "references_alt" in result:
-            result["references_alt"] = result["references_alt"]
-        if "references_year_alt" in result:
-            result["references_year_alt"] = result["references_year_alt"]
-
-        xml_converter = Converter(wrap="root")
-        xml_output = _illegal_xml_chars_RE.sub("",
-                                               xml_converter.build(result, closed_tags_for=[[], '', [None], None]))
-
-        if identifier not in dict_math_entities:
-            dict_math_entities[identifier] = xml_output
-        else:
-            raise Exception(f"Duplicate identifier detected: {identifier}")
-
-        tags[identifier] = extract_tags(result)
-
-        if output_file_path:
-            combined_xml_content = "\n".join(dict_math_entities.values())
-            save_xml_to_file(combined_xml_content, output_file_path)
-
+            dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build(
+                result,
+                closed_tags_for=[[], '', [None], None]))
+            tags[identifier] = extract_tags(result)
+        elif isinstance(result, dict):
+            apply_zbmath_api_fixes(result, prefix)
+            identifier = result["id"]
+            dict_math_entities[identifier] = _illegal_xml_chars_RE.sub("", Converter(wrap="root").build(
+                result,
+                closed_tags_for=[[], '', [None], None]))
+            tags[identifier] = extract_tags(result)
     return [dict_math_entities, r.elapsed.total_seconds(), tags]
 
 
@@ -185,8 +165,4 @@ def final_xml2(api_source, prefix ,output_file_path=None):
         prefix="oai:zbmath.org:"
     else:
         prefix="oai:swmath.org:"
-        output_file_path = (os.path
-                            .join('../../test/data/software/plain_with_references.xml'))
-
-        result = final_xml2(sys.argv[1], prefix, output_file_path)
-    print(final_xml2(sys.argv[1], prefix))
+    print(final_xml2(sys.argv[1], prefix))
\ No newline at end of file

From 4109b6aaeb8b95eda02f5ea1eb81093d2bed9060 Mon Sep 17 00:00:00 2001
From: shirazos7 <shiraz@zbmath.org>
Date: Tue, 14 Jan 2025 12:25:41 +0100
Subject: [PATCH 3/3] adding the year of references

---
 test/data/software/plain_with_references.xml | 111 +++++++++++++++++++
 1 file changed, 111 insertions(+)

diff --git a/test/data/software/plain_with_references.xml b/test/data/software/plain_with_references.xml
index df1ff406..0964d626 100644
--- a/test/data/software/plain_with_references.xml
+++ b/test/data/software/plain_with_references.xml
@@ -250,6 +250,117 @@
   <references_alt>6666873;10.1016/j.jcp.2016.06.039</references_alt>
   <references_alt>5538352;10.1007/978-3-540-71992-2_17</references_alt>
   <references_alt>2234457</references_alt>
+  <references_year_alt>2005</references_year_alt>
+  <references_year_alt>2012</references_year_alt>
+  <references_year_alt>2009</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2005</references_year_alt>
+  <references_year_alt>2015</references_year_alt>
+  <references_year_alt>2015</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2017</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2013</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2017</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2009</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2015</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2014</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2015</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2015</references_year_alt>
+  <references_year_alt>2014</references_year_alt>
+  <references_year_alt>2013</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2012</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2020</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2022</references_year_alt>
+  <references_year_alt>2008</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2014</references_year_alt>
+  <references_year_alt>2009</references_year_alt>
+  <references_year_alt>2007</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2022</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2022</references_year_alt>
+  <references_year_alt>2022</references_year_alt>
+  <references_year_alt>2022</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2014</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2022</references_year_alt>
+  <references_year_alt>2010</references_year_alt>
+  <references_year_alt>2020</references_year_alt>
+  <references_year_alt>2023</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2011</references_year_alt>
+  <references_year_alt>2020</references_year_alt>
+  <references_year_alt>2020</references_year_alt>
+  <references_year_alt>2019</references_year_alt>
+  <references_year_alt>2020</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2021</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2024</references_year_alt>
+  <references_year_alt>2018</references_year_alt>
+  <references_year_alt>2016</references_year_alt>
+  <references_year_alt>2009</references_year_alt>
+  <references_year_alt>2005</references_year_alt>
   <related_software>
     <id>4013</id>
     <name>MUMPS</name>