From 328e60587f7e68c0d28a505af75376d341554117 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 21 Jan 2020 14:19:59 -0800 Subject: [PATCH 01/67] add configuration option to use a postgres backend instead of datacite API --- vendor/docker/env.conf | 5 + viringo/catalogs.py | 271 ++++++++++++++++++++++++++++++++++++++++- viringo/config.py | 10 ++ viringo/oai.py | 10 +- 4 files changed, 294 insertions(+), 2 deletions(-) diff --git a/vendor/docker/env.conf b/vendor/docker/env.conf index 201e3e9..b8c5644 100644 --- a/vendor/docker/env.conf +++ b/vendor/docker/env.conf @@ -7,3 +7,8 @@ env SENTRY_DSN; env API_ADMIN_USERNAME; env API_ADMIN_PASSWORD; env RESULT_SET_SIZE; +env CATALOG_SET; +env POSTGRES_SERVER; +env POSTGRES_DB; +env POSTGRES_USER; +env POSTGRES_PASSWORD; diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 11fc98e..b4dc22e 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -281,6 +281,276 @@ def build_metadata_map(self, result): return metadata + +class PostgresOAIServer(): + """Build OAI-PMH responses from a Postgres server""" + def identify(self): + """Construct common identification for the OAI service""" + + identify = common.Identify( + repositoryName=config.OAIPMH_REPOS_NAME, + baseURL=config.OAIPMH_BASE_URL, + protocolVersion="2.0", + adminEmails=[config.OAIPMH_ADMIN_EMAIL], + earliestDatestamp=datetime(2011, 1, 1), + deletedRecord='persistent', + granularity='YYYY-MM-DDThh:mm:ssZ', + compression=['gzip', 'deflate'], + toolkit_description=False) + + # Specify a custom description + datacite_desc = """ + + oai + oai.datacite.org + : + oai:oai.datacite.org:12425 + + """ + + identify.add_description(xml_string=datacite_desc) + + return identify + + def listMetadataFormats(self, identifier=None): + #pylint: disable=no-self-use,invalid-name + """Returns metadata formats available for the repository + + Identifier does nothing as our repository responds in all formats for all dois + """ + # PyOAI Expects result format (metadataPrefix, schema, metadataNamespace) + + format_oai_dc = ( + 'oai_dc', + 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', + 'http://www.openarchives.org/OAI/2.0/oai_dc/' + ) + + format_oai_datacite = ( + 'oai_datacite', + 'http://schema.datacite.org/oai/oai-1.1/oai.xsd', + 'http://schema.datacite.org/oai/oai-1.1/' + ) + + format_datacite = ( + 'datacite', + 'http://schema.datacite.org/meta/nonexistant/nonexistant.xsd', + 'http://datacite.org/schema/nonexistant' + ) + + return [format_oai_dc, format_oai_datacite, format_datacite] + + def getRecord(self, metadataPrefix, identifier): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for specific record""" + + # We just want the DOI out of the OAI identifier. 
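+        # e.g. an identifier of "doi:10.1234/abc" yields the doi "10.1234/abc";
+        # everything after the first ":" is treated as the DOI.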
+ _, doi = identifier.split(':', 1) + + result = datacite.get_metadata(doi) + if not result: + raise error.IdDoesNotExistError( + "\"%s\" is unknown or illegal in this repository" % identifier + ) + + # Build metadata based on requested format and result + metadata = self.build_metadata_map(result) + + header = self.build_header(result) + record = self.build_record(metadata) + data = ( + header, + record, + None # About string - not used + ) + + return data + + def listRecords( + self, + metadataPrefix=None, + from_=None, + until=None, + set=None, + paging_cursor=None + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of records""" + + # If available get the search query from the set param + search_query = set_to_search_query(set) + + # Get both a provider and client_id from the set + provider_id, client_id = set_to_provider_client(set) + results, total_records, paging_cursor = datacite.get_metadata_list( + query=search_query, + provider_id=provider_id, + client_id=client_id, + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor + ) + + records = [] + if results: + for result in results: + # Build metadata based on requested format and result + metadata = self.build_metadata_map(result) + + header = self.build_header(result) + record = self.build_record(metadata) + + data = ( + header, + record, + None # About string - not used + ) + + records.append(data) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_records, paging_cursor + + def listIdentifiers( + self, + metadataPrefix=None, + from_=None, + until=None, + set=None, + paging_cursor=None + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of identifiers""" + + # Get both a provider and client_id from the set + provider_id, client_id = set_to_provider_client(set) + + results, total_records, paging_cursor = datacite.get_metadata_list( + provider_id=provider_id, + client_id=client_id, + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor + ) + + records = [] + if results: + for result in results: + header = self.build_header(result) + + records.append(header) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_records, paging_cursor + + def listSets( + self, + paging_cursor=0 + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of sets""" + + # Note this implementation is not super efficient as we request + # the full set everytime regardles of actual paging + # The paging is handled just by offsetting the records returned. + # This is however acceptable given sets are a small subset of data. + + # We know we're always dealing with a integer value here + paging_cursor = int(paging_cursor) + + batch_size = 50 + next_batch = paging_cursor + batch_size + results, total_results = datacite.get_sets() + results = results[paging_cursor: next_batch] + + if len(results) < batch_size: + paging_cursor = None + else: + paging_cursor = next_batch + + records = [] + if results: + for identifier, name in results: + # Format of a set is setSpec, setName, setDescription + records.append((identifier.upper(), name, None)) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. 
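+        # Illustrative return value for a full first page (hypothetical set names):
+        #   ([("CISTI", "National Research Council", None), ...], 120, 50)
+        # The final page returns paging_cursor=None instead of the next offset.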
+ return records, total_results, paging_cursor + + def build_header(self, result): + """Construct a OAI-PMH record header""" + + # Provider symbol can just be extracted from the client symbol + provider_symbol, _ = result.client.split(".") + + return common.Header( + None, + 'doi:' + result.identifier, + result.updated_datetime, + setspec=[provider_symbol, result.client], + deleted=not result.active + ) + + def build_record(self, metadata): + """Construct a OAI-PMH payload for a record""" + + return common.Metadata( + None, + metadata + ) + + def build_metadata_map(self, result): + """Construct a metadata map object for oai metadata writing""" + dates = [] + if result.publication_year: + dates.append(str(result.publication_year)) + dates.extend([date['type'] + ": " + str(date['date']) for date in result.dates]) + + rights = [] + for right in result.rights: + if right['statement']: + rights.append(right['statement']) + if right['uri']: + rights.append(right['uri']) + + identifiers = [ + identifier_to_string(identifier) for identifier in result.identifiers + ] + + relations = [ + identifier_to_string(relation) + for relation in result.relations + ] + + contributors = [ + contributor.get('name') for contributor in result.contributors + ] + + metadata = { + 'title': result.titles, + 'creator': result.creators, + 'subject': result.subjects, + 'description': result.descriptions, + 'publisher': [result.publisher] if result.publisher else [], + 'contributor': contributors, + 'date': dates, + 'type': result.resource_types, + 'format': result.formats, + 'identifier': identifiers, + 'relation': relations, + 'language': [result.language] if result.language else [], + 'rights': rights, + 'xml': result.xml, + 'set': result.client, + 'metadata_version': result.metadata_version + } + + return metadata + + def set_to_search_query(unparsed_set): """Take a oai set and extract any base64url encoded search query""" @@ -294,7 +564,6 @@ def set_to_search_query(unparsed_set): return "" - def set_to_provider_client(unparsed_set): """Take a oai set and convert into provider_id and client_id""" diff --git a/viringo/config.py b/viringo/config.py index a65636f..d9d7665 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -18,3 +18,13 @@ OAIPMH_ADMIN_EMAIL = os.getenv('OAIPMH_ADMIN_EMAIL', 'support@datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) +# Source metadata catalog (DataCite or Postgres) +CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'Postgres') +# Postgres server +POSTGRES_SERVER = os.getenv('OAIPMH_POSTGRES_SERVER', '') +# Postgres db +POSTGRES_DB = os.getenv('OAIPMH_POSTGRES_DB', '') +# Postgres user +POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') +# Postgres password +POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') \ No newline at end of file diff --git a/viringo/oai.py b/viringo/oai.py index 41f3377..261835f 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -12,7 +12,9 @@ import oaipmh.datestamp from .catalogs import DataCiteOAIServer +from .catalogs import PostgresOAIServer from . import metadata +from . 
import config BP = Blueprint('oai', __name__) @@ -93,7 +95,13 @@ def handleVerb(self, verb, kw): def get_oai_server(): """Returns a pyoai server object that can process and return OAI requests""" if 'oai' not in g: - catalog_server = DataCiteOAIServer() + if config.CATALOG_SET == 'DateCite': + catalog_server = DataCiteOAIServer() + elif config.CATALOG_SET == 'Postgres': + catalog_server = PostgresOAIServer() + else: + print('No valid metadata catalog configured') + sys.exit(1) metadata_registry = oaipmh.metadata.MetadataRegistry() metadata_registry.registerWriter('oai_dc', metadata.oai_dc_writer) From 595ef3b226e6614713054ae622bb7fb1b1f16071 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 21 Jan 2020 15:05:34 -0800 Subject: [PATCH 02/67] add lookup functions from frdr harvester for postgres export --- vendor/docker/env.conf | 1 + viringo/catalogs.py | 16 ++-- viringo/config.py | 2 + viringo/services/postgres.py | 180 +++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 6 deletions(-) create mode 100644 viringo/services/postgres.py diff --git a/vendor/docker/env.conf b/vendor/docker/env.conf index b8c5644..24951ad 100644 --- a/vendor/docker/env.conf +++ b/vendor/docker/env.conf @@ -3,6 +3,7 @@ env OAIPMH_BASE_URL; env DATACITE_API_URL; env OAIPMH_REPOS_NAME; env OAIPMH_ADMIN_EMAIL; +env OAIPMH_IDENTIFIER; env SENTRY_DSN; env API_ADMIN_USERNAME; env API_ADMIN_PASSWORD; diff --git a/viringo/catalogs.py b/viringo/catalogs.py index b4dc22e..201b8ae 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -12,6 +12,7 @@ from viringo import config from .services import datacite +from .services import postgres class DataCiteOAIServer(): """Build OAI-PMH data responses for DataCite metadata catalog""" @@ -111,15 +112,18 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) + # From and until parameters aren't supported with Postgres # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = datacite.get_metadata_list( + results, total_records, paging_cursor = postgres.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, - from_datetime=from_, - until_datetime=until, - cursor=paging_cursor + cursor=paging_cursor, + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD ) records = [] @@ -302,9 +306,9 @@ def identify(self): datacite_desc = """ oai - oai.datacite.org + """ + config.OAIPMH_IDENTIFIER + """ : - oai:oai.datacite.org:12425 + oai""" + config.OAIPMH_IDENTIFIER + """:1 """ diff --git a/viringo/config.py b/viringo/config.py index d9d7665..c2ba38a 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -16,6 +16,8 @@ OAIPMH_BASE_URL = os.getenv('OAIPMH_BASE_URL', 'https://oai.datacite.org/oai') # Admin e-mail for the OAI-PMH service OAIPMH_ADMIN_EMAIL = os.getenv('OAIPMH_ADMIN_EMAIL', 'support@datacite.org') +# OAI repository identifier +OAIPMH_IDENTIFIER = os.getenv('OAIPMH_IDENTIFIER', 'oai.datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) # Source metadata catalog (DataCite or Postgres) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py new file mode 100644 index 0000000..093dd3e --- /dev/null +++ b/viringo/services/postgres.py @@ -0,0 +1,180 @@ +"""Handles DB queries for retrieving metadata""" + +import psycopg2 +import re + 
+from psycopg2.extras import DictCursor  # used below as cursor_factory for the lookup queries
+
+def construct_local_url(record): + # Check if the local_identifier has already been turned into a url + if "http" in record["local_identifier"].lower(): + return record["local_identifier"] + + # Check for OAI format of identifier (oai:domain:id) + oai_id = None + oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) + if oai_search: + oai_id = oai_search.group(2) + # TODO: determine if this is needed for all repos, or just SFU? + oai_id = oai_id.replace("_", ":") + + # If given a pattern then substitue in the item ID and return it + if "item_url_pattern" in record and record["item_url_pattern"]: + if oai_id: + local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) + else: + local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) + return local_url + + # Check if the identifier is a DOI + doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) + if doi: + doi = doi.group(0).rstrip('\.') + local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + return local_url + + # If the item has a source URL, use it + if ('source_url' in record) and record['source_url']: + return record['source_url'] + + # URL is in the identifier + local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", + record["local_identifier"]) + if local_url: + return local_url.group(0) + + local_url = None + return local_url + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + newdict.append(r) + return newdict + + +def get_metadata_list( + query=None, + provider_id=None, + client_id=None, + cursor=None, + server, + db, + user, + password + ): + + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( + db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, + repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp + FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + + records_cursor.execute(records_sql) + + # Need to see if we can somehow page the OAI response to the DB query so it only requests a fixed number at a time + for row in records_cursor: + record = (dict(zip( + ['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', + 'modified_timestamp', + 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', + 'last_crawl_timestamp'], row))) + record["deleted"] = int(record["deleted"]) + + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + continue + + if record["deleted"] == 1: + continue + + if (len(record['title']) == 0): + continue + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( + db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) + + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] + + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + else: + 
record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", + "frdr:geospatial_geometry": { + "frdr:geometry_type": coordinate[0], + "frdr:geometry_coordinates": [float(coordinate[1]), + float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", + "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", + "frdr:geometry_coordinates": polycoordinates}}) + + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id + WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", + (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id + WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id + WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", + (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id + WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id + WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id + WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute( + "SELECT description FROM descriptions WHERE record_id=? and language='en' "), + (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) + + lookup_cur.execute( + "SELECT description FROM descriptions WHERE record_id=? and language='fr' "), + (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id + WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id + WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id + WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) \ No newline at end of file From fe861ee84f9369d9638a9f8389f3c6c5f49b82f6 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 21 Jan 2020 15:53:23 -0800 Subject: [PATCH 03/67] cleanups, add other helper function stubs --- viringo/catalogs.py | 42 +++++++++++++++++++----------------- viringo/services/postgres.py | 18 +++++++++++++++- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 201b8ae..df8fa53 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -115,15 +115,13 @@ def listRecords( # From and until parameters aren't supported with Postgres # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = postgres.get_metadata_list( + results, total_records, paging_cursor = datacite.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, - cursor=paging_cursor, - server=config.POSTGRES_SERVER, - db=config.POSTGRES_DB, - user=config.POSTGRES_USER, - password=config.POSTGRES_PASSWORD + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor ) records = [] @@ -303,7 +301,7 @@ def identify(self): toolkit_description=False) # Specify a custom description - datacite_desc = """ + postgres_desc = """ oai """ + config.OAIPMH_IDENTIFIER + """ @@ -312,7 +310,7 @@ def identify(self): """ - identify.add_description(xml_string=datacite_desc) + identify.add_description(xml_string=postgres_desc) return identify @@ -348,10 +346,9 @@ def getRecord(self, metadataPrefix, identifier): #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for specific record""" - # We just want the DOI out of the OAI identifier. - _, doi = identifier.split(':', 1) + # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? 
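+        # Note: unlike the removed DataCite lookup, the full OAI identifier is handed to
+        # Postgres as-is; e.g. "doi:10.1234/abc" would be looked up verbatim rather than
+        # being split into the bare DOI "10.1234/abc" first.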
- result = datacite.get_metadata(doi) + result = postgres.get_metadata(identifier) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -386,13 +383,15 @@ def listRecords( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = datacite.get_metadata_list( + results, total_records, paging_cursor = postgres.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, - from_datetime=from_, - until_datetime=until, - cursor=paging_cursor + cursor=paging_cursor, + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD ) records = [] @@ -430,12 +429,15 @@ def listIdentifiers( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = datacite.get_metadata_list( + results, total_records, paging_cursor = postgres.get_metadata_list( + query=search_query, provider_id=provider_id, client_id=client_id, - from_datetime=from_, - until_datetime=until, - cursor=paging_cursor + cursor=paging_cursor, + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD ) records = [] @@ -466,7 +468,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = datacite.get_sets() + results, total_results = postgres.get_sets() results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 093dd3e..0cd3368 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -177,4 +177,20 @@ def get_metadata_list( lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) \ No newline at end of file + record["frdr:access"] = rows_to_dict(lookup_cur) + + +def get_metadata(identifier): + # Probably need to refactor some of get_metadata_list so it can be run on one record without duplication + return None + + +def get_sets(): + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( + db, user, password, server)) + with repos_con: + repos_cursor = repos_con.cursor() + + repos_cursor.execute("SELECT repository_name from repositories") + # Format this properly to return it like the DataCite response + return None \ No newline at end of file From 2bb3759067df3a4141c4257644a570f2bfaf7866 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Jan 2020 16:01:37 -0800 Subject: [PATCH 04/67] refactor returning single records, clarify todos, implement cursor --- viringo/catalogs.py | 2 +- viringo/services/postgres.py | 218 +++++++++++++++++------------------ 2 files changed, 109 insertions(+), 111 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index df8fa53..ce039aa 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -348,7 +348,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? 
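+        # db/user/password/server are not defined locally in this method; a minimal sketch
+        # using the existing settings (as listRecords already does) would be:
+        #   result = postgres.get_metadata(identifier, config.POSTGRES_DB,
+        #                                  config.POSTGRES_USER, config.POSTGRES_PASSWORD,
+        #                                  config.POSTGRES_SERVER)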
- result = postgres.get_metadata(identifier) + result = postgres.get_metadata(identifier, db, user, password, server) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 0cd3368..6885462 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -2,6 +2,7 @@ import psycopg2 import re +from viringo import config def construct_local_url(record): # Check if the local_identifier has already been turned into a url @@ -57,140 +58,137 @@ def rows_to_dict(cursor): return newdict -def get_metadata_list( - query=None, - provider_id=None, - client_id=None, - cursor=None, - server, - db, - user, - password - ): +def assemble_record(record, db, user, password, server): + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + return None + + if int(record["deleted"]) == 1: + return None - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( - db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() + if (len(record['title']) == 0): + return None - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, - repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp - FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) - records_cursor.execute(records_sql) + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] - # Need to see if we can somehow page the OAI response to the DB query so it only requests a fixed number at a time - for row in records_cursor: - record = (dict(zip( - ['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', - 'modified_timestamp', - 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', - 'last_crawl_timestamp'], row))) - record["deleted"] = int(record["deleted"]) - - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: - continue - - if record["deleted"] == 1: - continue - - if (len(record['title']) == 0): - continue - - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( - db, user, password, server)) - with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", - "frdr:geospatial_geometry": { - "frdr:geometry_type": coordinate[0], - "frdr:geometry_coordinates": [float(coordinate[1]), - float(coordinate[2])]}}) - except: - pass + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + 
else: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", - "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", - "frdr:geometry_coordinates": polycoordinates}}) + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) - with con: - lookup_cur = con.cursor(cursor_factory=DictCursor) + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) - # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id - WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", - (record["record_id"],)) - record["dc:contributor.author"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id - WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) - record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id - WHERE records_x_creators.record_id=? 
AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", - (record["record_id"],)) - record["dc:contributor"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id - WHERE records_x_subjects.record_id=?""", (record["record_id"],)) - record["dc:subject"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id - WHERE records_x_publishers.record_id=?""", (record["record_id"],)) - record["dc:publisher"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id - WHERE records_x_rights.record_id=?""", (record["record_id"],)) - record["dc:rights"] = rows_to_dict(lookup_cur) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) - lookup_cur.execute( - "SELECT description FROM descriptions WHERE record_id=? and language='en' "), - (record["record_id"],) - record["dc:description"] = rows_to_dict(lookup_cur) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute( - "SELECT description FROM descriptions WHERE record_id=? and language='fr' "), - (record["record_id"],) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id - WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) - record["frdr:tags"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id - WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id - WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) + return record -def get_metadata(identifier): - # Probably need to refactor some of get_metadata_list so it can be run on one record without duplication - return None +def build_metadata(full_record): + # TODO: construct object to match DataCite reponse and return it + + +def get_metadata_list( + query=None, + provider_id=None, + client_id=None, + cursor=None, + server, + db, + user, + password + ): + + # Trigger cursor navigation with a starting value + if not cursor: + cursor = 1 + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_cursor.execute(records_sql) + + record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] + for row in record_set: + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + if full_record is not None: + results.append(build_metadata(full_record)) + + cursor += config.RESULT_SET_SIZE + # TODO: Probably need to pass rowcount back to this function for Postgres output + return results, records_cursor.rowcount, cursor + + +def get_metadata(identifier, db, user, password, server): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + # TODO: record_id is kind of a meaningless identifier, support local identifier + source URL + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_cursor.execute(records_sql) + row = records_cursor.fetchone() + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + return build_metadata(full_record) def 
get_sets(): - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( - db, user, password, server)) + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) with repos_con: repos_cursor = repos_con.cursor() repos_cursor.execute("SELECT repository_name from repositories") - # Format this properly to return it like the DataCite response + sets = repos_cursor.fetchall() + # TODO: Format this properly to return it like the DataCite response return None \ No newline at end of file From dbc1637291037f3fec1d799eeb9de6cc9e1695fd Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Jan 2020 16:31:54 -0800 Subject: [PATCH 05/67] add preliminary result formatting for datacite --- viringo/services/postgres.py | 107 +++++++++++++++++++++++++++++++++-- 1 file changed, 102 insertions(+), 5 deletions(-) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 6885462..2e8d81c 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -2,8 +2,108 @@ import psycopg2 import re +from datetime import datetime +import dateutil.parser +import dateutil.tz from viringo import config +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + + +def build_metadata(data): + """Parse single postgres result into metadata object""" + result = Metadata() + + result.identifier = data['record_id'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. + created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = None + + # TODO: should I not be hardcoding this for datacite? 
add other fields based on current XML export + result.metadata_version = 4 + + result.titles = [data['title']] + result.creators = data['dc:contributor.author'] + result.subjects = data['dc:subject'] + result.descriptions = data['dc:description'] + result.publisher = data['dc:publisher'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = [] + result.sizes = [] + result.geo_locations = data['frdr:geospatial'] + result.resource_types = [] + result.formats = [] + result.identifiers = [] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = '' + result.active = True + + return result + + def construct_local_url(record): # Check if the local_identifier has already been turned into a url if "http" in record["local_identifier"].lower(): @@ -14,7 +114,6 @@ def construct_local_url(record): oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) if oai_search: oai_id = oai_search.group(2) - # TODO: determine if this is needed for all repos, or just SFU? oai_id = oai_id.replace("_", ":") # If given a pattern then substitue in the item ID and return it @@ -131,10 +230,6 @@ def assemble_record(record, db, user, password, server): return record -def build_metadata(full_record): - # TODO: construct object to match DataCite reponse and return it - - def get_metadata_list( query=None, provider_id=None, @@ -146,6 +241,8 @@ def get_metadata_list( password ): + # TODO: support listing by set + # Trigger cursor navigation with a starting value if not cursor: cursor = 1 From 9f08a03af0797dc25c37d3cf2f0adcc626bb5f99 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Wed, 29 Jan 2020 13:51:02 -0800 Subject: [PATCH 06/67] fix outstanding TODOs, let's test this --- viringo/services/postgres.py | 572 +++++++++++++++++------------------ 1 file changed, 281 insertions(+), 291 deletions(-) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 2e8d81c..258d77e 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -1,291 +1,281 @@ -"""Handles DB queries for retrieving metadata""" - -import psycopg2 -import re -from datetime import datetime -import dateutil.parser -import dateutil.tz -from viringo import config - -class Metadata: - """Represents a DataCite metadata resultset""" - def __init__( - self, - identifier=None, - created_datetime=None, - updated_datetime=None, - xml=None, - metadata_version=None, - titles=None, - creators=None, - subjects=None, - descriptions=None, - publisher=None, - publication_year=None, - dates=None, - contributors=None, - resource_types=None, - funding_references=None, - geo_locations=None, - formats=None, - identifiers=None, - language=None, - relations=None, - rights=None, - sizes=None, - client=None, - active=True - ): - - self.identifier = identifier - self.created_datetime = created_datetime or datetime.min - self.updated_datetime = updated_datetime or datetime.min - self.xml = xml - self.metadata_version = metadata_version - self.titles = titles or [] - self.creators = creators or [] - self.subjects = subjects or [] - self.descriptions = descriptions or [] - self.publisher = publisher - self.publication_year = publication_year - self.dates = dates or [] - self.contributors = contributors or [] - self.resource_types = resource_types or [] - self.funding_references = funding_references or [] - self.geo_locations = geo_locations or [] - self.formats 
= formats or [] - self.identifiers = identifiers or [] - self.language = language - self.relations = relations or [] - self.rights = rights or [] - self.sizes = sizes or [] - self.client = client - self.active = active - - -def build_metadata(data): - """Parse single postgres result into metadata object""" - result = Metadata() - - result.identifier = data['record_id'] - - # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely - # This is because OAI always works in UTC. - created = dateutil.parser.parse(data['pub_date']) - result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - updated = dateutil.parser.parse(data['pub_date']) - result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - - result.xml = None - - # TODO: should I not be hardcoding this for datacite? add other fields based on current XML export - result.metadata_version = 4 - - result.titles = [data['title']] - result.creators = data['dc:contributor.author'] - result.subjects = data['dc:subject'] - result.descriptions = data['dc:description'] - result.publisher = data['dc:publisher'] - result.publication_year = dateutil.parser.parse(data['pub_date']).year - result.dates = [data['pub_date']] - result.contributors = data['dc:contributor'] - result.funding_references = [] - result.sizes = [] - result.geo_locations = data['frdr:geospatial'] - result.resource_types = [] - result.formats = [] - result.identifiers = [] - result.language = '' - result.relations = [] - result.rights = data['dc:rights'] - result.client = '' - result.active = True - - return result - - -def construct_local_url(record): - # Check if the local_identifier has already been turned into a url - if "http" in record["local_identifier"].lower(): - return record["local_identifier"] - - # Check for OAI format of identifier (oai:domain:id) - oai_id = None - oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) - if oai_search: - oai_id = oai_search.group(2) - oai_id = oai_id.replace("_", ":") - - # If given a pattern then substitue in the item ID and return it - if "item_url_pattern" in record and record["item_url_pattern"]: - if oai_id: - local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) - else: - local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) - return local_url - - # Check if the identifier is a DOI - doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) - if doi: - doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) - return local_url - - # If the item has a source URL, use it - if ('source_url' in record) and record['source_url']: - return record['source_url'] - - # URL is in the identifier - local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", - record["local_identifier"]) - if local_url: - return local_url.group(0) - - local_url = None - return local_url - - -def rows_to_dict(cursor): - newdict = [] - if cursor: - for r in cursor: - if r: - if isinstance(r, list): - newdict.append(r[0]) - else: - newdict.append(r) - return newdict - - -def assemble_record(record, db, user, password, server): - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: - return None - - if int(record["deleted"]) == 1: - return None - - if (len(record['title']) == 0): - return None - - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, 
server)) - with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), float(coordinate[2])]}}) - except: - pass - - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) - - with con: - lookup_cur = con.cursor(cursor_factory=DictCursor) - - # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor.author"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) - record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) - record["dc:subject"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) - record["dc:publisher"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) - record["dc:rights"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) - record["dc:description"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) - record["frdr:tags"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) - - return record - - -def get_metadata_list( - query=None, - provider_id=None, - client_id=None, - cursor=None, - server, - db, - user, - password - ): - - # TODO: support listing by set - - # Trigger cursor navigation with a starting value - if not cursor: - cursor = 1 - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - records_cursor.execute(records_sql) - - record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) - results = [] - for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - if full_record is not None: - results.append(build_metadata(full_record)) - - cursor += config.RESULT_SET_SIZE - # TODO: Probably need to pass rowcount back to this function for Postgres output - return results, records_cursor.rowcount, cursor - - -def get_metadata(identifier, db, user, password, server): - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - # TODO: record_id is kind of a meaningless identifier, support local identifier + source URL - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) - records_cursor.execute(records_sql) - row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - return build_metadata(full_record) - - -def get_sets(): - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with repos_con: - repos_cursor = repos_con.cursor() - - repos_cursor.execute("SELECT repository_name from repositories") - sets = repos_cursor.fetchall() - # TODO: Format this properly to return it like the DataCite response - return None \ No newline at end of file +"""Handles DB queries for retrieving metadata""" + +import 
psycopg2 +import re +from datetime import datetime +import dateutil.parser +import dateutil.tz +from viringo import config + +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + + +def build_metadata(data): + """Parse single postgres result into metadata object""" + result = Metadata() + + result.identifier = data['record_id'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. 
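+    # e.g. (illustrative) "2019-06-01T12:00:00-07:00" parses to 19:00 UTC and is kept
+    # as the naive datetime(2019, 6, 1, 19, 0).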
+ created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = None + result.metadata_version = None + result.titles = [data['title']] + result.creators = data['dc:contributor.author'] + result.subjects = data['dc:subject'] + result.descriptions = data['dc:description'] + result.publisher = data['dc:publisher'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = [] + result.sizes = [] + result.geo_locations = data['frdr:geospatial'] + result.resource_types = [] + result.formats = [] + result.identifiers = [] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = '' + result.active = True + + return result + + +def construct_local_url(record): + # Check if the local_identifier has already been turned into a url + if "http" in record["local_identifier"].lower(): + return record["local_identifier"] + + # Check for OAI format of identifier (oai:domain:id) + oai_id = None + oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) + if oai_search: + oai_id = oai_search.group(2) + oai_id = oai_id.replace("_", ":") + + # If given a pattern then substitue in the item ID and return it + if "item_url_pattern" in record and record["item_url_pattern"]: + if oai_id: + local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) + else: + local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) + return local_url + + # Check if the identifier is a DOI + doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) + if doi: + doi = doi.group(0).rstrip('\.') + local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + return local_url + + # If the item has a source URL, use it + if ('source_url' in record) and record['source_url']: + return record['source_url'] + + # URL is in the identifier + local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", + record["local_identifier"]) + if local_url: + return local_url.group(0) + + local_url = None + return local_url + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + newdict.append(r) + return newdict + + +def assemble_record(record, db, user, password, server): + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + return None + + if int(record["deleted"]) == 1: + return None + + if (len(record['title']) == 0): + return None + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) + + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] + + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + else: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), 
float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) + + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) + + return record + + +def get_metadata_list( + query=None, + provider_id=None, + client_id=None, + records_cursor=None, + server, + db, + user, + password + ): + + # TODO: support listing by set + if records_cursor is None: + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_cursor.execute(records_sql) + + record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] + for row in record_set: + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + if full_record is not None: + results.append(build_metadata(full_record)) + + return results, records_cursor.rowcount, records_cursor + + +def get_metadata(identifier, db, user, password, server): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_cursor.execute(records_sql) + row = records_cursor.fetchone() + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + return build_metadata(full_record) + + +def get_sets(): + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with repos_con: + repos_cursor = repos_con.cursor() + + repos_cursor.execute("SELECT repository_name from repositories") + results = repos_cursor.fetchall() + return results, len(results) From e628330ae7d5e3e8b96044f8dc455d841d241525 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 6 Feb 2020 09:01:13 -0500 Subject: [PATCH 07/67] change mentions of postgres to FRDR where appropriate --- viringo/catalogs.py | 20 +-- viringo/config.py | 12 +- viringo/oai.py | 6 +- viringo/services/postgres.py | 281 ----------------------------------- 4 files changed, 19 insertions(+), 
300 deletions(-) delete mode 100644 viringo/services/postgres.py diff --git a/viringo/catalogs.py b/viringo/catalogs.py index ce039aa..20bcdfb 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -12,7 +12,7 @@ from viringo import config from .services import datacite -from .services import postgres +from .services import frdr class DataCiteOAIServer(): """Build OAI-PMH data responses for DataCite metadata catalog""" @@ -112,7 +112,6 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) - # From and until parameters aren't supported with Postgres # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = datacite.get_metadata_list( @@ -284,8 +283,8 @@ def build_metadata_map(self, result): return metadata -class PostgresOAIServer(): - """Build OAI-PMH responses from a Postgres server""" +class FRDROAIServer(): + """Build OAI-PMH responses from the FRDR Postgres server""" def identify(self): """Construct common identification for the OAI service""" @@ -301,7 +300,7 @@ def identify(self): toolkit_description=False) # Specify a custom description - postgres_desc = """ + frdr_desc = """ oai """ + config.OAIPMH_IDENTIFIER + """ @@ -310,7 +309,7 @@ def identify(self): """ - identify.add_description(xml_string=postgres_desc) + identify.add_description(xml_string=frdr_desc) return identify @@ -348,7 +347,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? - result = postgres.get_metadata(identifier, db, user, password, server) + result = frdr.get_metadata(identifier, db, user, password, server) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -381,9 +380,10 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) + # From and until parameters aren't supported with FRDR # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = postgres.get_metadata_list( + results, total_records, paging_cursor = frdr.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, @@ -429,7 +429,7 @@ def listIdentifiers( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = postgres.get_metadata_list( + results, total_records, paging_cursor = frdr.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, @@ -468,7 +468,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = postgres.get_sets() + results, total_results = frdr.get_sets() results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/config.py b/viringo/config.py index c2ba38a..4a1ec81 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -20,13 +20,13 @@ OAIPMH_IDENTIFIER = os.getenv('OAIPMH_IDENTIFIER', 'oai.datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) -# Source metadata catalog (DataCite or Postgres) -CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'Postgres') -# Postgres server +# Source metadata catalog (DataCite or FRDR) +CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'DataCite') +# FRDR Postgres 
server POSTGRES_SERVER = os.getenv('OAIPMH_POSTGRES_SERVER', '') -# Postgres db +# FRDR Postgres db POSTGRES_DB = os.getenv('OAIPMH_POSTGRES_DB', '') -# Postgres user +# FRDR Postgres user POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') -# Postgres password +# FRDR Postgres password POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') \ No newline at end of file diff --git a/viringo/oai.py b/viringo/oai.py index 261835f..04c192e 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -12,7 +12,7 @@ import oaipmh.datestamp from .catalogs import DataCiteOAIServer -from .catalogs import PostgresOAIServer +from .catalogs import FRDROAIServer from . import metadata from . import config @@ -97,8 +97,8 @@ def get_oai_server(): if 'oai' not in g: if config.CATALOG_SET == 'DateCite': catalog_server = DataCiteOAIServer() - elif config.CATALOG_SET == 'Postgres': - catalog_server = PostgresOAIServer() + elif config.CATALOG_SET == 'FRDR': + catalog_server = FRDROAIServer() else: print('No valid metadata catalog configured') sys.exit(1) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py deleted file mode 100644 index 258d77e..0000000 --- a/viringo/services/postgres.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Handles DB queries for retrieving metadata""" - -import psycopg2 -import re -from datetime import datetime -import dateutil.parser -import dateutil.tz -from viringo import config - -class Metadata: - """Represents a DataCite metadata resultset""" - def __init__( - self, - identifier=None, - created_datetime=None, - updated_datetime=None, - xml=None, - metadata_version=None, - titles=None, - creators=None, - subjects=None, - descriptions=None, - publisher=None, - publication_year=None, - dates=None, - contributors=None, - resource_types=None, - funding_references=None, - geo_locations=None, - formats=None, - identifiers=None, - language=None, - relations=None, - rights=None, - sizes=None, - client=None, - active=True - ): - - self.identifier = identifier - self.created_datetime = created_datetime or datetime.min - self.updated_datetime = updated_datetime or datetime.min - self.xml = xml - self.metadata_version = metadata_version - self.titles = titles or [] - self.creators = creators or [] - self.subjects = subjects or [] - self.descriptions = descriptions or [] - self.publisher = publisher - self.publication_year = publication_year - self.dates = dates or [] - self.contributors = contributors or [] - self.resource_types = resource_types or [] - self.funding_references = funding_references or [] - self.geo_locations = geo_locations or [] - self.formats = formats or [] - self.identifiers = identifiers or [] - self.language = language - self.relations = relations or [] - self.rights = rights or [] - self.sizes = sizes or [] - self.client = client - self.active = active - - -def build_metadata(data): - """Parse single postgres result into metadata object""" - result = Metadata() - - result.identifier = data['record_id'] - - # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely - # This is because OAI always works in UTC. 
- created = dateutil.parser.parse(data['pub_date']) - result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - updated = dateutil.parser.parse(data['pub_date']) - result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - - result.xml = None - result.metadata_version = None - result.titles = [data['title']] - result.creators = data['dc:contributor.author'] - result.subjects = data['dc:subject'] - result.descriptions = data['dc:description'] - result.publisher = data['dc:publisher'] - result.publication_year = dateutil.parser.parse(data['pub_date']).year - result.dates = [data['pub_date']] - result.contributors = data['dc:contributor'] - result.funding_references = [] - result.sizes = [] - result.geo_locations = data['frdr:geospatial'] - result.resource_types = [] - result.formats = [] - result.identifiers = [] - result.language = '' - result.relations = [] - result.rights = data['dc:rights'] - result.client = '' - result.active = True - - return result - - -def construct_local_url(record): - # Check if the local_identifier has already been turned into a url - if "http" in record["local_identifier"].lower(): - return record["local_identifier"] - - # Check for OAI format of identifier (oai:domain:id) - oai_id = None - oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) - if oai_search: - oai_id = oai_search.group(2) - oai_id = oai_id.replace("_", ":") - - # If given a pattern then substitue in the item ID and return it - if "item_url_pattern" in record and record["item_url_pattern"]: - if oai_id: - local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) - else: - local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) - return local_url - - # Check if the identifier is a DOI - doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) - if doi: - doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) - return local_url - - # If the item has a source URL, use it - if ('source_url' in record) and record['source_url']: - return record['source_url'] - - # URL is in the identifier - local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", - record["local_identifier"]) - if local_url: - return local_url.group(0) - - local_url = None - return local_url - - -def rows_to_dict(cursor): - newdict = [] - if cursor: - for r in cursor: - if r: - if isinstance(r, list): - newdict.append(r[0]) - else: - newdict.append(r) - return newdict - - -def assemble_record(record, db, user, password, server): - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: - return None - - if int(record["deleted"]) == 1: - return None - - if (len(record['title']) == 0): - return None - - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), 
float(coordinate[2])]}}) - except: - pass - - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) - - with con: - lookup_cur = con.cursor(cursor_factory=DictCursor) - - # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor.author"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) - record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) - record["dc:subject"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) - record["dc:publisher"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) - record["dc:rights"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) - record["dc:description"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) - record["frdr:tags"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) - - return record - - -def get_metadata_list( - query=None, - provider_id=None, - client_id=None, - records_cursor=None, - server, - db, - user, - password - ): - - # TODO: support listing by set - if records_cursor is None: - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - records_cursor.execute(records_sql) - - record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) - results = [] - for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - if full_record is not None: - results.append(build_metadata(full_record)) - - return results, records_cursor.rowcount, records_cursor - - -def get_metadata(identifier, db, user, password, server): - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) - records_cursor.execute(records_sql) - row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - return build_metadata(full_record) - - -def get_sets(): - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with repos_con: - repos_cursor = repos_con.cursor() - - repos_cursor.execute("SELECT repository_name from repositories") - results = repos_cursor.fetchall() - return results, len(results) From 41f0428a48b7643811a14b0e5ec203fd33783d0c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 13 Feb 2020 16:07:31 -0800 Subject: [PATCH 08/67] fix up pipfile for postgres, clean up some syntax --- Pipfile | 3 +- Pipfile.lock | 301 +++++++++++++++++++++++---------------- docker-compose.yml | 2 +- viringo/catalogs.py | 24 ++-- viringo/oai.py | 2 + viringo/services/frdr.py | 282 
++++++++++++++++++++++++++++++++++++ 6 files changed, 480 insertions(+), 134 deletions(-) create mode 100644 viringo/services/frdr.py diff --git a/Pipfile b/Pipfile index c58d20c..679071b 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ python-dotenv = "*" [packages] flask = "*" pyoai = "*" +psycopg2-binary = "*" requests = "*" python-dateutil = "*" lxml = "*" @@ -23,4 +24,4 @@ sentry-sdk = {extras = ["flask"],version = "*"} python-dotenv = "*" [requires] -python_version = "3.6" +python_version = "3.5" diff --git a/Pipfile.lock b/Pipfile.lock index 032c439..9bf8991 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "408ef5f3e6921b3359ecee3e8d293c34bd619136293244fe829928ab30c6cf06" + "sha256": "875a3074f25d395b53d43cb957d5d4567234ba291132d91c88cbc1a2f20c77b2" }, "pipfile-spec": 6, "requires": { - "python_version": "3.6" + "python_version": "3.5" }, "sources": [ { @@ -24,10 +24,10 @@ }, "certifi": { "hashes": [ - "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", - "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", + "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" ], - "version": "==2019.9.11" + "version": "==2019.11.28" }, "chardet": { "hashes": [ @@ -53,10 +53,10 @@ }, "faker": { "hashes": [ - "sha256:5902379d8df308a204fc11c4f621590ee83975805a6c7b2228203b9defa45250", - "sha256:5e8c755c619f332d5ec28b7586389665f136bcf528e165eb925e87c06a63eda7" + "sha256:047d4d1791bfb3756264da670d99df13d799bb36e7d88774b1585a82d05dbaec", + "sha256:1b1a58961683b30c574520d0c739c4443e0ef6a185c04382e8cc888273dbebed" ], - "version": "==2.0.3" + "version": "==4.0.0" }, "flask": { "hashes": [ @@ -82,50 +82,50 @@ }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "version": "==2.10.3" + "version": "==2.11.1" }, "json-log-formatter": { "hashes": [ - "sha256:8bb02773a37274c08f4de748e3accbed269c8395d27149bc2e7e9109af342eee", - "sha256:cdc1f1dabc0b9c808ed4e4f26e73885a9e7955bf7190dd9f1b86be967feb5b29" + "sha256:ee187c9a80936cbf1259f73573973450fc24b84a4fb54e53eb0dcff86ea1e759" ], "index": "pypi", - "version": "==0.2.0" + "version": "==0.3.0" }, "lxml": { "hashes": [ - "sha256:02ca7bf899da57084041bb0f6095333e4d239948ad3169443f454add9f4e9cb4", - "sha256:096b82c5e0ea27ce9138bcbb205313343ee66a6e132f25c5ed67e2c8d960a1bc", - "sha256:0a920ff98cf1aac310470c644bc23b326402d3ef667ddafecb024e1713d485f1", - "sha256:1409b14bf83a7d729f92e2a7fbfe7ec929d4883ca071b06e95c539ceedb6497c", - "sha256:17cae1730a782858a6e2758fd20dd0ef7567916c47757b694a06ffafdec20046", - "sha256:17e3950add54c882e032527795c625929613adbd2ce5162b94667334458b5a36", - "sha256:1f4f214337f6ee5825bf90a65d04d70aab05526c08191ab888cb5149501923c5", - "sha256:2e8f77db25b0a96af679e64ff9bf9dddb27d379c9900c3272f3041c4d1327c9d", - "sha256:4dffd405390a45ecb95ab5ab1c1b847553c18b0ef8ed01e10c1c8b1a76452916", - "sha256:6b899931a5648862c7b88c795eddff7588fb585e81cecce20f8d9da16eff96e0", - "sha256:726c17f3e0d7a7200718c9a890ccfeab391c9133e363a577a44717c85c71db27", - "sha256:760c12276fee05c36f95f8040180abc7fbebb9e5011447a97cdc289b5d6ab6fc", - 
"sha256:796685d3969815a633827c818863ee199440696b0961e200b011d79b9394bbe7", - "sha256:891fe897b49abb7db470c55664b198b1095e4943b9f82b7dcab317a19116cd38", - "sha256:9277562f175d2334744ad297568677056861070399cec56ff06abbe2564d1232", - "sha256:a471628e20f03dcdfde00770eeaf9c77811f0c331c8805219ca7b87ac17576c5", - "sha256:a63b4fd3e2cabdcc9d918ed280bdde3e8e9641e04f3c59a2a3109644a07b9832", - "sha256:ae88588d687bd476be588010cbbe551e9c2872b816f2da8f01f6f1fda74e1ef0", - "sha256:b0b84408d4eabc6de9dd1e1e0bc63e7731e890c0b378a62443e5741cfd0ae90a", - "sha256:be78485e5d5f3684e875dab60f40cddace2f5b2a8f7fede412358ab3214c3a6f", - "sha256:c27eaed872185f047bb7f7da2d21a7d8913457678c9a100a50db6da890bc28b9", - "sha256:c7fccd08b14aa437fe096c71c645c0f9be0655a9b1a4b7cffc77bcb23b3d61d2", - "sha256:c81cb40bff373ab7a7446d6bbca0190bccc5be3448b47b51d729e37799bb5692", - "sha256:d11874b3c33ee441059464711cd365b89fa1a9cf19ae75b0c189b01fbf735b84", - "sha256:e9c028b5897901361d81a4718d1db217b716424a0283afe9d6735fe0caf70f79", - "sha256:fe489d486cd00b739be826e8c1be188ddb74c7a1ca784d93d06fda882a6a1681" + "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd", + "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c", + "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081", + "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f", + "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261", + "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a", + "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9", + "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a", + "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb", + "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60", + "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128", + "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a", + "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717", + "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89", + "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72", + "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8", + "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3", + "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7", + "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8", + "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77", + "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1", + "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15", + "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679", + "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012", + "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6", + "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc", + "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca" ], "index": "pypi", - "version": "==4.4.1" + "version": "==4.5.0" }, "markupsafe": { "hashes": [ @@ -133,13 +133,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + 
"sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -156,10 +159,50 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], "version": "==1.1.1" }, + "psycopg2-binary": { + "hashes": [ + "sha256:040234f8a4a8dfd692662a8308d78f63f31a97e1c42d2480e5e6810c48966a29", + "sha256:086f7e89ec85a6704db51f68f0dcae432eff9300809723a6e8782c41c2f48e03", + "sha256:18ca813fdb17bc1db73fe61b196b05dd1ca2165b884dd5ec5568877cabf9b039", + "sha256:19dc39616850342a2a6db70559af55b22955f86667b5f652f40c0e99253d9881", + "sha256:2166e770cb98f02ed5ee2b0b569d40db26788e0bf2ec3ae1a0d864ea6f1d8309", + "sha256:3a2522b1d9178575acee4adf8fd9f979f9c0449b00b4164bb63c3475ea6528ed", + "sha256:3aa773580f85a28ffdf6f862e59cb5a3cc7ef6885121f2de3fca8d6ada4dbf3b", + "sha256:3b5deaa3ee7180585a296af33e14c9b18c218d148e735c7accf78130765a47e3", + "sha256:407af6d7e46593415f216c7f56ba087a9a42bd6dc2ecb86028760aa45b802bd7", + "sha256:4c3c09fb674401f630626310bcaf6cd6285daf0d5e4c26d6e55ca26a2734e39b", + "sha256:4c6717962247445b4f9e21c962ea61d2e884fc17df5ddf5e35863b016f8a1f03", + "sha256:50446fae5681fc99f87e505d4e77c9407e683ab60c555ec302f9ac9bffa61103", + "sha256:5057669b6a66aa9ca118a2a860159f0ee3acf837eda937bdd2a64f3431361a2d", + "sha256:5dd90c5438b4f935c9d01fcbad3620253da89d19c1f5fca9158646407ed7df35", + "sha256:659c815b5b8e2a55193ede2795c1e2349b8011497310bb936da7d4745652823b", + "sha256:69b13fdf12878b10dc6003acc8d0abf3ad93e79813fd5f3812497c1c9fb9be49", + "sha256:7a1cb80e35e1ccea3e11a48afe65d38744a0e0bde88795cc56a4d05b6e4f9d70", + "sha256:7e6e3c52e6732c219c07bd97fff6c088f8df4dae3b79752ee3a817e6f32e177e", + "sha256:7f42a8490c4fe854325504ce7a6e4796b207960dabb2cbafe3c3959cb00d1d7e", + "sha256:84156313f258eafff716b2961644a4483a9be44a5d43551d554844d15d4d224e", + "sha256:8578d6b8192e4c805e85f187bc530d0f52ba86c39172e61cd51f68fddd648103", + "sha256:890167d5091279a27e2505ff0e1fb273f8c48c41d35c5b92adbf4af80e6b2ed6", + "sha256:98e10634792ac0e9e7a92a76b4991b44c2325d3e7798270a808407355e7bb0a1", + "sha256:9aadff9032e967865f9778485571e93908d27dab21d0fdfdec0ca779bb6f8ad9", + "sha256:9f24f383a298a0c0f9b3113b982e21751a8ecde6615494a3f1470eb4a9d70e9e", + 
"sha256:a73021b44813b5c84eda4a3af5826dd72356a900bac9bd9dd1f0f81ee1c22c2f", + "sha256:afd96845e12638d2c44d213d4810a08f4dc4a563f9a98204b7428e567014b1cd", + "sha256:b73ddf033d8cd4cc9dfed6324b1ad2a89ba52c410ef6877998422fcb9c23e3a8", + "sha256:b8f490f5fad1767a1331df1259763b3bad7d7af12a75b950c2843ba319b2415f", + "sha256:dbc5cd56fff1a6152ca59445178652756f4e509f672e49ccdf3d79c1043113a4", + "sha256:eac8a3499754790187bb00574ab980df13e754777d346f85e0ff6df929bcd964", + "sha256:eaed1c65f461a959284649e37b5051224f4db6ebdc84e40b5e65f2986f101a08" + ], + "index": "pypi", + "version": "==2.8.4" + }, "pyoai": { "hashes": [ "sha256:029521e1f6a819511feb4299a6181b5c312e8a71f7cddc4547e27001e7552be0" @@ -177,11 +220,11 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57", + "sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.11.0" }, "requests": { "hashes": [ @@ -196,18 +239,18 @@ "flask" ], "hashes": [ - "sha256:09e1e8f00f22ea580348f83bbbd880adf40b29f1dec494a8e4b33e22f77184fb", - "sha256:ff1fa7fb85703ae9414c8b427ee73f8363232767c9cd19158f08f6e4f0b58fc7" + "sha256:b06dd27391fd11fb32f84fe054e6a64736c469514a718a99fb5ce1dff95d6b28", + "sha256:e023da07cfbead3868e1e2ba994160517885a32dfd994fc455b118e37989479b" ], "index": "pypi", - "version": "==0.13.2" + "version": "==0.14.1" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "text-unidecode": { "hashes": [ @@ -218,17 +261,17 @@ }, "urllib3": { "hashes": [ - "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", - "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" ], - "version": "==1.25.6" + "version": "==1.25.8" }, "werkzeug": { "hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", + "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], - "version": "==0.16.0" + "version": "==1.0.0" } }, "develop": { @@ -244,6 +287,7 @@ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" ], + "markers": "sys_platform == 'win32'", "version": "==1.3.0" }, "attrs": { @@ -253,6 +297,14 @@ ], "version": "==19.3.0" }, + "colorama": { + "hashes": [ + "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff", + "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.4.3" + }, "factory-boy": { "hashes": [ "sha256:728df59b372c9588b83153facf26d3d28947fc750e8e3c95cefa9bed0e6394ee", @@ -263,18 +315,18 @@ }, "faker": { "hashes": [ - 
"sha256:5902379d8df308a204fc11c4f621590ee83975805a6c7b2228203b9defa45250", - "sha256:5e8c755c619f332d5ec28b7586389665f136bcf528e165eb925e87c06a63eda7" + "sha256:047d4d1791bfb3756264da670d99df13d799bb36e7d88774b1585a82d05dbaec", + "sha256:1b1a58961683b30c574520d0c739c4443e0ef6a185c04382e8cc888273dbebed" ], - "version": "==2.0.3" + "version": "==4.0.0" }, "importlib-metadata": { "hashes": [ - "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", - "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==0.23" + "version": "==1.5.0" }, "isort": { "hashes": [ @@ -318,62 +370,70 @@ }, "more-itertools": { "hashes": [ - "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", - "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "version": "==7.2.0" + "version": "==8.2.0" }, "packaging": { "hashes": [ - "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47", - "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108" + "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73", + "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334" ], - "version": "==19.2" + "version": "==20.1" + }, + "pathlib2": { + "hashes": [ + "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db", + "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868" + ], + "markers": "python_version < '3.6'", + "version": "==2.3.5" }, "pluggy": { "hashes": [ - "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", - "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" ], - "version": "==0.13.0" + "version": "==0.13.1" }, "py": { "hashes": [ - "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", - "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", + "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" ], - "version": "==1.8.0" + "version": "==1.8.1" }, "pylint": { "hashes": [ - "sha256:7b76045426c650d2b0f02fc47c14d7934d17898779da95288a74c2a7ec440702", - "sha256:856476331f3e26598017290fd65bebe81c960e806776f324093a46b76fb2d1c0" + "sha256:3db5468ad013380e987410a8d6956226963aed94ecb5f9d3a28acca6d9ac36cd", + "sha256:886e6afc935ea2590b462664b161ca9a5e40168ea99e5300935f6591ad467df4" ], "index": "pypi", - "version": "==2.4.3" + "version": "==2.4.4" }, "pyparsing": { "hashes": [ - "sha256:4acadc9a2b96c19fe00932a38ca63e601180c39a189a696abce1eaab641447e1", - "sha256:61b5ed888beab19ddccab3478910e2076a6b5a0295dffc43021890e136edf764" + "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f", + "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec" ], - "version": "==2.4.4" + "version": "==2.4.6" }, "pytest": { "hashes": [ - "sha256:27abc3fef618a01bebb1f0d6d303d2816a99aa87a5968ebc32fe971be91eb1e6", - 
"sha256:58cee9e09242937e136dbb3dab466116ba20d6b7828c7620f23947f37eb4dae4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "pypi", - "version": "==5.2.2" + "version": "==5.3.5" }, "pytest-mock": { "hashes": [ - "sha256:b3514caac35fe3f05555923eabd9546abce11571cc2ddf7d8615959d04f2c89e", - "sha256:ea502c3891599c26243a3a847ccf0b1d20556678c528f86c98e3cd6d40c5cf11" + "sha256:b35eb281e93aafed138db25c8772b95d3756108b601947f89af503f8c629413f", + "sha256:cb67402d87d5f53c579263d37971a164743dc33c159dfb4fb4a86f37c5552307" ], "index": "pypi", - "version": "==1.11.2" + "version": "==2.0.0" }, "python-dateutil": { "hashes": [ @@ -385,18 +445,18 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57", + "sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.11.0" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "termcolor": { "hashes": [ @@ -414,36 +474,37 @@ }, "typed-ast": { "hashes": [ - "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + 
"sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "wcwidth": { "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" ], - "version": "==0.1.7" + "version": "==0.1.8" }, "wrapt": { "hashes": [ @@ -453,10 +514,10 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:15428d652e993b6ce86694c3cccf0d71aa7afdc6ef1807fa25a920e9444e0281", + "sha256:d9d2efe11d3a3fb9184da550d35bd1319dc8e30a63255927c82bb42fca1f4f7c" ], - "version": "==0.6.0" + "version": "==1.1.0" } } } diff --git a/docker-compose.yml b/docker-compose.yml index 09ed2a1..5c412aa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,4 +10,4 @@ services: volumes: - ./:/home/app/webapp/ env_file: - - .env + - .env \ No newline at end of file diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 20bcdfb..fd3d424 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -347,7 +347,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? 
- result = frdr.get_metadata(identifier, db, user, password, server) + result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -384,14 +384,14 @@ def listRecords( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = frdr.get_metadata_list( - query=search_query, - provider_id=provider_id, - client_id=client_id, - cursor=paging_cursor, server=config.POSTGRES_SERVER, db=config.POSTGRES_DB, user=config.POSTGRES_USER, - password=config.POSTGRES_PASSWORD + password=config.POSTGRES_PASSWORD, + query=search_query, + provider_id=provider_id, + client_id=client_id, + records_cursor=paging_cursor ) records = [] @@ -430,14 +430,14 @@ def listIdentifiers( provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = frdr.get_metadata_list( - query=search_query, - provider_id=provider_id, - client_id=client_id, - cursor=paging_cursor, server=config.POSTGRES_SERVER, db=config.POSTGRES_DB, user=config.POSTGRES_USER, - password=config.POSTGRES_PASSWORD + password=config.POSTGRES_PASSWORD, + query=search_query, + provider_id=provider_id, + client_id=client_id, + records_cursor=paging_cursor ) records = [] @@ -468,7 +468,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = frdr.get_sets() + results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/oai.py b/viringo/oai.py index 04c192e..9ef1b7e 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -16,6 +16,8 @@ from . import metadata from . 
import config +import sys + BP = Blueprint('oai', __name__) class XMLTreeServer(oaipmh.server.XMLTreeServer): diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py new file mode 100644 index 0000000..1ab7fb5 --- /dev/null +++ b/viringo/services/frdr.py @@ -0,0 +1,282 @@ +"""Handles DB queries for retrieving metadata""" + +import psycopg2 +import re +from datetime import datetime +import dateutil.parser +import dateutil.tz +from viringo import config + +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + + +def build_metadata(data): + """Parse single FRDR result into metadata object""" + result = Metadata() + + result.identifier = data['record_id'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. 
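
A note on the parameterised lookups that assemble_record runs further down in this file: psycopg2 binds parameters with the %s (pyformat) placeholder rather than ?, so a parameterised query is written along these lines (the column values here are only examples):

    lookup_cur.execute(
        "SELECT description FROM descriptions WHERE record_id = %s AND language = %s",
        (record["record_id"], "en"),
    )
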
+ created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = None + result.metadata_version = None + result.titles = [data['title']] + result.creators = data['dc:contributor.author'] + result.subjects = data['dc:subject'] + result.descriptions = data['dc:description'] + result.publisher = data['dc:publisher'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = [] + result.sizes = [] + result.geo_locations = data['frdr:geospatial'] + result.resource_types = [] + result.formats = [] + result.identifiers = [] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = '' + result.active = True + + return result + + +def construct_local_url(record): + # Check if the local_identifier has already been turned into a url + if "http" in record["local_identifier"].lower(): + return record["local_identifier"] + + # Check for OAI format of identifier (oai:domain:id) + oai_id = None + oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) + if oai_search: + oai_id = oai_search.group(2) + oai_id = oai_id.replace("_", ":") + + # If given a pattern then substitue in the item ID and return it + if "item_url_pattern" in record and record["item_url_pattern"]: + if oai_id: + local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) + else: + local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) + return local_url + + # Check if the identifier is a DOI + doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) + if doi: + doi = doi.group(0).rstrip('\.') + local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + return local_url + + # If the item has a source URL, use it + if ('source_url' in record) and record['source_url']: + return record['source_url'] + + # URL is in the identifier + local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", + record["local_identifier"]) + if local_url: + return local_url.group(0) + + local_url = None + return local_url + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + newdict.append(r) + return newdict + + +def assemble_record(record, db, user, password, server): + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + return None + + if int(record["deleted"]) == 1: + return None + + if (len(record['title']) == 0): + return None + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) + + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] + + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + else: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), 
float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) + + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) + + return record + + +def get_metadata_list( + server, + db, + user, + password, + query=None, + provider_id=None, + client_id=None, + records_cursor=None + ): + + # TODO: support listing by set + if records_cursor is None: + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_cursor.execute(records_sql) + + record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] + for row in record_set: + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + if full_record is not None: + results.append(build_metadata(full_record)) + + return results, records_cursor.rowcount, records_cursor + + +def get_metadata(identifier, db, user, password, server): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_cursor.execute(records_sql) + row = records_cursor.fetchone() + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + return build_metadata(full_record) + + +def get_sets(db, user, password, server): + # TODO: this is returning the wrong number of parameters + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with repos_con: + repos_cursor = repos_con.cursor() + + repos_cursor.execute("SELECT repository_name from repositories") + results = repos_cursor.fetchall() + return results, len(results) From 8d9c829103ae2f9558b7313f2aec12f91889fc4c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Fri, 14 Feb 2020 15:59:39 -0800 Subject: [PATCH 09/67] lots of recasting, postgres cleanups --- viringo/catalogs.py | 29 +++++------------------------ viringo/metadata.py | 2 +- viringo/services/frdr.py | 30 
+++++++++++++++--------------- 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index fd3d424..8b25894 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -489,14 +489,11 @@ def listSets( def build_header(self, result): """Construct a OAI-PMH record header""" - # Provider symbol can just be extracted from the client symbol - provider_symbol, _ = result.client.split(".") - return common.Header( None, - 'doi:' + result.identifier, + 'doi:' + str(result.identifier), result.updated_datetime, - setspec=[provider_symbol, result.client], + setspec=[result.client], deleted=not result.active ) @@ -510,18 +507,6 @@ def build_record(self, metadata): def build_metadata_map(self, result): """Construct a metadata map object for oai metadata writing""" - dates = [] - if result.publication_year: - dates.append(str(result.publication_year)) - dates.extend([date['type'] + ": " + str(date['date']) for date in result.dates]) - - rights = [] - for right in result.rights: - if right['statement']: - rights.append(right['statement']) - if right['uri']: - rights.append(right['uri']) - identifiers = [ identifier_to_string(identifier) for identifier in result.identifiers ] @@ -531,24 +516,20 @@ def build_metadata_map(self, result): for relation in result.relations ] - contributors = [ - contributor.get('name') for contributor in result.contributors - ] - metadata = { 'title': result.titles, 'creator': result.creators, 'subject': result.subjects, 'description': result.descriptions, 'publisher': [result.publisher] if result.publisher else [], - 'contributor': contributors, - 'date': dates, + 'contributor': result.contributors, + 'date': result.dates, 'type': result.resource_types, 'format': result.formats, 'identifier': identifiers, 'relation': relations, 'language': [result.language] if result.language else [], - 'rights': rights, + 'rights': result.rights, 'xml': result.xml, 'set': result.client, 'metadata_version': result.metadata_version diff --git a/viringo/metadata.py b/viringo/metadata.py index 27c7bfb..8b1bccb 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -45,7 +45,7 @@ def nsdc(name): new_element = etree.SubElement(e_dc, nsdc(name)) # The regular expression here is to filter only valid XML chars # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', value) + new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', str(value)) def datacite_writer(element: etree.Element, metadata): """Writer for writing data in a metadata object out into raw datacite format""" diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 1ab7fb5..af04a5d 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -95,7 +95,7 @@ def build_metadata(data): result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = '' + result.client = data['repository_name'] result.active = True return result @@ -169,8 +169,7 @@ def assemble_record(record, db, user, password, server): with con: lookup_cur = con.cursor(cursor_factory=None) - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=%s", [record["record_id"]]) geodata = lookup_cur.fetchall() record["frdr:geospatial"] = [] 
polycoordinates = [] @@ -188,40 +187,41 @@ def assemble_record(record, db, user, password, server): record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) with con: + from psycopg2.extras import DictCursor lookup_cur = con.cursor(cursor_factory=DictCursor) # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_id"]]) record["dc:contributor.author"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=%s""", [record["record_id"]]) record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_id"]]) record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s""", [record["record_id"]]) record["dc:subject"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=%s""", [record["record_id"]]) record["dc:publisher"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=%s""", [record["record_id"]]) 
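(Editorial aside, not part of the patch: the rewritten execute() calls above and below switch from the DB-API "qmark" placeholders ("?", used by e.g. sqlite3) to psycopg2's "%s" style. A self-contained sketch of that style follows; the connection details and record id are hypothetical.)

# Sketch: psycopg2 expects "%s" placeholders with the parameters passed
# separately; values are never interpolated into the SQL string by hand.
import psycopg2

con = psycopg2.connect(dbname="frdr", user="viringo", password="secret",
                       host="localhost", port="5432")   # hypothetical credentials
with con:
    cur = con.cursor()
    cur.execute(
        "SELECT description FROM descriptions WHERE record_id = %s AND language = %s",
        ("1234", "en"))                                  # hypothetical record_id
    descriptions = [row[0] for row in cur.fetchall()]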
record["dc:rights"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='en' ", [record["record_id"]]) record["dc:description"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='fr' ", [record["record_id"]]) record["frdr:description_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'en' """, [record["record_id"]]) record["frdr:tags"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'fr' """, (record["record_id"],)) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'fr' """, [record["record_id"]]) record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=%s""", [record["record_id"]]) record["frdr:access"] = rows_to_dict(lookup_cur) return record @@ -262,7 +262,7 @@ def get_metadata(identifier, db, user, password, server): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =%s""", [identifier]) records_cursor.execute(records_sql) row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) From f1d876cbaf3e63b0950874d3bc5f3bc73799296c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 18 Feb 2020 13:35:57 -0800 Subject: [PATCH 10/67] properly 
support resumption token --- viringo/catalogs.py | 12 +++++++----- viringo/config.py | 4 +++- viringo/services/frdr.py | 38 ++++++++++++++++++++------------------ 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 8b25894..7f5da4c 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -347,7 +347,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? - result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) + result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER, port=config.POSTGRES_PORT) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -388,10 +388,11 @@ def listRecords( db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, + port=config.POSTGRES_PORT, query=search_query, provider_id=provider_id, client_id=client_id, - records_cursor=paging_cursor + cursor=paging_cursor ) records = [] @@ -421,7 +422,7 @@ def listIdentifiers( from_=None, until=None, set=None, - paging_cursor=None + cursor=None ): #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for list of identifiers""" @@ -434,10 +435,11 @@ def listIdentifiers( db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, + port=config.POSTGRES_PORT, query=search_query, provider_id=provider_id, client_id=client_id, - records_cursor=paging_cursor + cursor=paging_cursor ) records = [] @@ -468,7 +470,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) + results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER, port=config.POSTGRES_PORT) results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/config.py b/viringo/config.py index 4a1ec81..baddaa0 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -29,4 +29,6 @@ # FRDR Postgres user POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') # FRDR Postgres password -POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') \ No newline at end of file +POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') +# FRDR Postgres port +POSTGRES_PORT = os.getenv('OAIPMH_POSTGRES_PORT', '5432') \ No newline at end of file diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index af04a5d..3faf7a4 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -154,7 +154,7 @@ def rows_to_dict(cursor): return newdict -def assemble_record(record, db, user, password, server): +def assemble_record(record, db, user, password, server, port): record["dc:source"] = construct_local_url(record) if record["dc:source"] is None: return None @@ -165,7 +165,7 @@ def assemble_record(record, db, user, password, server): if (len(record['title']) == 0): return None - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, 
server, port)) with con: lookup_cur = con.cursor(cursor_factory=None) @@ -232,34 +232,36 @@ def get_metadata_list( db, user, password, + port, query=None, provider_id=None, client_id=None, - records_cursor=None + cursor=None ): # TODO: support listing by set - if records_cursor is None: - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - records_cursor.execute(records_sql) - - record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) + with records_con: + db_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + if cursor is not None: + records_sql = records_sql + " OFFSET " + cursor + db_cursor.execute(records_sql) + + record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) results = [] for row in record_set: record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - full_record = assemble_record(record, db, user, password, server) + full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: results.append(build_metadata(full_record)) - return results, records_cursor.rowcount, records_cursor + return results, db_cursor.rowcount, len(record_set) -def get_metadata(identifier, db, user, password, server): - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) +def get_metadata(identifier, db, user, password, server, port): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =%s""", [identifier]) @@ -267,13 +269,13 @@ def get_metadata(identifier, db, user, password, server): row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - full_record = 
assemble_record(record, db, user, password, server) + full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) -def get_sets(db, user, password, server): +def get_sets(db, user, password, server, port): # TODO: this is returning the wrong number of parameters - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with repos_con: repos_cursor = repos_con.cursor() From e7e6a822df8021bab1c73ad5469356d91fa3ce63 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Wed, 19 Feb 2020 14:27:31 -0800 Subject: [PATCH 11/67] fix calls to get sets and identifiers --- viringo/catalogs.py | 19 +++++++------------ viringo/services/frdr.py | 17 ++++++++--------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 7f5da4c..a937934 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -380,9 +380,6 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) - # From and until parameters aren't supported with FRDR - # Get both a provider and client_id from the set - provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = frdr.get_metadata_list( server=config.POSTGRES_SERVER, db=config.POSTGRES_DB, @@ -390,8 +387,7 @@ def listRecords( password=config.POSTGRES_PASSWORD, port=config.POSTGRES_PORT, query=search_query, - provider_id=provider_id, - client_id=client_id, + set=set, cursor=paging_cursor ) @@ -422,13 +418,13 @@ def listIdentifiers( from_=None, until=None, set=None, - cursor=None + paging_cursor=None ): #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for list of identifiers""" - # Get both a provider and client_id from the set - provider_id, client_id = set_to_provider_client(set) + # If available get the search query from the set param + search_query = set_to_search_query(set) results, total_records, paging_cursor = frdr.get_metadata_list( server=config.POSTGRES_SERVER, @@ -437,8 +433,7 @@ def listIdentifiers( password=config.POSTGRES_PASSWORD, port=config.POSTGRES_PORT, query=search_query, - provider_id=provider_id, - client_id=client_id, + set=set, cursor=paging_cursor ) @@ -482,7 +477,7 @@ def listSets( if results: for identifier, name in results: # Format of a set is setSpec, setName, setDescription - records.append((identifier.upper(), name, None)) + records.append((identifier.split('//')[1].split('/')[0], name, None)) # This differs from the pyoai implementation in that we have to return a cursor here # But this is okay as we have a custom server to handle it. 
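(Editorial aside on the listSets hunk above, not part of the patch: the setSpec is derived from the repository URL by splitting off the scheme and path; the same idea expressed with urllib.parse, using an invented URL, looks like this.)

# Sketch: reduce a repository URL to its hostname for use as an OAI setSpec,
# broadly equivalent to url.split('//')[1].split('/')[0] in the hunk above.
from urllib.parse import urlparse

def url_to_setspec(repository_url):
    return urlparse(repository_url).netloc

print(url_to_setspec("https://dataverse.example.org/some/path"))  # dataverse.example.org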
@@ -493,7 +488,7 @@ def build_header(self, result): return common.Header( None, - 'doi:' + str(result.identifier), + str(result.identifier), result.updated_datetime, setspec=[result.client], deleted=not result.active diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3faf7a4..9467d15 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -86,16 +86,16 @@ def build_metadata(data): result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] result.contributors = data['dc:contributor'] - result.funding_references = [] + result.funding_references = data['contact'] result.sizes = [] result.geo_locations = data['frdr:geospatial'] - result.resource_types = [] + result.resource_types = [data['frdr:tags']] result.formats = [] result.identifiers = [] result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = data['repository_name'] + result.client = data['repository_url'].split('//')[1].split('/')[0] result.active = True return result @@ -234,16 +234,16 @@ def get_metadata_list( password, port, query=None, - provider_id=None, - client_id=None, + set=None, cursor=None ): - # TODO: support listing by set records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + if set is not None: + records_sql = records_sql + " AND (repos.homepage_url='http://" + set + "/' OR repos.homepage_url='https://" + set + "/')" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) @@ -264,7 +264,7 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =%s""", [identifier]) + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier records_cursor.execute(records_sql) row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) @@ -274,11 +274,10 @@ def get_metadata(identifier, db, user, password, 
server, port): def get_sets(db, user, password, server, port): - # TODO: this is returning the wrong number of parameters repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with repos_con: repos_cursor = repos_con.cursor() - repos_cursor.execute("SELECT repository_name from repositories") + repos_cursor.execute("SELECT homepage_url, repository_name from repositories") results = repos_cursor.fetchall() return results, len(results) From 6352d5b576e4036d6083488e7a0bc0d276f98563 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 16 Apr 2020 14:33:21 -0700 Subject: [PATCH 12/67] Add new set for openaire_data Do not transform setSpec URL (keep as homepage_url) --- viringo/catalogs.py | 2 +- viringo/services/frdr.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index a937934..c103cef 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -477,7 +477,7 @@ def listSets( if results: for identifier, name in results: # Format of a set is setSpec, setName, setDescription - records.append((identifier.split('//')[1].split('/')[0], name, None)) + records.append((identifier, name, None)) # This differs from the pyoai implementation in that we have to return a cursor here # But this is okay as we have a custom server to handle it. diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 9467d15..3047c6d 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -95,7 +95,7 @@ def build_metadata(data): result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = data['repository_url'].split('//')[1].split('/')[0] + result.client = data['homepage_url'] result.active = True return result @@ -241,17 +241,18 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - if set is not None: - records_sql = records_sql + " AND (repos.homepage_url='http://" + set + "/' OR repos.homepage_url='https://" + set + "/')" + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + if set is not None and set != 'openaire_data': + records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 
'last_crawl_timestamp'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -264,10 +265,10 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) @@ -280,4 +281,7 @@ def get_sets(db, user, password, server, port): repos_cursor.execute("SELECT homepage_url, repository_name from repositories") results = repos_cursor.fetchall() + + results.append(['openaire_data', 'OpenAIRE']) + return results, len(results) From 321abe00b32b96680259756e6627f2b9b40a6672 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 16 Apr 2020 18:04:20 -0700 Subject: [PATCH 13/67] Construct XML from FRDR metadata for oai_datacite --- viringo/services/frdr.py | 90 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3047c6d..852df97 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -6,6 +6,7 @@ import dateutil.parser import dateutil.tz from viringo import config +import xml.etree.cElementTree as ET class Metadata: """Represents a DataCite metadata resultset""" @@ -62,6 +63,93 @@ def __init__( self.client = client self.active = active +def construct_datacite_xml(data): + resource = ET.Element("resource") + resource.set("xmlns", "http://datacite.org/schema/kernel-4") + resource.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance") + resource.set("xsi:schemaLocation", + "http://datacite.org/schema/kernel-4 
http://schema.datacite.org/meta/kernel-4/metadata.xsd") + + # Add resource URL as identifier + identifier = ET.SubElement(resource, "identifier") + identifier.set("identifierType", "URL") + identifier.text = data['source_url'] + if data['source_url'] == '': + if data['item_url_pattern'] != '' and "%id%" in data['item_url_pattern'] and data['local_identifier'] != '': + identifier.text = data['item_url_pattern'].replace("%id%", data['local_identifier']) + + # Add creators + creators = ET.SubElement(resource, "creators") + for creator_entry in data['dc:contributor.author']: + creator = ET.SubElement(creators, "creator") + creatorName = ET.SubElement(creator, "creatorName") + creatorName.text = creator_entry + + # Add title + titles = ET.SubElement(resource, "titles") + title = ET.SubElement(titles, "title") + title.text = data['title'] + + # Add publisher + publisher = ET.SubElement(resource, "publisher") + publisher.text = data['repository_name'] + + # Add publication year + publicationyear = ET.SubElement(resource, "publicationyear") + publicationyear.text = data['pub_date'][:4] + + # Add subjects + subject_and_tags = [] + subjects = ET.SubElement(resource, "subjects") + for subject_entry in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: + if subject_entry not in subject_and_tags: + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.text = subject_entry + + # Add dates + dates = ET.SubElement(resource, "dates") + date = ET.SubElement(dates, "date") + date.set("dateType", "Issued") + date.text = data['pub_date'] + + # Add resourceType + resourceType = ET.SubElement(resource, "resourceType") + resourceType.set("resourceTypeGeneral", "Dataset") + resourceType.text = "Dataset" + + # Add alternateIdentifiers + alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") + alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") + alternateIdentifier.set("alternateIdentifierType", "local") + alternateIdentifier.text = data['local_identifier'] + + # Add relatedIdentifiers (series) + if data['series'] != "": + relatedIdentifiers = ET.SubElement(resource, "relatedIdentifiers") + relatedIdentifier = ET.SubElement(relatedIdentifiers, "relatedIdentifier") + relatedIdentifier.set("relationType", "isPartOf") + relatedIdentifier.text = data['series'] + + # Add rightsList + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights']: + rights = ET.SubElement(rightsList, "rights") + rights.text = rights_entry + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() + + # Add description(s) + descriptions = ET.SubElement(resource, "descriptions") + for description_entry in data['dc:description'] + data['frdr:description_fr']: + if description_entry != "": + description = ET.SubElement(descriptions, "description") + description.set("descriptionType", "Abstract") + description.text = description_entry + xml_string = ET.tostring(resource) + print(xml_string) + return xml_string def build_metadata(data): """Parse single FRDR result into metadata object""" @@ -76,7 +164,7 @@ def build_metadata(data): updated = dateutil.parser.parse(data['pub_date']) result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - result.xml = None + result.xml = construct_datacite_xml(data) result.metadata_version = None result.titles = [data['title']] result.creators = 
data['dc:contributor.author'] From fbd7aae603acd350c57b456ff7d65f64931095be Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:42:36 -0700 Subject: [PATCH 14/67] Only include rightsList if there is a rights entry --- viringo/services/frdr.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 852df97..47e2f73 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -132,13 +132,14 @@ def construct_datacite_xml(data): relatedIdentifier.text = data['series'] # Add rightsList - rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights']: - rights = ET.SubElement(rightsList, "rights") - rights.text = rights_entry - if "http" in rights_entry: - rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + if len(data['dc:rights']) > 0: + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights']: + rights = ET.SubElement(rightsList, "rights") + rights.text = rights_entry + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() # Add description(s) descriptions = ET.SubElement(resource, "descriptions") From b39e5f3e675115b944a2e6e9a2c7917a27e988c1 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:43:04 -0700 Subject: [PATCH 15/67] Hardcode "Dataset" for resourcetypes (for Dublin Core) --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 47e2f73..a500acd 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -178,7 +178,7 @@ def build_metadata(data): result.funding_references = data['contact'] result.sizes = [] result.geo_locations = data['frdr:geospatial'] - result.resource_types = [data['frdr:tags']] + result.resource_types = ['Dataset'] result.formats = [] result.identifiers = [] result.language = '' From fde0bd8aee837ed833c3cffa6ff2e8e95eeee58f Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:45:20 -0700 Subject: [PATCH 16/67] Include subjects and tags in dc:subjects, deduplicate when needed --- viringo/services/frdr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index a500acd..7c617f9 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -169,7 +169,14 @@ def build_metadata(data): result.metadata_version = None result.titles = [data['title']] result.creators = data['dc:contributor.author'] - result.subjects = data['dc:subject'] + result.subjects = [] + + # De-duplicate subjects and tags + for subject in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: + if subject not in result.subjects: + result.subjects.append(subject) + + # TODO: Add French description result.descriptions = data['dc:description'] result.publisher = data['dc:publisher'] result.publication_year = dateutil.parser.parse(data['pub_date']).year From 6a074251177dd1719d342df2ea1805a78b57fd86 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:49:50 -0700 Subject: [PATCH 17/67] Use dateutil to parse year for oai_datacite --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 
7c617f9..0803f9e 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -96,7 +96,7 @@ def construct_datacite_xml(data): # Add publication year publicationyear = ET.SubElement(resource, "publicationyear") - publicationyear.text = data['pub_date'][:4] + publicationyear.text = dateutil.parser.parse(data['pub_date']).year # Add subjects subject_and_tags = [] From d9de944145688c34cbe0b549c291ce967451079d Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:50:55 -0700 Subject: [PATCH 18/67] Revert "Use dateutil to parse year for oai_datacite" This reverts commit 6a074251177dd1719d342df2ea1805a78b57fd86. --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 0803f9e..7c617f9 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -96,7 +96,7 @@ def construct_datacite_xml(data): # Add publication year publicationyear = ET.SubElement(resource, "publicationyear") - publicationyear.text = dateutil.parser.parse(data['pub_date']).year + publicationyear.text = data['pub_date'][:4] # Add subjects subject_and_tags = [] From 229ae49f774a061146d4ec328f38423f4478637c Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 14:34:37 -0700 Subject: [PATCH 19/67] Remove contact info (email) from funding_references and remove print for datacite XML --- viringo/services/frdr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 7c617f9..3f56323 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -149,7 +149,6 @@ def construct_datacite_xml(data): description.set("descriptionType", "Abstract") description.text = description_entry xml_string = ET.tostring(resource) - print(xml_string) return xml_string def build_metadata(data): @@ -182,7 +181,7 @@ def build_metadata(data): result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] result.contributors = data['dc:contributor'] - result.funding_references = data['contact'] + result.funding_references = '' result.sizes = [] result.geo_locations = data['frdr:geospatial'] result.resource_types = ['Dataset'] From cf17a3a8ec7bca8aac6fe2e26fd0b4a8f4d8c611 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 15:41:09 -0700 Subject: [PATCH 20/67] Add FRDR as contributor with type HostingInstitution --- viringo/services/frdr.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3f56323..917980c 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -107,6 +107,13 @@ def construct_datacite_xml(data): subject = ET.SubElement(subjects, "subject") subject.text = subject_entry + # Add FRDR as HostingInstituton + contributors = ET.SubElement(resource, "contributors") + contributor = ET.SubElement(contributors, "contributor") + contributor.set("contributorType", "HostingInstitution") + contributorName = ET.SubElement(contributor, "contributorName") + contributorName.text = "Federated Research Data Repository / dépôt fédéré de données de recherche" + # Add dates dates = ET.SubElement(resource, "dates") date = ET.SubElement(dates, "date") From 1c9139258cce45666942f7d96bbdf56f782f34e1 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 15:42:53 -0700 Subject: [PATCH 21/67] Include access in rightsList where applicable. 
Remove container elements (rightsList, descriptions) when empty --- viringo/services/frdr.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 917980c..c781826 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -139,14 +139,17 @@ def construct_datacite_xml(data): relatedIdentifier.text = data['series'] # Add rightsList - if len(data['dc:rights']) > 0: - rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights']: + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights'] + data['frdr:access']: + if rights_entry != '': rights = ET.SubElement(rightsList, "rights") rights.text = rights_entry if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) rights.text = rights_entry[:rights_entry.find("http")].strip() + # If rightsList is empty, remove it + if len(rightsList) == 0: + resource.remove(rightsList) # Add description(s) descriptions = ET.SubElement(resource, "descriptions") @@ -155,6 +158,10 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.text = description_entry + # If descriptions is empty, remove it + if len(descriptions) == 0: + resource.remove(descriptions) + xml_string = ET.tostring(resource) return xml_string From fa46e9272c56623f508ed90a83b92c00f83009f7 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Mon, 20 Apr 2020 09:31:03 -0700 Subject: [PATCH 22/67] always use datacite unless frdr configuration provided --- viringo/oai.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/viringo/oai.py b/viringo/oai.py index 9ef1b7e..5e4fbc7 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -97,13 +97,10 @@ def handleVerb(self, verb, kw): def get_oai_server(): """Returns a pyoai server object that can process and return OAI requests""" if 'oai' not in g: - if config.CATALOG_SET == 'DateCite': - catalog_server = DataCiteOAIServer() - elif config.CATALOG_SET == 'FRDR': + if config.CATALOG_SET == 'FRDR': catalog_server = FRDROAIServer() else: - print('No valid metadata catalog configured') - sys.exit(1) + catalog_server = DataCiteOAIServer() metadata_registry = oaipmh.metadata.MetadataRegistry() metadata_registry.registerWriter('oai_dc', metadata.oai_dc_writer) From 6a49965f12c8da88bab52fd869995a551d2999ad Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Apr 2020 13:47:53 -0700 Subject: [PATCH 23/67] support from and until parameters for frdr --- viringo/catalogs.py | 4 ++++ viringo/services/frdr.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index c103cef..43b46be 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -388,6 +388,8 @@ def listRecords( port=config.POSTGRES_PORT, query=search_query, set=set, + from_datetime=from_, + until_datetime=until, cursor=paging_cursor ) @@ -434,6 +436,8 @@ def listIdentifiers( port=config.POSTGRES_PORT, query=search_query, set=set, + from_datetime=from_, + until_datetime=until, cursor=paging_cursor ) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index c781826..db37697 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -344,6 +344,8 @@ def get_metadata_list( port, query=None, set=None, + from_datetime=None, + until_datetime=None, cursor=None ): @@ -353,6 +355,10 @@ def get_metadata_list( records_sql = 
"""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" + if from_datetime is not None: + records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" + if until_datetime is not None: + records_sql = records_sql + " AND recs.pub_date<'" + until_datetime + "'" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) From 6ab32f16f676f6536a0fc2d93bc91ec7a527bde7 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 23 Apr 2020 13:50:37 -0700 Subject: [PATCH 24/67] Use URL for identifier, remove dx from DOIs --- viringo/services/frdr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index db37697..192b8ad 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -169,7 +169,7 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = data['record_id'] + result.identifier = data['dc:source'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. @@ -234,7 +234,7 @@ def construct_local_url(record): doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) if doi: doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + local_url = re.sub("(doi|DOI):\s?", "https://doi.org/", doi) return local_url # If the item has a source URL, use it From 14a16ab56724bb75071560a0d70b9f39def95119 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Apr 2020 13:54:26 -0700 Subject: [PATCH 25/67] update deletedRecord policy for frdr --- viringo/catalogs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 43b46be..e944bd7 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -294,7 +294,7 @@ def identify(self): protocolVersion="2.0", adminEmails=[config.OAIPMH_ADMIN_EMAIL], earliestDatestamp=datetime(2011, 1, 1), - deletedRecord='persistent', + deletedRecord='no', granularity='YYYY-MM-DDThh:mm:ssZ', compression=['gzip', 'deflate'], toolkit_description=False) From 369a478adcc3d8146ca439ce73780ba3ef239e1b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 7 May 2020 16:38:20 -0700 Subject: [PATCH 26/67] Use database pk for identifiers and add URL to the identifiers list (appears in oai_dc) --- viringo/catalogs.py | 4 +--- viringo/services/frdr.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index e944bd7..4d708c3 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -508,9 +508,7 @@ def build_record(self, metadata): def build_metadata_map(self, result): """Construct a metadata map object for oai metadata writing""" - identifiers = [ - identifier_to_string(identifier) for identifier in result.identifiers - ] + identifiers = result.identifiers relations = [ identifier_to_string(relation) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 192b8ad..397fecf 100644 --- 
a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -169,7 +169,7 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = data['dc:source'] + result.identifier = data['record_id'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. @@ -207,6 +207,8 @@ def build_metadata(data): result.client = data['homepage_url'] result.active = True + result.identifiers.append(construct_local_url(data)) + return result From e2610cfeb09ea8bd752eaea7d28b4cd676920971 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 25 May 2020 17:48:24 -0700 Subject: [PATCH 27/67] Use item_url as OAI identifier with "oai:" prefix --- viringo/services/frdr.py | 66 ++++++++-------------------------------- 1 file changed, 13 insertions(+), 53 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 397fecf..0757764 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -73,10 +73,7 @@ def construct_datacite_xml(data): # Add resource URL as identifier identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "URL") - identifier.text = data['source_url'] - if data['source_url'] == '': - if data['item_url_pattern'] != '' and "%id%" in data['item_url_pattern'] and data['local_identifier'] != '': - identifier.text = data['item_url_pattern'].replace("%id%", data['local_identifier']) + identifier.text = data['item_url'] # Add creators creators = ET.SubElement(resource, "creators") @@ -169,7 +166,7 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = data['record_id'] + result.identifier = "oai:" + data['item_url'] # Add oai: to identifier URL # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. 
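(Editorial aside on the identifier hunk above, not part of the patch: records are now identified by their item URL with an "oai:" prefix; a tiny round-trip illustration with an invented URL is below.)

# Sketch: build an OAI identifier from an item URL and split it back apart.
item_url = "https://www.example.org/dataset/42"     # hypothetical item_url value
oai_identifier = "oai:" + item_url

prefix, recovered = oai_identifier.split(":", 1)    # split on the first ":" only
assert prefix == "oai" and recovered == item_url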
@@ -207,52 +204,11 @@ def build_metadata(data): result.client = data['homepage_url'] result.active = True - result.identifiers.append(construct_local_url(data)) + result.identifiers.append(data['item_url']) return result -def construct_local_url(record): - # Check if the local_identifier has already been turned into a url - if "http" in record["local_identifier"].lower(): - return record["local_identifier"] - - # Check for OAI format of identifier (oai:domain:id) - oai_id = None - oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) - if oai_search: - oai_id = oai_search.group(2) - oai_id = oai_id.replace("_", ":") - - # If given a pattern then substitue in the item ID and return it - if "item_url_pattern" in record and record["item_url_pattern"]: - if oai_id: - local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) - else: - local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) - return local_url - - # Check if the identifier is a DOI - doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) - if doi: - doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://doi.org/", doi) - return local_url - - # If the item has a source URL, use it - if ('source_url' in record) and record['source_url']: - return record['source_url'] - - # URL is in the identifier - local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", - record["local_identifier"]) - if local_url: - return local_url.group(0) - - local_url = None - return local_url - - def rows_to_dict(cursor): newdict = [] if cursor: @@ -266,8 +222,8 @@ def rows_to_dict(cursor): def assemble_record(record, db, user, password, server, port): - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: + + if record["item_url"] is None: return None if int(record["deleted"]) == 1: @@ -354,7 +310,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" if from_datetime is not None: @@ -369,7 +325,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 
'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -382,10 +338,14 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, + recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, + repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, + repos.homepage_url FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id AND recs.item_url =\'""" + identifier[4:] + "\'") # use identifier substring excluding oai: prefix records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) From 36bb95b03b4ec069ba4af0c36e7e94c49b3bb7ce Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 27 May 2020 11:29:03 -0700 Subject: [PATCH 28/67] Capitalize publicationYear --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 0757764..d179360 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -92,7 +92,7 @@ def construct_datacite_xml(data): publisher.text = data['repository_name'] # Add publication year - publicationyear = ET.SubElement(resource, "publicationyear") + publicationyear = ET.SubElement(resource, "publicationYear") publicationyear.text = data['pub_date'][:4] # Add subjects From 174b8708ef5cea2877446c160eabaffb8edb5613 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 27 May 2020 11:29:29 -0700 Subject: [PATCH 29/67] Move series into description with SeriesInformation type --- viringo/services/frdr.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index d179360..dda7a35 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -128,13 +128,6 @@ def construct_datacite_xml(data): alternateIdentifier.set("alternateIdentifierType", 
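The GetRecord lookup above recovers the item URL by slicing off the first four characters of the OAI identifier and splicing the result into the SQL string. A short hedged alternative (helper name illustrative) that checks the prefix explicitly and leaves quoting to the driver:

    def item_url_from_identifier(identifier):
        """Return the item URL from an 'oai:<item_url>' identifier, or None if the prefix is missing."""
        prefix = "oai:"
        return identifier[len(prefix):] if identifier.startswith(prefix) else None

    # The lookup can then bind the value instead of concatenating it, e.g.:
    #   records_cursor.execute("SELECT ... WHERE recs.item_url = %s",
    #                          [item_url_from_identifier(identifier)])
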
"local") alternateIdentifier.text = data['local_identifier'] - # Add relatedIdentifiers (series) - if data['series'] != "": - relatedIdentifiers = ET.SubElement(resource, "relatedIdentifiers") - relatedIdentifier = ET.SubElement(relatedIdentifiers, "relatedIdentifier") - relatedIdentifier.set("relationType", "isPartOf") - relatedIdentifier.text = data['series'] - # Add rightsList rightsList = ET.SubElement(resource, "rightsList") for rights_entry in data['dc:rights'] + data['frdr:access']: @@ -155,6 +148,13 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.text = description_entry + + # Add series (series) + if data['series'] != "": + description_series = ET.SubElement(descriptions, "description") + description_series.set("descriptionType", "SeriesInformation") + description_series.text = data['series'] + # If descriptions is empty, remove it if len(descriptions) == 0: resource.remove(descriptions) From bc8143fcc0013fe1905cf9119745db6b35dae1c6 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 28 May 2020 16:06:52 -0700 Subject: [PATCH 30/67] Put openaire_data set first in list --- viringo/services/frdr.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index dda7a35..251593b 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -356,9 +356,10 @@ def get_sets(db, user, password, server, port): with repos_con: repos_cursor = repos_con.cursor() - repos_cursor.execute("SELECT homepage_url, repository_name from repositories") - results = repos_cursor.fetchall() - + results = [] results.append(['openaire_data', 'OpenAIRE']) - return results, len(results) + repos_cursor.execute("SELECT homepage_url, repository_name from repositories") + results.extend(repos_cursor.fetchall()) + + return results, len(results) \ No newline at end of file From 355c351a3275350a2d6dac56612fbed965c06fc3 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 2 Jul 2020 13:17:39 -0700 Subject: [PATCH 31/67] Only include rights entries that have a URL --- viringo/services/frdr.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 251593b..583bfca 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -130,13 +130,11 @@ def construct_datacite_xml(data): # Add rightsList rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights'] + data['frdr:access']: - if rights_entry != '': + for rights_entry in data['dc:rights']: + if rights_entry != '' and "http" in rights_entry: rights = ET.SubElement(rightsList, "rights") - rights.text = rights_entry - if "http" in rights_entry: - rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() # If rightsList is empty, remove it if len(rightsList) == 0: resource.remove(rightsList) From e28231ff0fd2ff66daba4c1030ef77b367379e7a Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 2 Jul 2020 13:18:28 -0700 Subject: [PATCH 32/67] Update contributor to have xml:lang attribute, separate en/fr --- viringo/services/frdr.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/viringo/services/frdr.py 
b/viringo/services/frdr.py index 583bfca..ee9bde0 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -106,10 +106,16 @@ def construct_datacite_xml(data): # Add FRDR as HostingInstituton contributors = ET.SubElement(resource, "contributors") - contributor = ET.SubElement(contributors, "contributor") - contributor.set("contributorType", "HostingInstitution") - contributorName = ET.SubElement(contributor, "contributorName") - contributorName.text = "Federated Research Data Repository / dépôt fédéré de données de recherche" + contributor_en = ET.SubElement(contributors, "contributor") + contributor_en.set("contributorType", "HostingInstitution") + contributor_en.set("xml:lang", "en") + contributorName_en = ET.SubElement(contributor_en, "contributorName") + contributorName_en.text = "Federated Research Data Repository" + contributor_fr = ET.SubElement(contributors, "contributor") + contributor_fr.set("contributorType", "HostingInstitution") + contributor_fr.set("xml:lang", "fr") + contributorName_fr = ET.SubElement(contributor_fr, "contributorName") + contributorName_fr.text = "Dépôt fédéré de données de recherche" # Add dates dates = ET.SubElement(resource, "dates") From 0e7e3fa6732bf1146e7439c203177d3f9007e21c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 2 Jul 2020 14:06:59 -0700 Subject: [PATCH 33/67] fix resetting cursor position after two pages --- viringo/services/frdr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ee9bde0..1fea665 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -335,7 +335,10 @@ def get_metadata_list( if full_record is not None: results.append(build_metadata(full_record)) - return results, db_cursor.rowcount, len(record_set) + if cursor is not None: + return results, db_cursor.rowcount, (len(record_set) + cursor) + else: + return results, db_cursor.rowcount, len(record_set) def get_metadata(identifier, db, user, password, server, port): From 2eb82e6c409c14b9e3dbc52511cbc5623c65e402 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 13 Jul 2020 11:59:16 -0700 Subject: [PATCH 34/67] Include rights entries that do not have a URL --- viringo/services/frdr.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 1fea665..7fc9f53 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -136,11 +136,14 @@ def construct_datacite_xml(data): # Add rightsList rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights']: - if rights_entry != '' and "http" in rights_entry: + for rights_entry in data['dc:rights'] + data['frdr:access']: + if rights_entry != '': rights = ET.SubElement(rightsList, "rights") - rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + rights.text = rights_entry + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() + # If rightsList is empty, remove it if len(rightsList) == 0: resource.remove(rightsList) From daa5938f54f0d9c2f2bb6cc06125f50592e56e45 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 13 Jul 2020 11:59:43 -0700 Subject: [PATCH 35/67] Add xml:lang for description and subject --- viringo/services/frdr.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 
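The bilingual HostingInstitution entries above rely on setting an attribute literally named xml:lang; ElementTree writes the attribute name verbatim, which is valid XML because the xml prefix is predefined. A self-contained sketch of the same pattern:

    import xml.etree.ElementTree as ET

    contributors = ET.Element("contributors")
    for lang, name in [("en", "Federated Research Data Repository"),
                       ("fr", "Dépôt fédéré de données de recherche")]:
        contributor = ET.SubElement(contributors, "contributor")
        contributor.set("contributorType", "HostingInstitution")
        contributor.set("xml:lang", lang)
        ET.SubElement(contributor, "contributorName").text = name

    print(ET.tostring(contributors, encoding="unicode"))
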
deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 7fc9f53..b4b0586 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -71,6 +71,7 @@ def construct_datacite_xml(data): "http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd") # Add resource URL as identifier + # TODO: Check if the URL is a DOI, reformat and use identifierType="DOI" identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "URL") identifier.text = data['item_url'] @@ -98,10 +99,23 @@ def construct_datacite_xml(data): # Add subjects subject_and_tags = [] subjects = ET.SubElement(resource, "subjects") - for subject_entry in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: - if subject_entry not in subject_and_tags: + for subject_entry in data['dc:subject']: + if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "en") + subject.text = subject_entry + for subject_entry in data['frdr:tags']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "en") + subject.text = subject_entry + for subject_entry in data['frdr:tags_fr']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "fr") subject.text = subject_entry # Add FRDR as HostingInstituton @@ -150,10 +164,17 @@ def construct_datacite_xml(data): # Add description(s) descriptions = ET.SubElement(resource, "descriptions") - for description_entry in data['dc:description'] + data['frdr:description_fr']: + for description_entry in data['dc:description']: + if description_entry != "": + description = ET.SubElement(descriptions, "description") + description.set("descriptionType", "Abstract") + description.set("xml:lang", "en") + description.text = description_entry + for description_entry in data['frdr:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") + description.set("xml:lang", "fr") description.text = description_entry # Add series (series) From 66d8bfe627a5c66397e852f6732e50407bc73e1a Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Mon, 13 Jul 2020 13:35:31 -0700 Subject: [PATCH 36/67] fix parsing single-element list values, make sure cursor is cast as int on reusme --- .gitignore | 4 +++- viringo/metadata.py | 2 ++ viringo/services/frdr.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8c5f805..923fb3b 100644 --- a/.gitignore +++ b/.gitignore @@ -119,4 +119,6 @@ dmypy.json .vscode # Env configs -*.env \ No newline at end of file +*.env + +.DS_Store diff --git a/viringo/metadata.py b/viringo/metadata.py index 4ea2ee8..ff123c6 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -43,6 +43,8 @@ def nsdc(name): ]: for value in _map.get(name, []): if value: + if isinstance(value, list) and len(value) == 1: + value = value[0] new_element = etree.SubElement(e_dc, nsdc(name)) # The regular expression here is to filter only valid XML chars # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index b4b0586..8f297af 100644 --- 
a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -360,7 +360,7 @@ def get_metadata_list( results.append(build_metadata(full_record)) if cursor is not None: - return results, db_cursor.rowcount, (len(record_set) + cursor) + return results, db_cursor.rowcount, (len(record_set) + int(cursor)) else: return results, db_cursor.rowcount, len(record_set) From 7699d722741c792e645f80b870350aad7426e4fc Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 20 Jul 2020 12:32:45 -0700 Subject: [PATCH 37/67] Only include subjects block if not empty --- viringo/services/frdr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 8f297af..9bc764c 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -118,6 +118,10 @@ def construct_datacite_xml(data): subject.set("xml:lang", "fr") subject.text = subject_entry + # If subjects is empty, remove it + if len(subjects) == 0: + resource.remove(subjects) + # Add FRDR as HostingInstituton contributors = ET.SubElement(resource, "contributors") contributor_en = ET.SubElement(contributors, "contributor") From f6efb24a95f6a55513f03d417e7e84388ab9aafb Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 20 Jul 2020 12:34:38 -0700 Subject: [PATCH 38/67] Use OAI-compliant identifier and query database using local_identifier --- viringo/services/frdr.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 9bc764c..379b941 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -198,7 +198,11 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = "oai:" + data['item_url'] # Add oai: to identifier URL + # Construct identifier compliant with OAI spec + namespace = data['homepage_url'].replace("https://", "").replace("www.", "").replace("http://", "") + if namespace[-1] == "/": + namespace = namespace[:-1] + result.identifier = "oai:" + namespace + ":" + data['local_identifier'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. 
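The cursor fixes above make the paging cursor an absolute row offset that each response advances by the size of the batch just returned; it comes back from the resumption token as a string, hence the int() cast. A harvesting-side sketch of that arithmetic, assuming a fetch_page(offset, size) callable that returns (batch, total); the helper is illustrative, not part of the codebase:

    def iterate_records(fetch_page, batch_size=50):
        """Walk every page; the cursor is the absolute offset into the full result set."""
        cursor = 0
        while True:
            batch, total = fetch_page(offset=cursor, size=batch_size)
            yield from batch
            cursor += len(batch)            # mirrors len(record_set) + int(cursor)
            if not batch or cursor >= total:
                break
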
@@ -229,15 +233,13 @@ def build_metadata(data): result.geo_locations = data['frdr:geospatial'] result.resource_types = ['Dataset'] result.formats = [] - result.identifiers = [] + result.identifiers = [data['item_url']] result.language = '' result.relations = [] result.rights = data['dc:rights'] result.client = data['homepage_url'] result.active = True - result.identifiers.append(data['item_url']) - return result @@ -370,6 +372,7 @@ def get_metadata_list( def get_metadata(identifier, db, user, password, server, port): + local_identifier = identifier.split(":")[len(identifier.split(":"))-1] # get local_identifier substring from identifier records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() @@ -377,7 +380,7 @@ def get_metadata(identifier, db, user, password, server, port): recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos - WHERE recs.repository_id = repos.repository_id AND recs.item_url =\'""" + identifier[4:] + "\'") # use identifier substring excluding oai: prefix + WHERE recs.repository_id = repos.repository_id AND recs.local_identifier =\'""" + local_identifier + "\'") # use local_identifier records_cursor.execute(records_sql) row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) From f87f9746648699e56e1aecd97f339d0993e675ab Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 20 Jul 2020 13:00:47 -0700 Subject: [PATCH 39/67] Use info:eu-repo-Access-Terms vocabulary for access metadata (openAccess for Public) --- viringo/services/frdr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 379b941..ac5d035 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -154,13 +154,20 @@ def construct_datacite_xml(data): # Add rightsList rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights'] + data['frdr:access']: + for rights_entry in data['dc:rights']: if rights_entry != '': rights = ET.SubElement(rightsList, "rights") rights.text = rights_entry if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) rights.text = rights_entry[:rights_entry.find("http")].strip() + for access_entry in data["frdr:access"]: + rights = ET.SubElement(rightsList, "rights") + if access_entry == "Public": + rights.text = "info:eu-repo/semantics/openAccess" + else: + rights.text = "info:eu-repo/semantics/restrictedAccess" + # If rightsList is empty, remove it if len(rightsList) == 0: From c1dda75ad3a5a50a5058b61bd86f57f0dfd59fc3 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 14:15:14 -0700 Subject: [PATCH 40/67] Use repo_oai_name in identifier and for setSpec --- viringo/services/frdr.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ac5d035..5287119 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -206,9 
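The OAI-compliant identifier above is derived from the repository homepage URL: the scheme and "www." are stripped, a trailing slash removed, and the remainder used as the namespace in oai:<namespace>:<local_identifier>. A runnable sketch of that construction (the example URL and local identifier are illustrative):

    def oai_identifier(homepage_url, local_identifier):
        """Build an 'oai:<namespace>:<local_identifier>' identifier from a homepage URL."""
        namespace = (homepage_url.replace("https://", "")
                                 .replace("http://", "")
                                 .replace("www.", ""))
        if namespace.endswith("/"):
            namespace = namespace[:-1]
        return "oai:" + namespace + ":" + local_identifier

    # oai_identifier("https://www.example.org/", "handle/123") -> "oai:example.org:handle/123"

A later patch in this series replaces the derived namespace with repos.repo_oai_name.
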
+206,7 @@ def build_metadata(data): result = Metadata() # Construct identifier compliant with OAI spec - namespace = data['homepage_url'].replace("https://", "").replace("www.", "").replace("http://", "") - if namespace[-1] == "/": - namespace = namespace[:-1] + namespace = data['repo_oai_name'] result.identifier = "oai:" + namespace + ":" + data['local_identifier'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely @@ -244,7 +242,7 @@ def build_metadata(data): result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = data['homepage_url'] + result.client = data['repo_oai_name'] result.active = True return result @@ -351,9 +349,9 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': - records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" + records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" if until_datetime is not None: @@ -366,7 +364,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -379,18 +377,20 @@ def get_metadata_list( def get_metadata(identifier, db, user, password, server, port): - local_identifier = identifier.split(":")[len(identifier.split(":"))-1] # get local_identifier substring from identifier + namespace = identifier.split(":")[1] + local_identifier = identifier.split(":")[2] # get local_identifier substring from identifier records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, + records_sql = ("""SELECT recs.record_id, 
recs.title, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, - repos.homepage_url FROM records recs, repositories repos - WHERE recs.repository_id = repos.repository_id AND recs.local_identifier =\'""" + local_identifier + "\'") # use local_identifier + repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id + AND recs.local_identifier =\'""" + local_identifier + "\'" + "AND repos.repo_oai_name=\'""" + namespace + "\'") records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) @@ -404,7 +404,7 @@ def get_sets(db, user, password, server, port): results = [] results.append(['openaire_data', 'OpenAIRE']) - repos_cursor.execute("SELECT homepage_url, repository_name from repositories") + repos_cursor.execute("SELECT repo_oai_name, repository_name from repositories") results.extend(repos_cursor.fetchall()) return results, len(results) \ No newline at end of file From b9750ab7365be491058ca1d832b1925df0f6274e Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 14:50:06 -0700 Subject: [PATCH 41/67] Add French titles, descriptions, and categories; subject/tags renamed to category/keywords --- viringo/services/frdr.py | 62 +++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 5287119..cd073c1 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -83,10 +83,18 @@ def construct_datacite_xml(data): creatorName = ET.SubElement(creator, "creatorName") creatorName.text = creator_entry - # Add title + # Add titles titles = ET.SubElement(resource, "titles") - title = ET.SubElement(titles, "title") - title.text = data['title'] + if data['title_en'] != "": + title = ET.SubElement(titles, "title") + title.text = data['title_en'] + title.set("xml:lang", "en") + if data['title_fr'] != "": + title = ET.SubElement(titles, "title") + title.text = data['title_fr'] + title.set("xml:lang", "fr") + if data['title_en'] != "": + title.set("titleType", "TranslatedTitle") # Add publisher publisher = ET.SubElement(resource, "publisher") @@ -99,19 +107,25 @@ def construct_datacite_xml(data): # Add subjects subject_and_tags = [] subjects = ET.SubElement(resource, "subjects") - for subject_entry in data['dc:subject']: + for subject_entry in data['frdr:category_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") subject.text = subject_entry - for subject_entry in data['frdr:tags']: + for subject_entry in 
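The lookup above splits the identifier on every colon and takes fixed positions, which loses anything after a second colon inside the local identifier; a later patch in the series switches to prefix stripping plus a single find(":") for exactly this reason. A sketch of an equivalent parse using str.partition (function name and example values illustrative):

    def parse_oai_identifier(identifier):
        """Split 'oai:<namespace>:<local_identifier>' into its two parts, allowing the
        local identifier itself to contain colons."""
        body = identifier[len("oai:"):] if identifier.startswith("oai:") else identifier
        namespace, _, local_identifier = body.partition(":")
        return namespace, local_identifier

    # parse_oai_identifier("oai:example.org:doi:10.1234/abcd")
    # -> ('example.org', 'doi:10.1234/abcd')
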
data['frdr:category_fr']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "fr") + subject.text = subject_entry + for subject_entry in data['frdr:keywords_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") subject.text = subject_entry - for subject_entry in data['frdr:tags_fr']: + for subject_entry in data['frdr:keywords_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") @@ -175,13 +189,13 @@ def construct_datacite_xml(data): # Add description(s) descriptions = ET.SubElement(resource, "descriptions") - for description_entry in data['dc:description']: + for description_entry in data['dc:description_en']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "en") description.text = description_entry - for description_entry in data['frdr:description_fr']: + for description_entry in data['dc:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") @@ -218,17 +232,16 @@ def build_metadata(data): result.xml = construct_datacite_xml(data) result.metadata_version = None - result.titles = [data['title']] + result.titles = [data['title_en'], data['title_fr']] result.creators = data['dc:contributor.author'] result.subjects = [] # De-duplicate subjects and tags - for subject in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: + for subject in data['frdr:category_en'] + data['frdr:category_fr'] + data['frdr:keywords_en'] + data['frdr:keywords_fr']: if subject not in result.subjects: result.subjects.append(subject) - # TODO: Add French description - result.descriptions = data['dc:description'] + result.descriptions = data['dc:description_en'] + data['dc:description_fr'] result.publisher = data['dc:publisher'] result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] @@ -268,7 +281,7 @@ def assemble_record(record, db, user, password, server, port): if int(record["deleted"]) == 1: return None - if (len(record['title']) == 0): + if (len(record['title_en']) == 0 and len(record['title_fr']) == 0): return None con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) @@ -306,8 +319,11 @@ def assemble_record(record, db, user, password, server, port): lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_id"]]) record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s""", [record["record_id"]]) - record["dc:subject"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s and subjects.language = 
'en' """, [record["record_id"]]) + record["frdr:category_en"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s and subjects.language = 'fr' """, [record["record_id"]]) + record["frdr:category_fr"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=%s""", [record["record_id"]]) record["dc:publisher"] = rows_to_dict(lookup_cur) @@ -316,16 +332,16 @@ def assemble_record(record, db, user, password, server, port): record["dc:rights"] = rows_to_dict(lookup_cur) lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='en' ", [record["record_id"]]) - record["dc:description"] = rows_to_dict(lookup_cur) + record["dc:description_en"] = rows_to_dict(lookup_cur) lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='fr' ", [record["record_id"]]) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) + record["dc:description_fr"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'en' """, [record["record_id"]]) - record["frdr:tags"] = rows_to_dict(lookup_cur) + record["frdr:keywords_en"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'fr' """, [record["record_id"]]) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + record["frdr:keywords_fr"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=%s""", [record["record_id"]]) record["frdr:access"] = rows_to_dict(lookup_cur) @@ -349,7 +365,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: @@ -364,7 +380,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 
'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -382,7 +398,7 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.series, recs.source_url, + records_sql = ("""SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos @@ -390,7 +406,7 @@ def get_metadata(identifier, db, user, password, server, port): AND recs.local_identifier =\'""" + local_identifier + "\'" + "AND repos.repo_oai_name=\'""" + namespace + "\'") records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) From ec43f700342146bf892106b0a3c0a394cce2a623 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 15:04:31 -0700 Subject: [PATCH 42/67] Add DOI as identifier when available --- viringo/services/frdr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index cd073c1..cb68fbe 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -71,10 +71,14 @@ def construct_datacite_xml(data): "http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd") # Add resource URL as identifier - # TODO: Check if the URL is a DOI, reformat and use identifierType="DOI" identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "URL") identifier.text = data['item_url'] + if "doi.org/" in data['item_url']: + identifier = ET.SubElement(resource, "identifier") + identifier.set("identifierType", "DOI") + identifier.text = data['item_url'].split("doi.org/")[1] + # Add creators creators = ET.SubElement(resource, "creators") From 6a7b5838f1e59aac16c18857bad55408b7c173ca Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 15:10:15 -0700 Subject: [PATCH 43/67] Only include one identifier 
(URL or DOI) --- viringo/services/frdr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index cb68fbe..153169c 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -72,12 +72,12 @@ def construct_datacite_xml(data): # Add resource URL as identifier identifier = ET.SubElement(resource, "identifier") - identifier.set("identifierType", "URL") - identifier.text = data['item_url'] if "doi.org/" in data['item_url']: - identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "DOI") identifier.text = data['item_url'].split("doi.org/")[1] + else: + identifier.set("identifierType", "URL") + identifier.text = data['item_url'] # Add creators From 2666785aa940f43fb3746f782fbf839162eb46c4 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 16:04:27 -0700 Subject: [PATCH 44/67] Fix issue where local_identifiers with colons weren't working --- viringo/services/frdr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 153169c..ad34362 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -397,8 +397,9 @@ def get_metadata_list( def get_metadata(identifier, db, user, password, server, port): - namespace = identifier.split(":")[1] - local_identifier = identifier.split(":")[2] # get local_identifier substring from identifier + identifier = identifier[4:] + namespace = identifier[:identifier.find(":")] + local_identifier = identifier[identifier.find(":")+1:] records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() From 6504bcefc5ba770ccd438c97c7066c3dd40494ab Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 16:10:25 -0700 Subject: [PATCH 45/67] Add contributors to XML metadata (type is unknown, use "Other") --- viringo/services/frdr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ad34362..ba1ea15 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -140,8 +140,15 @@ def construct_datacite_xml(data): if len(subjects) == 0: resource.remove(subjects) - # Add FRDR as HostingInstituton + # Add contributors (contributorType "Other") contributors = ET.SubElement(resource, "contributors") + for contributor_entry in data["dc:contributor"]: + contributor = ET.SubElement(contributors, "contributor") + contributor.set("contributorType", "Other") + contributorName = ET.SubElement(contributor, "contributorName") + contributorName.text = contributor_entry + + # Add FRDR as HostingInstituton contributor_en = ET.SubElement(contributors, "contributor") contributor_en.set("contributorType", "HostingInstitution") contributor_en.set("xml:lang", "en") From ca5df37ced0830e9da6c7b8d658c3de4df5704f0 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 16:22:23 -0700 Subject: [PATCH 46/67] Add openAccess statement for records without explicit access statement (these are public) --- viringo/services/frdr.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ba1ea15..6a44f3f 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -186,17 +186,16 @@ def construct_datacite_xml(data): if "http" in rights_entry: rights.set("rightsURI", 
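With the patches above, the DataCite record carries exactly one primary identifier: the DOI when the landing URL is a doi.org link, otherwise the URL itself. A sketch of that decision as a helper; the name and example values are illustrative:

    def datacite_identifier(item_url):
        """Return (identifierType, value) following the one-identifier rule above."""
        if "doi.org/" in item_url:
            return "DOI", item_url.split("doi.org/", 1)[1]
        return "URL", item_url

    # datacite_identifier("https://doi.org/10.1234/abcd") -> ('DOI', '10.1234/abcd')
    # datacite_identifier("https://example.org/item/42")  -> ('URL', 'https://example.org/item/42')
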
rights_entry[rights_entry.find("http"):].strip()) rights.text = rights_entry[:rights_entry.find("http")].strip() - for access_entry in data["frdr:access"]: + if len(data["frdr:access"]) > 0: + for access_entry in data["frdr:access"]: + rights = ET.SubElement(rightsList, "rights") + if access_entry == "Public": + rights.text = "info:eu-repo/semantics/openAccess" + else: + rights.text = "info:eu-repo/semantics/restrictedAccess" + else: # Assume Public/openAccess rights = ET.SubElement(rightsList, "rights") - if access_entry == "Public": - rights.text = "info:eu-repo/semantics/openAccess" - else: - rights.text = "info:eu-repo/semantics/restrictedAccess" - - - # If rightsList is empty, remove it - if len(rightsList) == 0: - resource.remove(rightsList) + rights.text = "info:eu-repo/semantics/openAccess" # Add description(s) descriptions = ET.SubElement(resource, "descriptions") From 35fab8f390c1ff1c6ea3eed4fa98edc0fd2ba38b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 4 Aug 2020 15:25:57 -0700 Subject: [PATCH 47/67] Use rightsURI for eu-repo/semantics terms --- viringo/services/frdr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 6a44f3f..b3f25dd 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -190,12 +190,12 @@ def construct_datacite_xml(data): for access_entry in data["frdr:access"]: rights = ET.SubElement(rightsList, "rights") if access_entry == "Public": - rights.text = "info:eu-repo/semantics/openAccess" + rights.set("rightsURI", "info:eu-repo/semantics/openAccess") else: - rights.text = "info:eu-repo/semantics/restrictedAccess" + rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") else: # Assume Public/openAccess rights = ET.SubElement(rightsList, "rights") - rights.text = "info:eu-repo/semantics/openAccess" + rights.set("rightsURI", "info:eu-repo/semantics/openAccess") # Add description(s) descriptions = ET.SubElement(resource, "descriptions") From 1fee002f682a3eae9a1b9d3449fe47c849d877aa Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 10 Aug 2020 14:26:58 -0700 Subject: [PATCH 48/67] Only retrieve records with pub_date --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index b3f25dd..11b2868 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -375,7 +375,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND 
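The access handling above maps FRDR access values onto the info:eu-repo Access-Terms vocabulary, carried in rightsURI, with openAccess assumed when a record has no explicit access statement. A sketch of that mapping (function name illustrative):

    import xml.etree.ElementTree as ET

    def add_access_rights(rights_list, access_entries):
        """Map FRDR access values to info:eu-repo rightsURI terms; treat records
        without an access statement as Public/openAccess."""
        for entry in (access_entries or ["Public"]):
            rights = ET.SubElement(rights_list, "rights")
            uri = ("info:eu-repo/semantics/openAccess" if entry == "Public"
                   else "info:eu-repo/semantics/restrictedAccess")
            rights.set("rightsURI", uri)
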
recs.pub_date!=''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: From b07edd27ede51542a77499f9461574e1cda6a0b7 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 10 Sep 2020 11:05:25 -0700 Subject: [PATCH 49/67] fix null value handling and order by record ids to facilitate debugging --- Pipfile | 1 + viringo/metadata.py | 8 +++++--- viringo/services/frdr.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Pipfile b/Pipfile index 679071b..ba09f38 100644 --- a/Pipfile +++ b/Pipfile @@ -13,6 +13,7 @@ python-dotenv = "*" [packages] flask = "*" +ftfy = "*" pyoai = "*" psycopg2-binary = "*" requests = "*" diff --git a/viringo/metadata.py b/viringo/metadata.py index ff123c6..ba195ee 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -1,6 +1,7 @@ """This module deals with handling the representation of metadata formats for OAI""" import re +import ftfy from lxml import etree NS_OAIPMH = 'http://www.openarchives.org/OAI/2.0/' @@ -46,9 +47,10 @@ def nsdc(name): if isinstance(value, list) and len(value) == 1: value = value[0] new_element = etree.SubElement(e_dc, nsdc(name)) - # The regular expression here is to filter only valid XML chars - # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', value) + if value is not None: + new_element.text = ftfy.fix_text(value) + else: + new_element.text = '' def datacite_writer(element: etree.Element, metadata): """Writer for writing data in a metadata object out into raw datacite format""" diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 11b2868..dbca2aa 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -375,13 +375,14 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date!=''""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" if until_datetime is not None: records_sql = records_sql + " AND recs.pub_date<'" + until_datetime + "'" + records_sql = records_sql + " ORDER BY recs.record_id" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) From 
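The oai_dc writer above now runs every value through ftfy.fix_text, which repairs mojibake (text decoded with the wrong encoding somewhere upstream) rather than merely stripping characters that are invalid in XML, and the added None check keeps null metadata values from raising. A quick illustration of what fix_text repairs, using an assumed mis-decoded French string:

    import ftfy

    # UTF-8 bytes that were decoded as Latin-1 at some point upstream:
    print(ftfy.fix_text("dÃ©pÃ´t fÃ©dÃ©rÃ© de donnÃ©es de recherche"))
    # -> dépôt fédéré de données de recherche
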
6523551c077607df8b6a679fbe73477a5c55a17f Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 10 Sep 2020 11:57:17 -0700 Subject: [PATCH 50/67] fix reporting totals and paging at end of listrecords --- Pipfile.lock | 6 ++++++ viringo/catalogs.py | 5 +++++ viringo/services/frdr.py | 11 ++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 9bf8991..7d9a934 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -66,6 +66,12 @@ "index": "pypi", "version": "==1.1.1" }, + "ftfy": { + "hashes": [ + "sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720" + ], + "version": "==5.8" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 4d708c3..87664f4 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -374,6 +374,7 @@ def listRecords( set=None, paging_cursor=None ): + #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for list of records""" @@ -393,6 +394,10 @@ def listRecords( cursor=paging_cursor ) + batch_size = 50 + if len(results) <= batch_size: + paging_cursor = None + records = [] if results: for result in results: diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index dbca2aa..180171e 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -375,7 +375,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: @@ -388,19 +388,24 @@ def get_metadata_list( db_cursor.execute(records_sql) record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) + full_count = 0 results = [] for row in record_set: record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + # This is goofy, but full_count isn't always returned for empty results + if int(row[-1]) != 0: + full_count = row[-1] + full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: results.append(build_metadata(full_record)) if cursor is not None: - return results, db_cursor.rowcount, 
(len(record_set) + int(cursor)) + return results, full_count, (len(record_set) + int(cursor)) else: - return results, db_cursor.rowcount, len(record_set) + return results, full_count, len(record_set) def get_metadata(identifier, db, user, password, server, port): From d78621415df3b361dd7fd6061d42a925860f860c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Fri, 11 Sep 2020 10:52:20 -0700 Subject: [PATCH 51/67] forgot to sideload ftfy dependencies into pipfile lock --- Pipfile.lock | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Pipfile.lock b/Pipfile.lock index 7d9a934..3a08a9b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -278,6 +278,13 @@ "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], "version": "==1.0.0" + }, + "wcwidth": { + "hashes": [ + "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83", + "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784" + ], + "version": "==0.2.5" } }, "develop": { From d11a5e0ddf947c955b8c528ddd3fe85517e6c430 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Fri, 11 Sep 2020 11:08:25 -0700 Subject: [PATCH 52/67] revert less than or equals test for batch sizes --- viringo/catalogs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 87664f4..504e21a 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -395,7 +395,7 @@ def listRecords( ) batch_size = 50 - if len(results) <= batch_size: + if len(results) < batch_size: paging_cursor = None records = [] From 92fe12cb7b245bcdac2ac51b90d441a597c5ca5b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 16 Sep 2020 10:41:01 -0700 Subject: [PATCH 53/67] Continue iteration when there are fewer than 50 records per page; only stop when the total_records is exceeded --- viringo/catalogs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 504e21a..cdc9c4b 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -394,8 +394,7 @@ def listRecords( cursor=paging_cursor ) - batch_size = 50 - if len(results) < batch_size: + if paging_cursor >= total_records: paging_cursor = None records = [] From e2636579f5305e74bc96bd944094ae352b70f900 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Sep 2020 15:48:19 -0700 Subject: [PATCH 54/67] Use ftfy.fix_text in XML for oai_datacite and datacite --- viringo/services/frdr.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 180171e..3c626f4 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -7,6 +7,7 @@ import dateutil.tz from viringo import config import xml.etree.cElementTree as ET +import ftfy class Metadata: """Represents a DataCite metadata resultset""" @@ -74,10 +75,10 @@ def construct_datacite_xml(data): identifier = ET.SubElement(resource, "identifier") if "doi.org/" in data['item_url']: identifier.set("identifierType", "DOI") - identifier.text = data['item_url'].split("doi.org/")[1] + identifier.text = ftfy.fix_text(data['item_url'].split("doi.org/")[1]) else: identifier.set("identifierType", "URL") - identifier.text = data['item_url'] + identifier.text = ftfy.fix_text(data['item_url']) # Add creators @@ -85,28 +86,28 @@ def construct_datacite_xml(data): for creator_entry in data['dc:contributor.author']: creator = ET.SubElement(creators, "creator") creatorName = ET.SubElement(creator, "creatorName") - 
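count(*) OVER() above attaches the total number of rows matching the WHERE clause to every row of the page; because window functions are evaluated before OFFSET, each page reports the same complete-list size, which db_cursor.rowcount could not provide consistently once an offset was applied. A hedged sketch of reading it back, assuming a psycopg2 connection and using a trimmed-down column list:

    def fetch_page_with_total(conn, offset=0, size=50):
        """Return one page of record ids plus the total matching-row count."""
        sql = ("SELECT recs.record_id, count(*) OVER() AS full_count "
               "FROM records recs WHERE recs.pub_date != '' "
               "ORDER BY recs.record_id OFFSET %s")
        with conn.cursor() as cur:
            cur.execute(sql, [offset])
            rows = cur.fetchmany(size)
        total = rows[0][-1] if rows else 0   # same value on every row of the page
        return rows, total
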
creatorName.text = creator_entry + creatorName.text = ftfy.fix_text(creator_entry) # Add titles titles = ET.SubElement(resource, "titles") if data['title_en'] != "": title = ET.SubElement(titles, "title") - title.text = data['title_en'] + title.text = ftfy.fix_text(data['title_en']) title.set("xml:lang", "en") if data['title_fr'] != "": title = ET.SubElement(titles, "title") - title.text = data['title_fr'] + title.text = ftfy.fix_text(data['title_fr']) title.set("xml:lang", "fr") if data['title_en'] != "": title.set("titleType", "TranslatedTitle") # Add publisher publisher = ET.SubElement(resource, "publisher") - publisher.text = data['repository_name'] + publisher.text = ftfy.fix_text(data['repository_name']) # Add publication year publicationyear = ET.SubElement(resource, "publicationYear") - publicationyear.text = data['pub_date'][:4] + publicationyear.text = ftfy.fix_text(data['pub_date'][:4]) # Add subjects subject_and_tags = [] @@ -116,25 +117,25 @@ def construct_datacite_xml(data): subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) for subject_entry in data['frdr:category_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) for subject_entry in data['frdr:keywords_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) for subject_entry in data['frdr:keywords_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) # If subjects is empty, remove it if len(subjects) == 0: @@ -146,7 +147,7 @@ def construct_datacite_xml(data): contributor = ET.SubElement(contributors, "contributor") contributor.set("contributorType", "Other") contributorName = ET.SubElement(contributor, "contributorName") - contributorName.text = contributor_entry + contributorName.text = ftfy.fix_text(contributor_entry) # Add FRDR as HostingInstituton contributor_en = ET.SubElement(contributors, "contributor") @@ -164,7 +165,7 @@ def construct_datacite_xml(data): dates = ET.SubElement(resource, "dates") date = ET.SubElement(dates, "date") date.set("dateType", "Issued") - date.text = data['pub_date'] + date.text = ftfy.fix_text(data['pub_date']) # Add resourceType resourceType = ET.SubElement(resource, "resourceType") @@ -175,17 +176,17 @@ def construct_datacite_xml(data): alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") alternateIdentifier.set("alternateIdentifierType", "local") - alternateIdentifier.text = data['local_identifier'] + alternateIdentifier.text = ftfy.fix_text(data['local_identifier']) # Add rightsList rightsList = ET.SubElement(resource, "rightsList") for rights_entry in data['dc:rights']: if rights_entry != '': rights = ET.SubElement(rightsList, "rights") - rights.text = rights_entry + rights.text = ftfy.fix_text(rights_entry) if "http" in rights_entry: 
rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + rights.text = ftfy.fix_text(rights_entry[:rights_entry.find("http")].strip()) if len(data["frdr:access"]) > 0: for access_entry in data["frdr:access"]: rights = ET.SubElement(rightsList, "rights") @@ -204,19 +205,19 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "en") - description.text = description_entry + description.text = ftfy.fix_text(description_entry) for description_entry in data['dc:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "fr") - description.text = description_entry + description.text = ftfy.fix_text(description_entry) # Add series (series) if data['series'] != "": description_series = ET.SubElement(descriptions, "description") description_series.set("descriptionType", "SeriesInformation") - description_series.text = data['series'] + description_series.text = ftfy.fix_text(data['series']) # If descriptions is empty, remove it if len(descriptions) == 0: From 34a27796963b4dd8b2bca326f28b154b60d06a69 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Sep 2020 16:43:25 -0700 Subject: [PATCH 55/67] helper function for fixing xml to check if none/zero length --- viringo/services/frdr.py | 44 +++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3c626f4..e5583c9 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -64,6 +64,12 @@ def __init__( self.client = client self.active = active +def xml_fix_text(text): + if isinstance(text, str) and len(text) > 0: + return ftfy.fix_text(text) + else: + return '' + def construct_datacite_xml(data): resource = ET.Element("resource") resource.set("xmlns", "http://datacite.org/schema/kernel-4") @@ -75,10 +81,10 @@ def construct_datacite_xml(data): identifier = ET.SubElement(resource, "identifier") if "doi.org/" in data['item_url']: identifier.set("identifierType", "DOI") - identifier.text = ftfy.fix_text(data['item_url'].split("doi.org/")[1]) + identifier.text = xml_fix_text(data['item_url'].split("doi.org/")[1]) else: identifier.set("identifierType", "URL") - identifier.text = ftfy.fix_text(data['item_url']) + identifier.text = xml_fix_text(data['item_url']) # Add creators @@ -86,28 +92,28 @@ def construct_datacite_xml(data): for creator_entry in data['dc:contributor.author']: creator = ET.SubElement(creators, "creator") creatorName = ET.SubElement(creator, "creatorName") - creatorName.text = ftfy.fix_text(creator_entry) + creatorName.text = xml_fix_text(creator_entry) # Add titles titles = ET.SubElement(resource, "titles") if data['title_en'] != "": title = ET.SubElement(titles, "title") - title.text = ftfy.fix_text(data['title_en']) + title.text = xml_fix_text(data['title_en']) title.set("xml:lang", "en") if data['title_fr'] != "": title = ET.SubElement(titles, "title") - title.text = ftfy.fix_text(data['title_fr']) + title.text = xml_fix_text(data['title_fr']) title.set("xml:lang", "fr") if data['title_en'] != "": title.set("titleType", "TranslatedTitle") # Add publisher publisher = ET.SubElement(resource, "publisher") - publisher.text = ftfy.fix_text(data['repository_name']) + publisher.text = 
xml_fix_text(data['repository_name']) # Add publication year publicationyear = ET.SubElement(resource, "publicationYear") - publicationyear.text = ftfy.fix_text(data['pub_date'][:4]) + publicationyear.text = xml_fix_text(data['pub_date'][:4]) # Add subjects subject_and_tags = [] @@ -117,25 +123,25 @@ def construct_datacite_xml(data): subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) for subject_entry in data['frdr:category_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) for subject_entry in data['frdr:keywords_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) for subject_entry in data['frdr:keywords_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) # If subjects is empty, remove it if len(subjects) == 0: @@ -147,7 +153,7 @@ def construct_datacite_xml(data): contributor = ET.SubElement(contributors, "contributor") contributor.set("contributorType", "Other") contributorName = ET.SubElement(contributor, "contributorName") - contributorName.text = ftfy.fix_text(contributor_entry) + contributorName.text = xml_fix_text(contributor_entry) # Add FRDR as HostingInstituton contributor_en = ET.SubElement(contributors, "contributor") @@ -165,7 +171,7 @@ def construct_datacite_xml(data): dates = ET.SubElement(resource, "dates") date = ET.SubElement(dates, "date") date.set("dateType", "Issued") - date.text = ftfy.fix_text(data['pub_date']) + date.text = xml_fix_text(data['pub_date']) # Add resourceType resourceType = ET.SubElement(resource, "resourceType") @@ -176,17 +182,17 @@ def construct_datacite_xml(data): alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") alternateIdentifier.set("alternateIdentifierType", "local") - alternateIdentifier.text = ftfy.fix_text(data['local_identifier']) + alternateIdentifier.text = xml_fix_text(data['local_identifier']) # Add rightsList rightsList = ET.SubElement(resource, "rightsList") for rights_entry in data['dc:rights']: if rights_entry != '': rights = ET.SubElement(rightsList, "rights") - rights.text = ftfy.fix_text(rights_entry) + rights.text = xml_fix_text(rights_entry) if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = ftfy.fix_text(rights_entry[:rights_entry.find("http")].strip()) + rights.text = xml_fix_text(rights_entry[:rights_entry.find("http")].strip()) if len(data["frdr:access"]) > 0: for access_entry in data["frdr:access"]: rights = ET.SubElement(rightsList, "rights") @@ -205,19 +211,19 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "en") - 
description.text = ftfy.fix_text(description_entry) + description.text = xml_fix_text(description_entry) for description_entry in data['dc:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "fr") - description.text = ftfy.fix_text(description_entry) + description.text = xml_fix_text(description_entry) # Add series (series) if data['series'] != "": description_series = ET.SubElement(descriptions, "description") description_series.set("descriptionType", "SeriesInformation") - description_series.text = ftfy.fix_text(data['series']) + description_series.text = xml_fix_text(data['series']) # If descriptions is empty, remove it if len(descriptions) == 0: From dafcbad7bd5aaa48dd92d0ab10ff251064320f95 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 23 Sep 2020 14:51:28 -0700 Subject: [PATCH 56/67] Add openAccess or restrictedAccess flag to oai_dc for Primo --- viringo/services/frdr.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index e5583c9..9bfd918 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -275,6 +275,20 @@ def build_metadata(data): result.client = data['repo_oai_name'] result.active = True + # Add openAccess or restrictedAccess indicator to dc:rights + if len(data["frdr:access"]) > 0: + for access_entry in data["frdr:access"]: + # If Public in frdr:access, use openAccess + if access_entry == "Public": + result.rights.append("openAccess") + break + if "openAccess" not in result.rights: + # If there are access values and none are Public, use restrictedAccess + result.rights.append("restrictedAccess") + else: + # If not indicated, assume Public/openAccess + result.rights.append("openAccess") + return result From ec2f12ba007a703b3d7445664635e05df5a838be Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 10:52:03 -0700 Subject: [PATCH 57/67] Replace form feed chars (\x0c) with space --- viringo/metadata.py | 11 +++++++++-- viringo/services/frdr.py | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/viringo/metadata.py b/viringo/metadata.py index ba195ee..f8f323b 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -48,7 +48,11 @@ def nsdc(name): value = value[0] new_element = etree.SubElement(e_dc, nsdc(name)) if value is not None: - new_element.text = ftfy.fix_text(value) + try: + value = value.replace('\x0c', " ") + new_element.text = ftfy.fix_text(value) + except: + print(value) else: new_element.text = '' @@ -66,7 +70,10 @@ def oai_datacite_writer(element: etree.Element, metadata): _map = metadata.getMap() raw_xml = _map.get('xml', '') - xml_resource_element = etree.fromstring(raw_xml) + try: + xml_resource_element = etree.fromstring(raw_xml) + except: + print(raw_xml) e_oai_datacite = etree.SubElement( element, "oai_datacite", {'xmlns': 'http://schema.datacite.org/oai/oai-1.1/'}, diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 9bfd918..b2478e7 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -66,6 +66,7 @@ def __init__( def xml_fix_text(text): if isinstance(text, str) and len(text) > 0: + text = text.replace('\x0c', " ") return ftfy.fix_text(text) else: return '' From edd4fe0958a4244c05ff8fb3afffddcb82e54eef Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 17:04:54 -0700 Subject: [PATCH 58/67] Use repository_name for publisher field in oai_dc (matches 
oai_datacite) --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index b2478e7..ee8ddb5 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -260,7 +260,7 @@ def build_metadata(data): result.subjects.append(subject) result.descriptions = data['dc:description_en'] + data['dc:description_fr'] - result.publisher = data['dc:publisher'] + result.publisher = data['repository_name'] result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] result.contributors = data['dc:contributor'] From 1404d231ab857c830915fcec49d84d2f3190e7c7 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 17:06:28 -0700 Subject: [PATCH 59/67] Ensure that only strings are passed to ftfy.fix_text and catch exceptions --- viringo/metadata.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/viringo/metadata.py b/viringo/metadata.py index f8f323b..5c2f6d0 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -44,15 +44,18 @@ def nsdc(name): ]: for value in _map.get(name, []): if value: - if isinstance(value, list) and len(value) == 1: - value = value[0] + if isinstance(value, list): + if len(value) == 1: + value = value[0] + else: + value = str(value) new_element = etree.SubElement(e_dc, nsdc(name)) - if value is not None: + if isinstance(value, str): try: value = value.replace('\x0c', " ") new_element.text = ftfy.fix_text(value) except: - print(value) + new_element.text = '' else: new_element.text = '' From 7e6ac1231c8db2f566d084f2ccbf8fbafd93c0c5 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 17:08:56 -0700 Subject: [PATCH 60/67] Exclude deleted records and records without item_url from selection --- viringo/services/frdr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ee8ddb5..ffaf422 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -397,7 +397,12 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, + repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, + repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id AND recs.deleted!=1 AND recs.item_url!='' AND + recs.pub_date != ''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: From 2a84e4ea2a8d0dd57bd703fdaaf131789bcd2448 Mon Sep 17 00:00:00 
2001 From: Kelly Stathis Date: Mon, 9 Nov 2020 08:57:59 -0800 Subject: [PATCH 61/67] Only set one info:eu-repo/semantics access statement (still needs testing) --- viringo/services/frdr.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ffaf422..cb68956 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -194,15 +194,19 @@ def construct_datacite_xml(data): if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) rights.text = xml_fix_text(rights_entry[:rights_entry.find("http")].strip()) + # Add access statement + rights = ET.SubElement(rightsList, "rights") if len(data["frdr:access"]) > 0: for access_entry in data["frdr:access"]: - rights = ET.SubElement(rightsList, "rights") + # If Public in frdr:access, use openAccess if access_entry == "Public": rights.set("rightsURI", "info:eu-repo/semantics/openAccess") - else: - rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") - else: # Assume Public/openAccess - rights = ET.SubElement(rightsList, "rights") + break + if "rightsURI" not in rights.attrib: + # If there are access values and none are Public, use restrictedAccess + rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") + else: + # If not indicated, assume Public/openAccess rights.set("rightsURI", "info:eu-repo/semantics/openAccess") # Add description(s) From 1cc56bd2cab9fd786445c781de1388a1986e4cfc Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 15 Dec 2020 14:52:56 -0800 Subject: [PATCH 62/67] Parse pub_date to YYYY-MM-DD format from datetime --- viringo/services/frdr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index cb68956..ea69699 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -410,9 +410,9 @@ def get_metadata_list( if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: - records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" + records_sql = records_sql + " AND recs.pub_date>='" + from_datetime.strftime('%Y-%M-%D') + "'" if until_datetime is not None: - records_sql = records_sql + " AND recs.pub_date<'" + until_datetime + "'" + records_sql = records_sql + " AND recs.pub_date<'" + until_datetime.strftime('%Y-%M-%D') + "'" records_sql = records_sql + " ORDER BY recs.record_id" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor From 75df50669a567bf1e063ee1863dba0b2bd36752b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 15 Dec 2020 14:59:39 -0800 Subject: [PATCH 63/67] Switch from pub_date to upstream_modified_timestamp --- viringo/services/frdr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ea69699..ad0a19f 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -410,9 +410,11 @@ def get_metadata_list( if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: - records_sql = records_sql + " AND recs.pub_date>='" + from_datetime.strftime('%Y-%M-%D') + "'" + from_timestamp = int(datetime.timestamp(from_datetime)) + records_sql = records_sql + " AND recs.upstream_modified_timestamp>=" + str(from_timestamp) if until_datetime is not None: - records_sql = records_sql + " 
AND recs.pub_date<'" + until_datetime.strftime('%Y-%M-%D') + "'" + until_timestamp = int(datetime.timestamp(until_datetime)) + records_sql = records_sql + " AND recs.upstream_modified_timestamp<" + str(until_timestamp) records_sql = records_sql + " ORDER BY recs.record_id" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor From 30b9a1a75dca12260efdbb9804f4f6e3f72d9062 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Dec 2020 18:18:51 -0800 Subject: [PATCH 64/67] Add GeoLocation metadata to oai_datacite format --- viringo/services/frdr.py | 90 +++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 19 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ad0a19f..ec09579 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -234,6 +234,41 @@ def construct_datacite_xml(data): if len(descriptions) == 0: resource.remove(descriptions) + # Add GeoLocation + geolocations = ET.SubElement(resource, "geoLocations") + if "geoLocationBox" in data["datacite_geoLocation"]: + for geobbox in data["datacite_geoLocation"]["geoLocationBox"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geolocationBox = ET.SubElement(geolocation, "geolocationBox") + geolocationBox.text = xml_fix_text(str(geobbox["southBoundLatitude"]) + " " + str(geobbox["westBoundLongitude"]) + " " + + str(geobbox["northBoundLatitude"]) + " " + str(geobbox["eastBoundLongitude"])) + + if "geoLocationPoint" in data["datacite_geoLocation"]: + for geopoint in data["datacite_geoLocation"]["geoLocationPoint"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geoLocationPoint = ET.SubElement(geolocation, "geoLocationPoint") + geoLocationPoint.text = xml_fix_text(str(geopoint["pointLatitude"]) + " " + str(geopoint["pointLongitude"])) + + if "geoLocationPlace" in data["datacite_geoLocation"]: + for geoplace in data["datacite_geoLocation"]["geoLocationPlace"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geoLocationPlace = ET.SubElement(geolocation, "geoLocationPlace") + components = [] + if geoplace["place_name"]: + components.append(geoplace["place_name"]) + if geoplace["additional"]: + components.append(geoplace["additional"]) + if geoplace["city"]: + components.append(geoplace["city"]) + if geoplace["province_state"]: + components.append(geoplace["province_state"]) + if geoplace["country"]: + components.append(geoplace["country"]) + geoLocationPlace.text = xml_fix_text("; ".join(components)) + + if len(geolocations) == 0: + resource.remove(geolocations) + xml_string = ET.tostring(resource) return xml_string @@ -270,7 +305,7 @@ def build_metadata(data): result.contributors = data['dc:contributor'] result.funding_references = '' result.sizes = [] - result.geo_locations = data['frdr:geospatial'] + result.geo_locations = [] result.resource_types = ['Dataset'] result.formats = [] result.identifiers = [data['item_url']] @@ -322,24 +357,41 @@ def assemble_record(record, db, user, password, server, port): con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=%s", [record["record_id"]]) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), 
float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), float(coordinate[2])]}}) - except: - pass - - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) + from psycopg2.extras import DictCursor + lookup_cur = con.cursor(cursor_factory=DictCursor) + + record["datacite_geoLocation"] = {} + lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, geobbox.northLat, geobbox.southLat + FROM geobbox WHERE geobbox.record_id=%s""", [record["record_id"]]) + geobboxes = lookup_cur.fetchall() + if len(geobboxes) > 0: + record["datacite_geoLocation"]["geoLocationBox"] = [] + for geobbox in geobboxes: + record["datacite_geoLocation"]["geoLocationBox"].append({"westBoundLongitude": geobbox["westlon"], + "eastBoundLongitude": geobbox["eastlon"], + "northBoundLatitude": geobbox["northlat"], + "southBoundLatitude": geobbox["southlat"]}) + lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_id=%s""", + [record["record_id"]]) + geopoints = lookup_cur.fetchall() + if len(geopoints) > 0: + record["datacite_geoLocation"]["geoLocationPoint"] = [] + for geopoint in geopoints: + record["datacite_geoLocation"]["geoLocationPoint"].append({"pointLatitude": geopoint["lat"], + "pointLongitude": geopoint["lon"]}) + + lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name + FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id + WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) + geoplaces = lookup_cur.fetchall() + if len(geoplaces) > 0: + record["datacite_geoLocation"]["geoLocationPlace"] = [] + for geoplace in geoplaces: + record["datacite_geoLocation"]["geoLocationPlace"].append({"country": geoplace["country"], + "province_state": geoplace["province_state"], + "city": geoplace["city"], + "additional": geoplace["other"], + "place_name": geoplace["place_name"]}) with con: from psycopg2.extras import DictCursor From e47e3554977d30c517abec28e94f6b1e3a27c81b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Dec 2020 18:25:50 -0800 Subject: [PATCH 65/67] Use the same dict cursor throughout; comments and spacing --- viringo/services/frdr.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ec09579..ce9a6e5 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -242,13 +242,11 @@ def construct_datacite_xml(data): geolocationBox = ET.SubElement(geolocation, "geolocationBox") geolocationBox.text = xml_fix_text(str(geobbox["southBoundLatitude"]) + " " + str(geobbox["westBoundLongitude"]) + " " + str(geobbox["northBoundLatitude"]) + " " + str(geobbox["eastBoundLongitude"])) - if "geoLocationPoint" in data["datacite_geoLocation"]: for geopoint in data["datacite_geoLocation"]["geoLocationPoint"]: geolocation = ET.SubElement(geolocations, "geoLocation") geoLocationPoint = ET.SubElement(geolocation, "geoLocationPoint") geoLocationPoint.text = xml_fix_text(str(geopoint["pointLatitude"]) + " " + str(geopoint["pointLongitude"])) - if "geoLocationPlace" in data["datacite_geoLocation"]: for geoplace in 
data["datacite_geoLocation"]["geoLocationPlace"]: geolocation = ET.SubElement(geolocations, "geoLocation") @@ -264,8 +262,10 @@ def construct_datacite_xml(data): components.append(geoplace["province_state"]) if geoplace["country"]: components.append(geoplace["country"]) + # Combine all components of the place name separated by "; " geoLocationPlace.text = xml_fix_text("; ".join(components)) + # If geolocations is empty, remove it if len(geolocations) == 0: resource.remove(geolocations) @@ -360,6 +360,7 @@ def assemble_record(record, db, user, password, server, port): from psycopg2.extras import DictCursor lookup_cur = con.cursor(cursor_factory=DictCursor) + # get geolocation metadata record["datacite_geoLocation"] = {} lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, geobbox.northLat, geobbox.southLat FROM geobbox WHERE geobbox.record_id=%s""", [record["record_id"]]) @@ -382,20 +383,16 @@ def assemble_record(record, db, user, password, server, port): lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id - WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) + WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) geoplaces = lookup_cur.fetchall() if len(geoplaces) > 0: record["datacite_geoLocation"]["geoLocationPlace"] = [] for geoplace in geoplaces: record["datacite_geoLocation"]["geoLocationPlace"].append({"country": geoplace["country"], - "province_state": geoplace["province_state"], - "city": geoplace["city"], - "additional": geoplace["other"], - "place_name": geoplace["place_name"]}) - - with con: - from psycopg2.extras import DictCursor - lookup_cur = con.cursor(cursor_factory=DictCursor) + "province_state": geoplace["province_state"], + "city": geoplace["city"], + "additional": geoplace["other"], + "place_name": geoplace["place_name"]}) # attach the other values to the dict lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_id"]]) From 44b3035a374c7c53b8077f6061402d9fdf595450 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 11 Mar 2021 14:47:59 -0800 Subject: [PATCH 66/67] Stop ListIdentifiers iteration when total records is exceeded --- viringo/catalogs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index fad0bd8..78729aa 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -448,6 +448,9 @@ def listIdentifiers( cursor=paging_cursor ) + if paging_cursor >= total_records: + paging_cursor = None + records = [] if results: for result in results: From b9d5663f7904b33c03e2ddae61b8674d040e24bb Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 4 Jan 2022 11:36:25 -0800 Subject: [PATCH 67/67] Change record_id to record_uuid --- viringo/services/frdr.py | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ce9a6e5..eea4346 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -363,7 +363,7 @@ def assemble_record(record, db, user, password, server, port): # get geolocation metadata record["datacite_geoLocation"] = {} lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, 
geobbox.northLat, geobbox.southLat - FROM geobbox WHERE geobbox.record_id=%s""", [record["record_id"]]) + FROM geobbox WHERE geobbox.record_uuid=%s""", [record["record_uuid"]]) geobboxes = lookup_cur.fetchall() if len(geobboxes) > 0: record["datacite_geoLocation"]["geoLocationBox"] = [] @@ -372,8 +372,8 @@ def assemble_record(record, db, user, password, server, port): "eastBoundLongitude": geobbox["eastlon"], "northBoundLatitude": geobbox["northlat"], "southBoundLatitude": geobbox["southlat"]}) - lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_id=%s""", - [record["record_id"]]) + lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_uuid=%s""", + [record["record_uuid"]]) geopoints = lookup_cur.fetchall() if len(geopoints) > 0: record["datacite_geoLocation"]["geoLocationPoint"] = [] @@ -383,7 +383,7 @@ def assemble_record(record, db, user, password, server, port): lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id - WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) + WHERE records_x_geoplace.record_uuid=%s""", [record["record_uuid"]]) geoplaces = lookup_cur.fetchall() if len(geoplaces) > 0: record["datacite_geoLocation"]["geoLocationPlace"] = [] @@ -395,40 +395,40 @@ def assemble_record(record, db, user, password, server, port): "place_name": geoplace["place_name"]}) # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_id"]]) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_uuid=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_uuid"]]) record["dc:contributor.author"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=%s""", [record["record_id"]]) + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_uuid=%s""", [record["record_uuid"]]) record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_id"]]) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_uuid=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_uuid"]]) record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE 
records_x_subjects.record_id=%s and subjects.language = 'en' """, [record["record_id"]]) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_uuid=%s and subjects.language = 'en' """, [record["record_uuid"]]) record["frdr:category_en"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s and subjects.language = 'fr' """, [record["record_id"]]) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_uuid=%s and subjects.language = 'fr' """, [record["record_uuid"]]) record["frdr:category_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=%s""", [record["record_id"]]) + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_uuid=%s""", [record["record_uuid"]]) record["dc:publisher"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=%s""", [record["record_id"]]) + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_uuid=%s""", [record["record_uuid"]]) record["dc:rights"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='en' ", [record["record_id"]]) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_uuid=%s and language='en' ", [record["record_uuid"]]) record["dc:description_en"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='fr' ", [record["record_id"]]) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_uuid=%s and language='fr' ", [record["record_uuid"]]) record["dc:description_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'en' """, [record["record_id"]]) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_uuid=%s and tags.language = 'en' """, [record["record_uuid"]]) record["frdr:keywords_en"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'fr' """, [record["record_id"]]) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_uuid=%s and tags.language = 'fr' """, [record["record_uuid"]]) record["frdr:keywords_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=%s""", [record["record_id"]]) + 
lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_uuid=%s""", [record["record_uuid"]]) record["frdr:access"] = rows_to_dict(lookup_cur) return record @@ -450,7 +450,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + records_sql = """SELECT recs.record_uuid, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos @@ -464,7 +464,7 @@ def get_metadata_list( if until_datetime is not None: until_timestamp = int(datetime.timestamp(until_datetime)) records_sql = records_sql + " AND recs.upstream_modified_timestamp<" + str(until_timestamp) - records_sql = records_sql + " ORDER BY recs.record_id" + records_sql = records_sql + " ORDER BY recs.record_uuid" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) @@ -474,7 +474,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_uuid', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) # This is goofy, but full_count isn't always returned for empty results if int(row[-1]) != 0: @@ -497,7 +497,7 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + records_sql = ("""SELECT recs.record_uuid, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos @@ -505,7 +505,7 @@ def get_metadata(identifier, db, user, password, server, port): AND recs.local_identifier =\'""" + local_identifier + "\'" + "AND repos.repo_oai_name=\'""" + namespace + "\'") records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 
'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_uuid', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record)
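
Two pieces make the paging fix in patches 50 and 53 hang together: count(*) OVER() AS full_count reports the total number of matching rows on every row returned, and the catalog clears the resumption cursor once the running offset reaches that total. Below is a minimal sketch of the same pattern using parameterized queries rather than the string concatenation in the patch; fetch_page and conn_params are illustrative names, not part of the project.

    import psycopg2

    def fetch_page(conn_params, batch_size, cursor_offset=0):
        """Return (rows, full_count, next_cursor) for one page of records (sketch)."""
        records_sql = """
            SELECT recs.record_uuid, recs.title, recs.modified_timestamp,
                   count(*) OVER() AS full_count  -- total rows matching the WHERE clause
            FROM records recs
            WHERE recs.pub_date != ''
            ORDER BY recs.record_uuid
            OFFSET %s
        """
        conn = psycopg2.connect(**conn_params)
        try:
            with conn.cursor() as cur:
                cur.execute(records_sql, (cursor_offset,))
                rows = cur.fetchmany(batch_size)
        finally:
            conn.close()

        full_count = rows[0][-1] if rows else 0   # full_count is absent when nothing matches
        next_cursor = cursor_offset + len(rows)
        if next_cursor >= full_count:             # end-of-list test, as in patch 53
            next_cursor = None
        return rows, full_count, next_cursor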
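Patches 54, 55 and 57 converge on one helper: only pass real, non-empty strings to ftfy.fix_text, and replace form-feed characters (illegal in XML 1.0) before serializing. A standalone sketch of that helper and how it is applied to an ElementTree text node; the example input string is invented for illustration.

    import xml.etree.cElementTree as ET
    import ftfy

    def xml_fix_text(text):
        """Return text safe for an XML element: mojibake repaired, form feeds removed."""
        if isinstance(text, str) and text:
            # \x0c is not a legal XML 1.0 character, so swap it for a space
            # before letting ftfy repair any mojibake.
            return ftfy.fix_text(text.replace('\x0c', ' '))
        return ''

    # Usage: guard every .text assignment the same way the patch does.
    title = ET.Element("title")
    title.text = xml_fix_text("CafÃ©\x0cstudy")   # mojibake fixed, form feed replaced
    print(ET.tostring(title).decode())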
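Patches 56 and 61 apply the same decision twice, once for dc:rights and once for the datacite rightsURI: "Public" anywhere in frdr:access means openAccess, any other non-empty access values mean restrictedAccess, and an empty list defaults to openAccess. Factored out, the rule looks roughly like this (the function name is illustrative, not the project's):

    def access_uri(access_values):
        """Map FRDR access values to an info:eu-repo semantics URI (sketch)."""
        if not access_values:
            # No access statement recorded: assume the item is public.
            return "info:eu-repo/semantics/openAccess"
        if "Public" in access_values:
            return "info:eu-repo/semantics/openAccess"
        return "info:eu-repo/semantics/restrictedAccess"

    assert access_uri([]) == "info:eu-repo/semantics/openAccess"
    assert access_uri(["Public", "Embargoed"]) == "info:eu-repo/semantics/openAccess"
    assert access_uri(["Restricted"]) == "info:eu-repo/semantics/restrictedAccess"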
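Patch 63 switches date filtering from string comparison on pub_date to integer comparison on upstream_modified_timestamp, so the OAI from/until datetimes must be converted to Unix epoch seconds first. A hedged sketch of that conversion, returning parameterized WHERE fragments instead of concatenating the values into the SQL string; the helper name is mine.

    from datetime import datetime, timezone

    def timestamp_bounds(from_datetime=None, until_datetime=None):
        """Return (clauses, params) for filtering an epoch-seconds column (sketch)."""
        clauses, params = [], []
        if from_datetime is not None:
            clauses.append("recs.upstream_modified_timestamp >= %s")
            params.append(int(from_datetime.timestamp()))
        if until_datetime is not None:
            clauses.append("recs.upstream_modified_timestamp < %s")
            params.append(int(until_datetime.timestamp()))
        return clauses, params

    clauses, params = timestamp_bounds(from_datetime=datetime(2020, 9, 1, tzinfo=timezone.utc))
    print(clauses, params)   # ['recs.upstream_modified_timestamp >= %s'] [1598918400]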
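Patch 64 flattens the bbox, point and place lookups into DataCite geoLocation elements, with a bounding box serialized as four space-separated coordinates in the order south, west, north, east. A minimal ElementTree sketch of that serialization; the input dict shape follows the patch, but this is not the project's function.

    import xml.etree.cElementTree as ET

    def add_geolocation_box(geolocations, box):
        """Append one <geoLocation><geoLocationBox> under an existing <geoLocations>."""
        geolocation = ET.SubElement(geolocations, "geoLocation")
        geo_box = ET.SubElement(geolocation, "geoLocationBox")
        # Order used in the patch: south lat, west lon, north lat, east lon.
        geo_box.text = " ".join(str(box[k]) for k in (
            "southBoundLatitude", "westBoundLongitude",
            "northBoundLatitude", "eastBoundLongitude"))
        return geolocation

    geolocations = ET.Element("geoLocations")
    add_geolocation_box(geolocations, {
        "southBoundLatitude": 49.0, "westBoundLongitude": -123.3,
        "northBoundLatitude": 49.4, "eastBoundLongitude": -122.5})
    print(ET.tostring(geolocations).decode())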