From 328e60587f7e68c0d28a505af75376d341554117 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 21 Jan 2020 14:19:59 -0800 Subject: [PATCH 01/67] add configuration option to use a postgres backend instead of datacite API --- vendor/docker/env.conf | 5 + viringo/catalogs.py | 271 ++++++++++++++++++++++++++++++++++++++++- viringo/config.py | 10 ++ viringo/oai.py | 10 +- 4 files changed, 294 insertions(+), 2 deletions(-) diff --git a/vendor/docker/env.conf b/vendor/docker/env.conf index 201e3e9..b8c5644 100644 --- a/vendor/docker/env.conf +++ b/vendor/docker/env.conf @@ -7,3 +7,8 @@ env SENTRY_DSN; env API_ADMIN_USERNAME; env API_ADMIN_PASSWORD; env RESULT_SET_SIZE; +env CATALOG_SET; +env POSTGRES_SERVER; +env POSTGRES_DB; +env POSTGRES_USER; +env POSTGRES_PASSWORD; diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 11fc98e..b4dc22e 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -281,6 +281,276 @@ def build_metadata_map(self, result): return metadata + +class PostgresOAIServer(): + """Build OAI-PMH responses from a Postgres server""" + def identify(self): + """Construct common identification for the OAI service""" + + identify = common.Identify( + repositoryName=config.OAIPMH_REPOS_NAME, + baseURL=config.OAIPMH_BASE_URL, + protocolVersion="2.0", + adminEmails=[config.OAIPMH_ADMIN_EMAIL], + earliestDatestamp=datetime(2011, 1, 1), + deletedRecord='persistent', + granularity='YYYY-MM-DDThh:mm:ssZ', + compression=['gzip', 'deflate'], + toolkit_description=False) + + # Specify a custom description + datacite_desc = """ + + oai + oai.datacite.org + : + oai:oai.datacite.org:12425 + + """ + + identify.add_description(xml_string=datacite_desc) + + return identify + + def listMetadataFormats(self, identifier=None): + #pylint: disable=no-self-use,invalid-name + """Returns metadata formats available for the repository + + Identifier does nothing as our repository responds in all formats for all dois + """ + # PyOAI Expects result format (metadataPrefix, schema, metadataNamespace) + + format_oai_dc = ( + 'oai_dc', + 'http://www.openarchives.org/OAI/2.0/oai_dc.xsd', + 'http://www.openarchives.org/OAI/2.0/oai_dc/' + ) + + format_oai_datacite = ( + 'oai_datacite', + 'http://schema.datacite.org/oai/oai-1.1/oai.xsd', + 'http://schema.datacite.org/oai/oai-1.1/' + ) + + format_datacite = ( + 'datacite', + 'http://schema.datacite.org/meta/nonexistant/nonexistant.xsd', + 'http://datacite.org/schema/nonexistant' + ) + + return [format_oai_dc, format_oai_datacite, format_datacite] + + def getRecord(self, metadataPrefix, identifier): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for specific record""" + + # We just want the DOI out of the OAI identifier. 
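+        # e.g. an identifier of "doi:10.1234/abc" yields the doi "10.1234/abc";
+        # everything after the first ":" is treated as the DOI.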
+ _, doi = identifier.split(':', 1) + + result = datacite.get_metadata(doi) + if not result: + raise error.IdDoesNotExistError( + "\"%s\" is unknown or illegal in this repository" % identifier + ) + + # Build metadata based on requested format and result + metadata = self.build_metadata_map(result) + + header = self.build_header(result) + record = self.build_record(metadata) + data = ( + header, + record, + None # About string - not used + ) + + return data + + def listRecords( + self, + metadataPrefix=None, + from_=None, + until=None, + set=None, + paging_cursor=None + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of records""" + + # If available get the search query from the set param + search_query = set_to_search_query(set) + + # Get both a provider and client_id from the set + provider_id, client_id = set_to_provider_client(set) + results, total_records, paging_cursor = datacite.get_metadata_list( + query=search_query, + provider_id=provider_id, + client_id=client_id, + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor + ) + + records = [] + if results: + for result in results: + # Build metadata based on requested format and result + metadata = self.build_metadata_map(result) + + header = self.build_header(result) + record = self.build_record(metadata) + + data = ( + header, + record, + None # About string - not used + ) + + records.append(data) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_records, paging_cursor + + def listIdentifiers( + self, + metadataPrefix=None, + from_=None, + until=None, + set=None, + paging_cursor=None + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of identifiers""" + + # Get both a provider and client_id from the set + provider_id, client_id = set_to_provider_client(set) + + results, total_records, paging_cursor = datacite.get_metadata_list( + provider_id=provider_id, + client_id=client_id, + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor + ) + + records = [] + if results: + for result in results: + header = self.build_header(result) + + records.append(header) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. + return records, total_records, paging_cursor + + def listSets( + self, + paging_cursor=0 + ): + #pylint: disable=no-self-use,invalid-name + """Returns pyoai data tuple for list of sets""" + + # Note this implementation is not super efficient as we request + # the full set everytime regardles of actual paging + # The paging is handled just by offsetting the records returned. + # This is however acceptable given sets are a small subset of data. + + # We know we're always dealing with a integer value here + paging_cursor = int(paging_cursor) + + batch_size = 50 + next_batch = paging_cursor + batch_size + results, total_results = datacite.get_sets() + results = results[paging_cursor: next_batch] + + if len(results) < batch_size: + paging_cursor = None + else: + paging_cursor = next_batch + + records = [] + if results: + for identifier, name in results: + # Format of a set is setSpec, setName, setDescription + records.append((identifier.upper(), name, None)) + + # This differs from the pyoai implementation in that we have to return a cursor here + # But this is okay as we have a custom server to handle it. 
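+        # Illustrative return value for a full first page (hypothetical set names):
+        #   ([("CISTI", "National Research Council", None), ...], 120, 50)
+        # The final page returns paging_cursor=None instead of the next offset.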
+ return records, total_results, paging_cursor + + def build_header(self, result): + """Construct a OAI-PMH record header""" + + # Provider symbol can just be extracted from the client symbol + provider_symbol, _ = result.client.split(".") + + return common.Header( + None, + 'doi:' + result.identifier, + result.updated_datetime, + setspec=[provider_symbol, result.client], + deleted=not result.active + ) + + def build_record(self, metadata): + """Construct a OAI-PMH payload for a record""" + + return common.Metadata( + None, + metadata + ) + + def build_metadata_map(self, result): + """Construct a metadata map object for oai metadata writing""" + dates = [] + if result.publication_year: + dates.append(str(result.publication_year)) + dates.extend([date['type'] + ": " + str(date['date']) for date in result.dates]) + + rights = [] + for right in result.rights: + if right['statement']: + rights.append(right['statement']) + if right['uri']: + rights.append(right['uri']) + + identifiers = [ + identifier_to_string(identifier) for identifier in result.identifiers + ] + + relations = [ + identifier_to_string(relation) + for relation in result.relations + ] + + contributors = [ + contributor.get('name') for contributor in result.contributors + ] + + metadata = { + 'title': result.titles, + 'creator': result.creators, + 'subject': result.subjects, + 'description': result.descriptions, + 'publisher': [result.publisher] if result.publisher else [], + 'contributor': contributors, + 'date': dates, + 'type': result.resource_types, + 'format': result.formats, + 'identifier': identifiers, + 'relation': relations, + 'language': [result.language] if result.language else [], + 'rights': rights, + 'xml': result.xml, + 'set': result.client, + 'metadata_version': result.metadata_version + } + + return metadata + + def set_to_search_query(unparsed_set): """Take a oai set and extract any base64url encoded search query""" @@ -294,7 +564,6 @@ def set_to_search_query(unparsed_set): return "" - def set_to_provider_client(unparsed_set): """Take a oai set and convert into provider_id and client_id""" diff --git a/viringo/config.py b/viringo/config.py index a65636f..d9d7665 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -18,3 +18,13 @@ OAIPMH_ADMIN_EMAIL = os.getenv('OAIPMH_ADMIN_EMAIL', 'support@datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) +# Source metadata catalog (DataCite or Postgres) +CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'Postgres') +# Postgres server +POSTGRES_SERVER = os.getenv('OAIPMH_POSTGRES_SERVER', '') +# Postgres db +POSTGRES_DB = os.getenv('OAIPMH_POSTGRES_DB', '') +# Postgres user +POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') +# Postgres password +POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') \ No newline at end of file diff --git a/viringo/oai.py b/viringo/oai.py index 41f3377..261835f 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -12,7 +12,9 @@ import oaipmh.datestamp from .catalogs import DataCiteOAIServer +from .catalogs import PostgresOAIServer from . import metadata +from . 
import config BP = Blueprint('oai', __name__) @@ -93,7 +95,13 @@ def handleVerb(self, verb, kw): def get_oai_server(): """Returns a pyoai server object that can process and return OAI requests""" if 'oai' not in g: - catalog_server = DataCiteOAIServer() + if config.CATALOG_SET == 'DateCite': + catalog_server = DataCiteOAIServer() + elif config.CATALOG_SET == 'Postgres': + catalog_server = PostgresOAIServer() + else: + print('No valid metadata catalog configured') + sys.exit(1) metadata_registry = oaipmh.metadata.MetadataRegistry() metadata_registry.registerWriter('oai_dc', metadata.oai_dc_writer) From 595ef3b226e6614713054ae622bb7fb1b1f16071 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 21 Jan 2020 15:05:34 -0800 Subject: [PATCH 02/67] add lookup functions from frdr harvester for postgres export --- vendor/docker/env.conf | 1 + viringo/catalogs.py | 16 ++-- viringo/config.py | 2 + viringo/services/postgres.py | 180 +++++++++++++++++++++++++++++++++++ 4 files changed, 193 insertions(+), 6 deletions(-) create mode 100644 viringo/services/postgres.py diff --git a/vendor/docker/env.conf b/vendor/docker/env.conf index b8c5644..24951ad 100644 --- a/vendor/docker/env.conf +++ b/vendor/docker/env.conf @@ -3,6 +3,7 @@ env OAIPMH_BASE_URL; env DATACITE_API_URL; env OAIPMH_REPOS_NAME; env OAIPMH_ADMIN_EMAIL; +env OAIPMH_IDENTIFIER; env SENTRY_DSN; env API_ADMIN_USERNAME; env API_ADMIN_PASSWORD; diff --git a/viringo/catalogs.py b/viringo/catalogs.py index b4dc22e..201b8ae 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -12,6 +12,7 @@ from viringo import config from .services import datacite +from .services import postgres class DataCiteOAIServer(): """Build OAI-PMH data responses for DataCite metadata catalog""" @@ -111,15 +112,18 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) + # From and until parameters aren't supported with Postgres # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = datacite.get_metadata_list( + results, total_records, paging_cursor = postgres.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, - from_datetime=from_, - until_datetime=until, - cursor=paging_cursor + cursor=paging_cursor, + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD ) records = [] @@ -302,9 +306,9 @@ def identify(self): datacite_desc = """ oai - oai.datacite.org + """ + config.OAIPMH_IDENTIFIER + """ : - oai:oai.datacite.org:12425 + oai""" + config.OAIPMH_IDENTIFIER + """:1 """ diff --git a/viringo/config.py b/viringo/config.py index d9d7665..c2ba38a 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -16,6 +16,8 @@ OAIPMH_BASE_URL = os.getenv('OAIPMH_BASE_URL', 'https://oai.datacite.org/oai') # Admin e-mail for the OAI-PMH service OAIPMH_ADMIN_EMAIL = os.getenv('OAIPMH_ADMIN_EMAIL', 'support@datacite.org') +# OAI repository identifier +OAIPMH_IDENTIFIER = os.getenv('OAIPMH_IDENTIFIER', 'oai.datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) # Source metadata catalog (DataCite or Postgres) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py new file mode 100644 index 0000000..093dd3e --- /dev/null +++ b/viringo/services/postgres.py @@ -0,0 +1,180 @@ +"""Handles DB queries for retrieving metadata""" + +import psycopg2 +import re + 
+from psycopg2.extras import DictCursor  # used below as cursor_factory for the lookup queries
+
+def construct_local_url(record): + # Check if the local_identifier has already been turned into a url + if "http" in record["local_identifier"].lower(): + return record["local_identifier"] + + # Check for OAI format of identifier (oai:domain:id) + oai_id = None + oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) + if oai_search: + oai_id = oai_search.group(2) + # TODO: determine if this is needed for all repos, or just SFU? + oai_id = oai_id.replace("_", ":") + + # If given a pattern then substitue in the item ID and return it + if "item_url_pattern" in record and record["item_url_pattern"]: + if oai_id: + local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) + else: + local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) + return local_url + + # Check if the identifier is a DOI + doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) + if doi: + doi = doi.group(0).rstrip('\.') + local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + return local_url + + # If the item has a source URL, use it + if ('source_url' in record) and record['source_url']: + return record['source_url'] + + # URL is in the identifier + local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", + record["local_identifier"]) + if local_url: + return local_url.group(0) + + local_url = None + return local_url + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + newdict.append(r) + return newdict + + +def get_metadata_list( + query=None, + provider_id=None, + client_id=None, + cursor=None, + server, + db, + user, + password + ): + + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( + db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, + repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp + FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + + records_cursor.execute(records_sql) + + # Need to see if we can somehow page the OAI response to the DB query so it only requests a fixed number at a time + for row in records_cursor: + record = (dict(zip( + ['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', + 'modified_timestamp', + 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', + 'last_crawl_timestamp'], row))) + record["deleted"] = int(record["deleted"]) + + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + continue + + if record["deleted"] == 1: + continue + + if (len(record['title']) == 0): + continue + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( + db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) + + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] + + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + else: + 
record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", + "frdr:geospatial_geometry": { + "frdr:geometry_type": coordinate[0], + "frdr:geometry_coordinates": [float(coordinate[1]), + float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", + "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", + "frdr:geometry_coordinates": polycoordinates}}) + + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id + WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", + (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id + WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id + WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", + (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id + WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id + WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id + WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute( + "SELECT description FROM descriptions WHERE record_id=? and language='en' "), + (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) + + lookup_cur.execute( + "SELECT description FROM descriptions WHERE record_id=? and language='fr' "), + (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id + WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id + WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id + WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) \ No newline at end of file From fe861ee84f9369d9638a9f8389f3c6c5f49b82f6 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 21 Jan 2020 15:53:23 -0800 Subject: [PATCH 03/67] cleanups, add other helper function stubs --- viringo/catalogs.py | 42 +++++++++++++++++++----------------- viringo/services/postgres.py | 18 +++++++++++++++- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 201b8ae..df8fa53 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -115,15 +115,13 @@ def listRecords( # From and until parameters aren't supported with Postgres # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = postgres.get_metadata_list( + results, total_records, paging_cursor = datacite.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, - cursor=paging_cursor, - server=config.POSTGRES_SERVER, - db=config.POSTGRES_DB, - user=config.POSTGRES_USER, - password=config.POSTGRES_PASSWORD + from_datetime=from_, + until_datetime=until, + cursor=paging_cursor ) records = [] @@ -303,7 +301,7 @@ def identify(self): toolkit_description=False) # Specify a custom description - datacite_desc = """ + postgres_desc = """ oai """ + config.OAIPMH_IDENTIFIER + """ @@ -312,7 +310,7 @@ def identify(self): """ - identify.add_description(xml_string=datacite_desc) + identify.add_description(xml_string=postgres_desc) return identify @@ -348,10 +346,9 @@ def getRecord(self, metadataPrefix, identifier): #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for specific record""" - # We just want the DOI out of the OAI identifier. - _, doi = identifier.split(':', 1) + # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? 
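+        # Note: unlike the removed DataCite lookup, the full OAI identifier is handed to
+        # Postgres as-is; e.g. "doi:10.1234/abc" would be looked up verbatim rather than
+        # being split into the bare DOI "10.1234/abc" first.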
- result = datacite.get_metadata(doi) + result = postgres.get_metadata(identifier) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -386,13 +383,15 @@ def listRecords( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = datacite.get_metadata_list( + results, total_records, paging_cursor = postgres.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, - from_datetime=from_, - until_datetime=until, - cursor=paging_cursor + cursor=paging_cursor, + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD ) records = [] @@ -430,12 +429,15 @@ def listIdentifiers( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = datacite.get_metadata_list( + results, total_records, paging_cursor = postgres.get_metadata_list( + query=search_query, provider_id=provider_id, client_id=client_id, - from_datetime=from_, - until_datetime=until, - cursor=paging_cursor + cursor=paging_cursor, + server=config.POSTGRES_SERVER, + db=config.POSTGRES_DB, + user=config.POSTGRES_USER, + password=config.POSTGRES_PASSWORD ) records = [] @@ -466,7 +468,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = datacite.get_sets() + results, total_results = postgres.get_sets() results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 093dd3e..0cd3368 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -177,4 +177,20 @@ def get_metadata_list( lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) \ No newline at end of file + record["frdr:access"] = rows_to_dict(lookup_cur) + + +def get_metadata(identifier): + # Probably need to refactor some of get_metadata_list so it can be run on one record without duplication + return None + + +def get_sets(): + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( + db, user, password, server)) + with repos_con: + repos_cursor = repos_con.cursor() + + repos_cursor.execute("SELECT repository_name from repositories") + # Format this properly to return it like the DataCite response + return None \ No newline at end of file From 2bb3759067df3a4141c4257644a570f2bfaf7866 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Jan 2020 16:01:37 -0800 Subject: [PATCH 04/67] refactor returning single records, clarify todos, implement cursor --- viringo/catalogs.py | 2 +- viringo/services/postgres.py | 218 +++++++++++++++++------------------ 2 files changed, 109 insertions(+), 111 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index df8fa53..ce039aa 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -348,7 +348,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? 
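+        # db/user/password/server are not defined locally in this method; a minimal sketch
+        # using the existing settings (as listRecords already does) would be:
+        #   result = postgres.get_metadata(identifier, config.POSTGRES_DB,
+        #                                  config.POSTGRES_USER, config.POSTGRES_PASSWORD,
+        #                                  config.POSTGRES_SERVER)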
- result = postgres.get_metadata(identifier) + result = postgres.get_metadata(identifier, db, user, password, server) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 0cd3368..6885462 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -2,6 +2,7 @@ import psycopg2 import re +from viringo import config def construct_local_url(record): # Check if the local_identifier has already been turned into a url @@ -57,140 +58,137 @@ def rows_to_dict(cursor): return newdict -def get_metadata_list( - query=None, - provider_id=None, - client_id=None, - cursor=None, - server, - db, - user, - password - ): +def assemble_record(record, db, user, password, server): + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + return None + + if int(record["deleted"]) == 1: + return None - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( - db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() + if (len(record['title']) == 0): + return None - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, - repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp - FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) - records_cursor.execute(records_sql) + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] - # Need to see if we can somehow page the OAI response to the DB query so it only requests a fixed number at a time - for row in records_cursor: - record = (dict(zip( - ['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', - 'modified_timestamp', - 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', - 'last_crawl_timestamp'], row))) - record["deleted"] = int(record["deleted"]) - - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: - continue - - if record["deleted"] == 1: - continue - - if (len(record['title']) == 0): - continue - - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( - db, user, password, server)) - with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", - "frdr:geospatial_geometry": { - "frdr:geometry_type": coordinate[0], - "frdr:geometry_coordinates": [float(coordinate[1]), - float(coordinate[2])]}}) - except: - pass + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + 
else: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", - "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", - "frdr:geometry_coordinates": polycoordinates}}) + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) - with con: - lookup_cur = con.cursor(cursor_factory=DictCursor) + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) - # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id - WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", - (record["record_id"],)) - record["dc:contributor.author"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id - WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) - record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id - WHERE records_x_creators.record_id=? 
AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", - (record["record_id"],)) - record["dc:contributor"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id - WHERE records_x_subjects.record_id=?""", (record["record_id"],)) - record["dc:subject"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id - WHERE records_x_publishers.record_id=?""", (record["record_id"],)) - record["dc:publisher"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id - WHERE records_x_rights.record_id=?""", (record["record_id"],)) - record["dc:rights"] = rows_to_dict(lookup_cur) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) - lookup_cur.execute( - "SELECT description FROM descriptions WHERE record_id=? and language='en' "), - (record["record_id"],) - record["dc:description"] = rows_to_dict(lookup_cur) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute( - "SELECT description FROM descriptions WHERE record_id=? and language='fr' "), - (record["record_id"],) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id - WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) - record["frdr:tags"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id - WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id - WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) + return record -def get_metadata(identifier): - # Probably need to refactor some of get_metadata_list so it can be run on one record without duplication - return None +def build_metadata(full_record): + # TODO: construct object to match DataCite reponse and return it + + +def get_metadata_list( + query=None, + provider_id=None, + client_id=None, + cursor=None, + server, + db, + user, + password + ): + + # Trigger cursor navigation with a starting value + if not cursor: + cursor = 1 + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_cursor.execute(records_sql) + + record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] + for row in record_set: + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + if full_record is not None: + results.append(build_metadata(full_record)) + + cursor += config.RESULT_SET_SIZE + # TODO: Probably need to pass rowcount back to this function for Postgres output + return results, records_cursor.rowcount, cursor + + +def get_metadata(identifier, db, user, password, server): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + # TODO: record_id is kind of a meaningless identifier, support local identifier + source URL + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_cursor.execute(records_sql) + row = records_cursor.fetchone() + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + return build_metadata(full_record) def 
get_sets(): - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % ( - db, user, password, server)) + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) with repos_con: repos_cursor = repos_con.cursor() repos_cursor.execute("SELECT repository_name from repositories") - # Format this properly to return it like the DataCite response + sets = repos_cursor.fetchall() + # TODO: Format this properly to return it like the DataCite response return None \ No newline at end of file From dbc1637291037f3fec1d799eeb9de6cc9e1695fd Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Jan 2020 16:31:54 -0800 Subject: [PATCH 05/67] add preliminary result formatting for datacite --- viringo/services/postgres.py | 107 +++++++++++++++++++++++++++++++++-- 1 file changed, 102 insertions(+), 5 deletions(-) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 6885462..2e8d81c 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -2,8 +2,108 @@ import psycopg2 import re +from datetime import datetime +import dateutil.parser +import dateutil.tz from viringo import config +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + + +def build_metadata(data): + """Parse single postgres result into metadata object""" + result = Metadata() + + result.identifier = data['record_id'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. + created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = None + + # TODO: should I not be hardcoding this for datacite? 
add other fields based on current XML export + result.metadata_version = 4 + + result.titles = [data['title']] + result.creators = data['dc:contributor.author'] + result.subjects = data['dc:subject'] + result.descriptions = data['dc:description'] + result.publisher = data['dc:publisher'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = [] + result.sizes = [] + result.geo_locations = data['frdr:geospatial'] + result.resource_types = [] + result.formats = [] + result.identifiers = [] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = '' + result.active = True + + return result + + def construct_local_url(record): # Check if the local_identifier has already been turned into a url if "http" in record["local_identifier"].lower(): @@ -14,7 +114,6 @@ def construct_local_url(record): oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) if oai_search: oai_id = oai_search.group(2) - # TODO: determine if this is needed for all repos, or just SFU? oai_id = oai_id.replace("_", ":") # If given a pattern then substitue in the item ID and return it @@ -131,10 +230,6 @@ def assemble_record(record, db, user, password, server): return record -def build_metadata(full_record): - # TODO: construct object to match DataCite reponse and return it - - def get_metadata_list( query=None, provider_id=None, @@ -146,6 +241,8 @@ def get_metadata_list( password ): + # TODO: support listing by set + # Trigger cursor navigation with a starting value if not cursor: cursor = 1 From 9f08a03af0797dc25c37d3cf2f0adcc626bb5f99 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Wed, 29 Jan 2020 13:51:02 -0800 Subject: [PATCH 06/67] fix outstanding TODOs, let's test this --- viringo/services/postgres.py | 572 +++++++++++++++++------------------ 1 file changed, 281 insertions(+), 291 deletions(-) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py index 2e8d81c..258d77e 100644 --- a/viringo/services/postgres.py +++ b/viringo/services/postgres.py @@ -1,291 +1,281 @@ -"""Handles DB queries for retrieving metadata""" - -import psycopg2 -import re -from datetime import datetime -import dateutil.parser -import dateutil.tz -from viringo import config - -class Metadata: - """Represents a DataCite metadata resultset""" - def __init__( - self, - identifier=None, - created_datetime=None, - updated_datetime=None, - xml=None, - metadata_version=None, - titles=None, - creators=None, - subjects=None, - descriptions=None, - publisher=None, - publication_year=None, - dates=None, - contributors=None, - resource_types=None, - funding_references=None, - geo_locations=None, - formats=None, - identifiers=None, - language=None, - relations=None, - rights=None, - sizes=None, - client=None, - active=True - ): - - self.identifier = identifier - self.created_datetime = created_datetime or datetime.min - self.updated_datetime = updated_datetime or datetime.min - self.xml = xml - self.metadata_version = metadata_version - self.titles = titles or [] - self.creators = creators or [] - self.subjects = subjects or [] - self.descriptions = descriptions or [] - self.publisher = publisher - self.publication_year = publication_year - self.dates = dates or [] - self.contributors = contributors or [] - self.resource_types = resource_types or [] - self.funding_references = funding_references or [] - self.geo_locations = geo_locations or [] - self.formats 
= formats or [] - self.identifiers = identifiers or [] - self.language = language - self.relations = relations or [] - self.rights = rights or [] - self.sizes = sizes or [] - self.client = client - self.active = active - - -def build_metadata(data): - """Parse single postgres result into metadata object""" - result = Metadata() - - result.identifier = data['record_id'] - - # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely - # This is because OAI always works in UTC. - created = dateutil.parser.parse(data['pub_date']) - result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - updated = dateutil.parser.parse(data['pub_date']) - result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - - result.xml = None - - # TODO: should I not be hardcoding this for datacite? add other fields based on current XML export - result.metadata_version = 4 - - result.titles = [data['title']] - result.creators = data['dc:contributor.author'] - result.subjects = data['dc:subject'] - result.descriptions = data['dc:description'] - result.publisher = data['dc:publisher'] - result.publication_year = dateutil.parser.parse(data['pub_date']).year - result.dates = [data['pub_date']] - result.contributors = data['dc:contributor'] - result.funding_references = [] - result.sizes = [] - result.geo_locations = data['frdr:geospatial'] - result.resource_types = [] - result.formats = [] - result.identifiers = [] - result.language = '' - result.relations = [] - result.rights = data['dc:rights'] - result.client = '' - result.active = True - - return result - - -def construct_local_url(record): - # Check if the local_identifier has already been turned into a url - if "http" in record["local_identifier"].lower(): - return record["local_identifier"] - - # Check for OAI format of identifier (oai:domain:id) - oai_id = None - oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) - if oai_search: - oai_id = oai_search.group(2) - oai_id = oai_id.replace("_", ":") - - # If given a pattern then substitue in the item ID and return it - if "item_url_pattern" in record and record["item_url_pattern"]: - if oai_id: - local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) - else: - local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) - return local_url - - # Check if the identifier is a DOI - doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) - if doi: - doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) - return local_url - - # If the item has a source URL, use it - if ('source_url' in record) and record['source_url']: - return record['source_url'] - - # URL is in the identifier - local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", - record["local_identifier"]) - if local_url: - return local_url.group(0) - - local_url = None - return local_url - - -def rows_to_dict(cursor): - newdict = [] - if cursor: - for r in cursor: - if r: - if isinstance(r, list): - newdict.append(r[0]) - else: - newdict.append(r) - return newdict - - -def assemble_record(record, db, user, password, server): - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: - return None - - if int(record["deleted"]) == 1: - return None - - if (len(record['title']) == 0): - return None - - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, 
server)) - with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), float(coordinate[2])]}}) - except: - pass - - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) - - with con: - lookup_cur = con.cursor(cursor_factory=DictCursor) - - # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor.author"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) - record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) - record["dc:subject"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) - record["dc:publisher"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) - record["dc:rights"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) - record["dc:description"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) - record["frdr:tags"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) - - return record - - -def get_metadata_list( - query=None, - provider_id=None, - client_id=None, - cursor=None, - server, - db, - user, - password - ): - - # TODO: support listing by set - - # Trigger cursor navigation with a starting value - if not cursor: - cursor = 1 - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - records_cursor.execute(records_sql) - - record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) - results = [] - for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - if full_record is not None: - results.append(build_metadata(full_record)) - - cursor += config.RESULT_SET_SIZE - # TODO: Probably need to pass rowcount back to this function for Postgres output - return results, records_cursor.rowcount, cursor - - -def get_metadata(identifier, db, user, password, server): - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - # TODO: record_id is kind of a meaningless identifier, support local identifier + source URL - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) - records_cursor.execute(records_sql) - row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - return build_metadata(full_record) - - -def get_sets(): - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with repos_con: - repos_cursor = repos_con.cursor() - - repos_cursor.execute("SELECT repository_name from repositories") - sets = repos_cursor.fetchall() - # TODO: Format this properly to return it like the DataCite response - return None \ No newline at end of file +"""Handles DB queries for retrieving metadata""" + +import 
psycopg2 +import re +from datetime import datetime +import dateutil.parser +import dateutil.tz +from viringo import config + +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + + +def build_metadata(data): + """Parse single postgres result into metadata object""" + result = Metadata() + + result.identifier = data['record_id'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. 
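+    # e.g. (illustrative) "2019-06-01T12:00:00-07:00" parses to 19:00 UTC and is kept
+    # as the naive datetime(2019, 6, 1, 19, 0).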
+ created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = None + result.metadata_version = None + result.titles = [data['title']] + result.creators = data['dc:contributor.author'] + result.subjects = data['dc:subject'] + result.descriptions = data['dc:description'] + result.publisher = data['dc:publisher'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = [] + result.sizes = [] + result.geo_locations = data['frdr:geospatial'] + result.resource_types = [] + result.formats = [] + result.identifiers = [] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = '' + result.active = True + + return result + + +def construct_local_url(record): + # Check if the local_identifier has already been turned into a url + if "http" in record["local_identifier"].lower(): + return record["local_identifier"] + + # Check for OAI format of identifier (oai:domain:id) + oai_id = None + oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) + if oai_search: + oai_id = oai_search.group(2) + oai_id = oai_id.replace("_", ":") + + # If given a pattern then substitue in the item ID and return it + if "item_url_pattern" in record and record["item_url_pattern"]: + if oai_id: + local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) + else: + local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) + return local_url + + # Check if the identifier is a DOI + doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) + if doi: + doi = doi.group(0).rstrip('\.') + local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + return local_url + + # If the item has a source URL, use it + if ('source_url' in record) and record['source_url']: + return record['source_url'] + + # URL is in the identifier + local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", + record["local_identifier"]) + if local_url: + return local_url.group(0) + + local_url = None + return local_url + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + newdict.append(r) + return newdict + + +def assemble_record(record, db, user, password, server): + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + return None + + if int(record["deleted"]) == 1: + return None + + if (len(record['title']) == 0): + return None + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) + + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] + + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + else: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), 
float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) + + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) + + return record + + +def get_metadata_list( + query=None, + provider_id=None, + client_id=None, + records_cursor=None, + server, + db, + user, + password + ): + + # TODO: support listing by set + if records_cursor is None: + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_cursor.execute(records_sql) + + record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] + for row in record_set: + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + if full_record is not None: + results.append(build_metadata(full_record)) + + return results, records_cursor.rowcount, records_cursor + + +def get_metadata(identifier, db, user, password, server): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_cursor.execute(records_sql) + row = records_cursor.fetchone() + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + return build_metadata(full_record) + + +def get_sets(): + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with repos_con: + repos_cursor = repos_con.cursor() + + repos_cursor.execute("SELECT repository_name from repositories") + results = repos_cursor.fetchall() + return results, len(results) From e628330ae7d5e3e8b96044f8dc455d841d241525 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 6 Feb 2020 09:01:13 -0500 Subject: [PATCH 07/67] change mentions of postgres to FRDR where appropriate --- viringo/catalogs.py | 20 +-- viringo/config.py | 12 +- viringo/oai.py | 6 +- viringo/services/postgres.py | 281 ----------------------------------- 4 files changed, 19 insertions(+), 
300 deletions(-) delete mode 100644 viringo/services/postgres.py diff --git a/viringo/catalogs.py b/viringo/catalogs.py index ce039aa..20bcdfb 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -12,7 +12,7 @@ from viringo import config from .services import datacite -from .services import postgres +from .services import frdr class DataCiteOAIServer(): """Build OAI-PMH data responses for DataCite metadata catalog""" @@ -112,7 +112,6 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) - # From and until parameters aren't supported with Postgres # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = datacite.get_metadata_list( @@ -284,8 +283,8 @@ def build_metadata_map(self, result): return metadata -class PostgresOAIServer(): - """Build OAI-PMH responses from a Postgres server""" +class FRDROAIServer(): + """Build OAI-PMH responses from the FRDR Postgres server""" def identify(self): """Construct common identification for the OAI service""" @@ -301,7 +300,7 @@ def identify(self): toolkit_description=False) # Specify a custom description - postgres_desc = """ + frdr_desc = """ oai """ + config.OAIPMH_IDENTIFIER + """ @@ -310,7 +309,7 @@ def identify(self): """ - identify.add_description(xml_string=postgres_desc) + identify.add_description(xml_string=frdr_desc) return identify @@ -348,7 +347,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? - result = postgres.get_metadata(identifier, db, user, password, server) + result = frdr.get_metadata(identifier, db, user, password, server) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -381,9 +380,10 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) + # From and until parameters aren't supported with FRDR # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = postgres.get_metadata_list( + results, total_records, paging_cursor = frdr.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, @@ -429,7 +429,7 @@ def listIdentifiers( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) - results, total_records, paging_cursor = postgres.get_metadata_list( + results, total_records, paging_cursor = frdr.get_metadata_list( query=search_query, provider_id=provider_id, client_id=client_id, @@ -468,7 +468,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = postgres.get_sets() + results, total_results = frdr.get_sets() results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/config.py b/viringo/config.py index c2ba38a..4a1ec81 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -20,13 +20,13 @@ OAIPMH_IDENTIFIER = os.getenv('OAIPMH_IDENTIFIER', 'oai.datacite.org') # Page size of results shown for result listings RESULT_SET_SIZE = int(os.getenv('RESULT_SET_SIZE', '50')) -# Source metadata catalog (DataCite or Postgres) -CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'Postgres') -# Postgres server +# Source metadata catalog (DataCite or FRDR) +CATALOG_SET = os.getenv('OAIPMH_CATALOG', 'DataCite') +# FRDR Postgres 
server POSTGRES_SERVER = os.getenv('OAIPMH_POSTGRES_SERVER', '') -# Postgres db +# FRDR Postgres db POSTGRES_DB = os.getenv('OAIPMH_POSTGRES_DB', '') -# Postgres user +# FRDR Postgres user POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') -# Postgres password +# FRDR Postgres password POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') \ No newline at end of file diff --git a/viringo/oai.py b/viringo/oai.py index 261835f..04c192e 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -12,7 +12,7 @@ import oaipmh.datestamp from .catalogs import DataCiteOAIServer -from .catalogs import PostgresOAIServer +from .catalogs import FRDROAIServer from . import metadata from . import config @@ -97,8 +97,8 @@ def get_oai_server(): if 'oai' not in g: if config.CATALOG_SET == 'DateCite': catalog_server = DataCiteOAIServer() - elif config.CATALOG_SET == 'Postgres': - catalog_server = PostgresOAIServer() + elif config.CATALOG_SET == 'FRDR': + catalog_server = FRDROAIServer() else: print('No valid metadata catalog configured') sys.exit(1) diff --git a/viringo/services/postgres.py b/viringo/services/postgres.py deleted file mode 100644 index 258d77e..0000000 --- a/viringo/services/postgres.py +++ /dev/null @@ -1,281 +0,0 @@ -"""Handles DB queries for retrieving metadata""" - -import psycopg2 -import re -from datetime import datetime -import dateutil.parser -import dateutil.tz -from viringo import config - -class Metadata: - """Represents a DataCite metadata resultset""" - def __init__( - self, - identifier=None, - created_datetime=None, - updated_datetime=None, - xml=None, - metadata_version=None, - titles=None, - creators=None, - subjects=None, - descriptions=None, - publisher=None, - publication_year=None, - dates=None, - contributors=None, - resource_types=None, - funding_references=None, - geo_locations=None, - formats=None, - identifiers=None, - language=None, - relations=None, - rights=None, - sizes=None, - client=None, - active=True - ): - - self.identifier = identifier - self.created_datetime = created_datetime or datetime.min - self.updated_datetime = updated_datetime or datetime.min - self.xml = xml - self.metadata_version = metadata_version - self.titles = titles or [] - self.creators = creators or [] - self.subjects = subjects or [] - self.descriptions = descriptions or [] - self.publisher = publisher - self.publication_year = publication_year - self.dates = dates or [] - self.contributors = contributors or [] - self.resource_types = resource_types or [] - self.funding_references = funding_references or [] - self.geo_locations = geo_locations or [] - self.formats = formats or [] - self.identifiers = identifiers or [] - self.language = language - self.relations = relations or [] - self.rights = rights or [] - self.sizes = sizes or [] - self.client = client - self.active = active - - -def build_metadata(data): - """Parse single postgres result into metadata object""" - result = Metadata() - - result.identifier = data['record_id'] - - # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely - # This is because OAI always works in UTC. 
- created = dateutil.parser.parse(data['pub_date']) - result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - updated = dateutil.parser.parse(data['pub_date']) - result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - - result.xml = None - result.metadata_version = None - result.titles = [data['title']] - result.creators = data['dc:contributor.author'] - result.subjects = data['dc:subject'] - result.descriptions = data['dc:description'] - result.publisher = data['dc:publisher'] - result.publication_year = dateutil.parser.parse(data['pub_date']).year - result.dates = [data['pub_date']] - result.contributors = data['dc:contributor'] - result.funding_references = [] - result.sizes = [] - result.geo_locations = data['frdr:geospatial'] - result.resource_types = [] - result.formats = [] - result.identifiers = [] - result.language = '' - result.relations = [] - result.rights = data['dc:rights'] - result.client = '' - result.active = True - - return result - - -def construct_local_url(record): - # Check if the local_identifier has already been turned into a url - if "http" in record["local_identifier"].lower(): - return record["local_identifier"] - - # Check for OAI format of identifier (oai:domain:id) - oai_id = None - oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) - if oai_search: - oai_id = oai_search.group(2) - oai_id = oai_id.replace("_", ":") - - # If given a pattern then substitue in the item ID and return it - if "item_url_pattern" in record and record["item_url_pattern"]: - if oai_id: - local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) - else: - local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) - return local_url - - # Check if the identifier is a DOI - doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) - if doi: - doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) - return local_url - - # If the item has a source URL, use it - if ('source_url' in record) and record['source_url']: - return record['source_url'] - - # URL is in the identifier - local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", - record["local_identifier"]) - if local_url: - return local_url.group(0) - - local_url = None - return local_url - - -def rows_to_dict(cursor): - newdict = [] - if cursor: - for r in cursor: - if r: - if isinstance(r, list): - newdict.append(r[0]) - else: - newdict.append(r) - return newdict - - -def assemble_record(record, db, user, password, server): - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: - return None - - if int(record["deleted"]) == 1: - return None - - if (len(record['title']) == 0): - return None - - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), 
float(coordinate[2])]}}) - except: - pass - - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) - - with con: - lookup_cur = con.cursor(cursor_factory=DictCursor) - - # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor.author"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) - record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) - record["dc:contributor"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) - record["dc:subject"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) - record["dc:publisher"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) - record["dc:rights"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) - record["dc:description"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) - record["frdr:tags"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) - record["frdr:access"] = rows_to_dict(lookup_cur) - - return record - - -def get_metadata_list( - query=None, - provider_id=None, - client_id=None, - records_cursor=None, - server, - db, - user, - password - ): - - # TODO: support listing by set - if records_cursor is None: - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - records_cursor.execute(records_sql) - - record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) - results = [] - for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - if full_record is not None: - results.append(build_metadata(full_record)) - - return results, records_cursor.rowcount, records_cursor - - -def get_metadata(identifier, db, user, password, server): - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) - records_cursor.execute(records_sql) - row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - - full_record = assemble_record(record, db, user, password, server) - return build_metadata(full_record) - - -def get_sets(): - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with repos_con: - repos_cursor = repos_con.cursor() - - repos_cursor.execute("SELECT repository_name from repositories") - results = repos_cursor.fetchall() - return results, len(results) From 41f0428a48b7643811a14b0e5ec203fd33783d0c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 13 Feb 2020 16:07:31 -0800 Subject: [PATCH 08/67] fix up pipfile for postgres, clean up some syntax --- Pipfile | 3 +- Pipfile.lock | 301 +++++++++++++++++++++++---------------- docker-compose.yml | 2 +- viringo/catalogs.py | 24 ++-- viringo/oai.py | 2 + viringo/services/frdr.py | 282 
++++++++++++++++++++++++++++++++++++ 6 files changed, 480 insertions(+), 134 deletions(-) create mode 100644 viringo/services/frdr.py diff --git a/Pipfile b/Pipfile index c58d20c..679071b 100644 --- a/Pipfile +++ b/Pipfile @@ -14,6 +14,7 @@ python-dotenv = "*" [packages] flask = "*" pyoai = "*" +psycopg2-binary = "*" requests = "*" python-dateutil = "*" lxml = "*" @@ -23,4 +24,4 @@ sentry-sdk = {extras = ["flask"],version = "*"} python-dotenv = "*" [requires] -python_version = "3.6" +python_version = "3.5" diff --git a/Pipfile.lock b/Pipfile.lock index 032c439..9bf8991 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,11 +1,11 @@ { "_meta": { "hash": { - "sha256": "408ef5f3e6921b3359ecee3e8d293c34bd619136293244fe829928ab30c6cf06" + "sha256": "875a3074f25d395b53d43cb957d5d4567234ba291132d91c88cbc1a2f20c77b2" }, "pipfile-spec": 6, "requires": { - "python_version": "3.6" + "python_version": "3.5" }, "sources": [ { @@ -24,10 +24,10 @@ }, "certifi": { "hashes": [ - "sha256:e4f3620cfea4f83eedc95b24abd9cd56f3c4b146dd0177e83a21b4eb49e21e50", - "sha256:fd7c7c74727ddcf00e9acd26bba8da604ffec95bf1c2144e67aff7a8b50e6cef" + "sha256:017c25db2a153ce562900032d5bc68e9f191e44e9a0f762f373977de9df1fbb3", + "sha256:25b64c7da4cd7479594d035c08c2d809eb4aab3a26e5a990ea98cc450c320f1f" ], - "version": "==2019.9.11" + "version": "==2019.11.28" }, "chardet": { "hashes": [ @@ -53,10 +53,10 @@ }, "faker": { "hashes": [ - "sha256:5902379d8df308a204fc11c4f621590ee83975805a6c7b2228203b9defa45250", - "sha256:5e8c755c619f332d5ec28b7586389665f136bcf528e165eb925e87c06a63eda7" + "sha256:047d4d1791bfb3756264da670d99df13d799bb36e7d88774b1585a82d05dbaec", + "sha256:1b1a58961683b30c574520d0c739c4443e0ef6a185c04382e8cc888273dbebed" ], - "version": "==2.0.3" + "version": "==4.0.0" }, "flask": { "hashes": [ @@ -82,50 +82,50 @@ }, "jinja2": { "hashes": [ - "sha256:74320bb91f31270f9551d46522e33af46a80c3d619f4a4bf42b3164d30b5911f", - "sha256:9fe95f19286cfefaa917656583d020be14e7859c6b0252588391e47db34527de" + "sha256:93187ffbc7808079673ef52771baa950426fd664d3aad1d0fa3e95644360e250", + "sha256:b0eaf100007721b5c16c1fc1eecb87409464edc10469ddc9a22a27a99123be49" ], - "version": "==2.10.3" + "version": "==2.11.1" }, "json-log-formatter": { "hashes": [ - "sha256:8bb02773a37274c08f4de748e3accbed269c8395d27149bc2e7e9109af342eee", - "sha256:cdc1f1dabc0b9c808ed4e4f26e73885a9e7955bf7190dd9f1b86be967feb5b29" + "sha256:ee187c9a80936cbf1259f73573973450fc24b84a4fb54e53eb0dcff86ea1e759" ], "index": "pypi", - "version": "==0.2.0" + "version": "==0.3.0" }, "lxml": { "hashes": [ - "sha256:02ca7bf899da57084041bb0f6095333e4d239948ad3169443f454add9f4e9cb4", - "sha256:096b82c5e0ea27ce9138bcbb205313343ee66a6e132f25c5ed67e2c8d960a1bc", - "sha256:0a920ff98cf1aac310470c644bc23b326402d3ef667ddafecb024e1713d485f1", - "sha256:1409b14bf83a7d729f92e2a7fbfe7ec929d4883ca071b06e95c539ceedb6497c", - "sha256:17cae1730a782858a6e2758fd20dd0ef7567916c47757b694a06ffafdec20046", - "sha256:17e3950add54c882e032527795c625929613adbd2ce5162b94667334458b5a36", - "sha256:1f4f214337f6ee5825bf90a65d04d70aab05526c08191ab888cb5149501923c5", - "sha256:2e8f77db25b0a96af679e64ff9bf9dddb27d379c9900c3272f3041c4d1327c9d", - "sha256:4dffd405390a45ecb95ab5ab1c1b847553c18b0ef8ed01e10c1c8b1a76452916", - "sha256:6b899931a5648862c7b88c795eddff7588fb585e81cecce20f8d9da16eff96e0", - "sha256:726c17f3e0d7a7200718c9a890ccfeab391c9133e363a577a44717c85c71db27", - "sha256:760c12276fee05c36f95f8040180abc7fbebb9e5011447a97cdc289b5d6ab6fc", - 
"sha256:796685d3969815a633827c818863ee199440696b0961e200b011d79b9394bbe7", - "sha256:891fe897b49abb7db470c55664b198b1095e4943b9f82b7dcab317a19116cd38", - "sha256:9277562f175d2334744ad297568677056861070399cec56ff06abbe2564d1232", - "sha256:a471628e20f03dcdfde00770eeaf9c77811f0c331c8805219ca7b87ac17576c5", - "sha256:a63b4fd3e2cabdcc9d918ed280bdde3e8e9641e04f3c59a2a3109644a07b9832", - "sha256:ae88588d687bd476be588010cbbe551e9c2872b816f2da8f01f6f1fda74e1ef0", - "sha256:b0b84408d4eabc6de9dd1e1e0bc63e7731e890c0b378a62443e5741cfd0ae90a", - "sha256:be78485e5d5f3684e875dab60f40cddace2f5b2a8f7fede412358ab3214c3a6f", - "sha256:c27eaed872185f047bb7f7da2d21a7d8913457678c9a100a50db6da890bc28b9", - "sha256:c7fccd08b14aa437fe096c71c645c0f9be0655a9b1a4b7cffc77bcb23b3d61d2", - "sha256:c81cb40bff373ab7a7446d6bbca0190bccc5be3448b47b51d729e37799bb5692", - "sha256:d11874b3c33ee441059464711cd365b89fa1a9cf19ae75b0c189b01fbf735b84", - "sha256:e9c028b5897901361d81a4718d1db217b716424a0283afe9d6735fe0caf70f79", - "sha256:fe489d486cd00b739be826e8c1be188ddb74c7a1ca784d93d06fda882a6a1681" + "sha256:06d4e0bbb1d62e38ae6118406d7cdb4693a3fa34ee3762238bcb96c9e36a93cd", + "sha256:0701f7965903a1c3f6f09328c1278ac0eee8f56f244e66af79cb224b7ef3801c", + "sha256:1f2c4ec372bf1c4a2c7e4bb20845e8bcf8050365189d86806bad1e3ae473d081", + "sha256:4235bc124fdcf611d02047d7034164897ade13046bda967768836629bc62784f", + "sha256:5828c7f3e615f3975d48f40d4fe66e8a7b25f16b5e5705ffe1d22e43fb1f6261", + "sha256:585c0869f75577ac7a8ff38d08f7aac9033da2c41c11352ebf86a04652758b7a", + "sha256:5d467ce9c5d35b3bcc7172c06320dddb275fea6ac2037f72f0a4d7472035cea9", + "sha256:63dbc21efd7e822c11d5ddbedbbb08cd11a41e0032e382a0fd59b0b08e405a3a", + "sha256:7bc1b221e7867f2e7ff1933165c0cec7153dce93d0cdba6554b42a8beb687bdb", + "sha256:8620ce80f50d023d414183bf90cc2576c2837b88e00bea3f33ad2630133bbb60", + "sha256:8a0ebda56ebca1a83eb2d1ac266649b80af8dd4b4a3502b2c1e09ac2f88fe128", + "sha256:90ed0e36455a81b25b7034038e40880189169c308a3df360861ad74da7b68c1a", + "sha256:95e67224815ef86924fbc2b71a9dbd1f7262384bca4bc4793645794ac4200717", + "sha256:afdb34b715daf814d1abea0317b6d672476b498472f1e5aacbadc34ebbc26e89", + "sha256:b4b2c63cc7963aedd08a5f5a454c9f67251b1ac9e22fd9d72836206c42dc2a72", + "sha256:d068f55bda3c2c3fcaec24bd083d9e2eede32c583faf084d6e4b9daaea77dde8", + "sha256:d5b3c4b7edd2e770375a01139be11307f04341ec709cf724e0f26ebb1eef12c3", + "sha256:deadf4df349d1dcd7b2853a2c8796593cc346600726eff680ed8ed11812382a7", + "sha256:df533af6f88080419c5a604d0d63b2c33b1c0c4409aba7d0cb6de305147ea8c8", + "sha256:e4aa948eb15018a657702fee0b9db47e908491c64d36b4a90f59a64741516e77", + "sha256:e5d842c73e4ef6ed8c1bd77806bf84a7cb535f9c0cf9b2c74d02ebda310070e1", + "sha256:ebec08091a22c2be870890913bdadd86fcd8e9f0f22bcb398abd3af914690c15", + "sha256:edc15fcfd77395e24543be48871c251f38132bb834d9fdfdad756adb6ea37679", + "sha256:f2b74784ed7e0bc2d02bd53e48ad6ba523c9b36c194260b7a5045071abbb1012", + "sha256:fa071559f14bd1e92077b1b5f6c22cf09756c6de7139370249eb372854ce51e6", + "sha256:fd52e796fee7171c4361d441796b64df1acfceb51f29e545e812f16d023c4bbc", + "sha256:fe976a0f1ef09b3638778024ab9fb8cde3118f203364212c198f71341c0715ca" ], "index": "pypi", - "version": "==4.4.1" + "version": "==4.5.0" }, "markupsafe": { "hashes": [ @@ -133,13 +133,16 @@ "sha256:09027a7803a62ca78792ad89403b1b7a73a01c8cb65909cd876f7fcebd79b161", "sha256:09c4b7f37d6c648cb13f9230d847adf22f8171b1ccc4d5682398e77f40309235", "sha256:1027c282dad077d0bae18be6794e6b6b8c91d58ed8a8d89a89d59693b9131db5", + 
"sha256:13d3144e1e340870b25e7b10b98d779608c02016d5184cfb9927a9f10c689f42", "sha256:24982cc2533820871eba85ba648cd53d8623687ff11cbb805be4ff7b4c971aff", "sha256:29872e92839765e546828bb7754a68c418d927cd064fd4708fab9fe9c8bb116b", "sha256:43a55c2930bbc139570ac2452adf3d70cdbb3cfe5912c71cdce1c2c6bbd9c5d1", "sha256:46c99d2de99945ec5cb54f23c8cd5689f6d7177305ebff350a58ce5f8de1669e", "sha256:500d4957e52ddc3351cabf489e79c91c17f6e0899158447047588650b5e69183", "sha256:535f6fc4d397c1563d08b88e485c3496cf5784e927af890fb3c3aac7f933ec66", + "sha256:596510de112c685489095da617b5bcbbac7dd6384aeebeda4df6025d0256a81b", "sha256:62fe6c95e3ec8a7fad637b7f3d372c15ec1caa01ab47926cfdf7a75b40e0eac1", + "sha256:6788b695d50a51edb699cb55e35487e430fa21f1ed838122d722e0ff0ac5ba15", "sha256:6dd73240d2af64df90aa7c4e7481e23825ea70af4b4922f8ede5b9e35f78a3b1", "sha256:717ba8fe3ae9cc0006d7c451f0bb265ee07739daf76355d06366154ee68d221e", "sha256:79855e1c5b8da654cf486b830bd42c06e8780cea587384cf6545b7d9ac013a0b", @@ -156,10 +159,50 @@ "sha256:ba59edeaa2fc6114428f1637ffff42da1e311e29382d81b339c1817d37ec93c6", "sha256:c8716a48d94b06bb3b2524c2b77e055fb313aeb4ea620c8dd03a105574ba704f", "sha256:cd5df75523866410809ca100dc9681e301e3c27567cf498077e8551b6d20e42f", - "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7" + "sha256:cdb132fc825c38e1aeec2c8aa9338310d29d337bebbd7baa06889d09a60a1fa2", + "sha256:e249096428b3ae81b08327a63a485ad0878de3fb939049038579ac0ef61e17e7", + "sha256:e8313f01ba26fbbe36c7be1966a7b7424942f670f38e666995b88d012765b9be" ], "version": "==1.1.1" }, + "psycopg2-binary": { + "hashes": [ + "sha256:040234f8a4a8dfd692662a8308d78f63f31a97e1c42d2480e5e6810c48966a29", + "sha256:086f7e89ec85a6704db51f68f0dcae432eff9300809723a6e8782c41c2f48e03", + "sha256:18ca813fdb17bc1db73fe61b196b05dd1ca2165b884dd5ec5568877cabf9b039", + "sha256:19dc39616850342a2a6db70559af55b22955f86667b5f652f40c0e99253d9881", + "sha256:2166e770cb98f02ed5ee2b0b569d40db26788e0bf2ec3ae1a0d864ea6f1d8309", + "sha256:3a2522b1d9178575acee4adf8fd9f979f9c0449b00b4164bb63c3475ea6528ed", + "sha256:3aa773580f85a28ffdf6f862e59cb5a3cc7ef6885121f2de3fca8d6ada4dbf3b", + "sha256:3b5deaa3ee7180585a296af33e14c9b18c218d148e735c7accf78130765a47e3", + "sha256:407af6d7e46593415f216c7f56ba087a9a42bd6dc2ecb86028760aa45b802bd7", + "sha256:4c3c09fb674401f630626310bcaf6cd6285daf0d5e4c26d6e55ca26a2734e39b", + "sha256:4c6717962247445b4f9e21c962ea61d2e884fc17df5ddf5e35863b016f8a1f03", + "sha256:50446fae5681fc99f87e505d4e77c9407e683ab60c555ec302f9ac9bffa61103", + "sha256:5057669b6a66aa9ca118a2a860159f0ee3acf837eda937bdd2a64f3431361a2d", + "sha256:5dd90c5438b4f935c9d01fcbad3620253da89d19c1f5fca9158646407ed7df35", + "sha256:659c815b5b8e2a55193ede2795c1e2349b8011497310bb936da7d4745652823b", + "sha256:69b13fdf12878b10dc6003acc8d0abf3ad93e79813fd5f3812497c1c9fb9be49", + "sha256:7a1cb80e35e1ccea3e11a48afe65d38744a0e0bde88795cc56a4d05b6e4f9d70", + "sha256:7e6e3c52e6732c219c07bd97fff6c088f8df4dae3b79752ee3a817e6f32e177e", + "sha256:7f42a8490c4fe854325504ce7a6e4796b207960dabb2cbafe3c3959cb00d1d7e", + "sha256:84156313f258eafff716b2961644a4483a9be44a5d43551d554844d15d4d224e", + "sha256:8578d6b8192e4c805e85f187bc530d0f52ba86c39172e61cd51f68fddd648103", + "sha256:890167d5091279a27e2505ff0e1fb273f8c48c41d35c5b92adbf4af80e6b2ed6", + "sha256:98e10634792ac0e9e7a92a76b4991b44c2325d3e7798270a808407355e7bb0a1", + "sha256:9aadff9032e967865f9778485571e93908d27dab21d0fdfdec0ca779bb6f8ad9", + "sha256:9f24f383a298a0c0f9b3113b982e21751a8ecde6615494a3f1470eb4a9d70e9e", + 
"sha256:a73021b44813b5c84eda4a3af5826dd72356a900bac9bd9dd1f0f81ee1c22c2f", + "sha256:afd96845e12638d2c44d213d4810a08f4dc4a563f9a98204b7428e567014b1cd", + "sha256:b73ddf033d8cd4cc9dfed6324b1ad2a89ba52c410ef6877998422fcb9c23e3a8", + "sha256:b8f490f5fad1767a1331df1259763b3bad7d7af12a75b950c2843ba319b2415f", + "sha256:dbc5cd56fff1a6152ca59445178652756f4e509f672e49ccdf3d79c1043113a4", + "sha256:eac8a3499754790187bb00574ab980df13e754777d346f85e0ff6df929bcd964", + "sha256:eaed1c65f461a959284649e37b5051224f4db6ebdc84e40b5e65f2986f101a08" + ], + "index": "pypi", + "version": "==2.8.4" + }, "pyoai": { "hashes": [ "sha256:029521e1f6a819511feb4299a6181b5c312e8a71f7cddc4547e27001e7552be0" @@ -177,11 +220,11 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57", + "sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.11.0" }, "requests": { "hashes": [ @@ -196,18 +239,18 @@ "flask" ], "hashes": [ - "sha256:09e1e8f00f22ea580348f83bbbd880adf40b29f1dec494a8e4b33e22f77184fb", - "sha256:ff1fa7fb85703ae9414c8b427ee73f8363232767c9cd19158f08f6e4f0b58fc7" + "sha256:b06dd27391fd11fb32f84fe054e6a64736c469514a718a99fb5ce1dff95d6b28", + "sha256:e023da07cfbead3868e1e2ba994160517885a32dfd994fc455b118e37989479b" ], "index": "pypi", - "version": "==0.13.2" + "version": "==0.14.1" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "text-unidecode": { "hashes": [ @@ -218,17 +261,17 @@ }, "urllib3": { "hashes": [ - "sha256:3de946ffbed6e6746608990594d08faac602528ac7015ac28d33cee6a45b7398", - "sha256:9a107b99a5393caf59c7aa3c1249c16e6879447533d0887f4336dde834c7be86" + "sha256:2f3db8b19923a873b3e5256dc9c2dedfa883e33d87c690d9c7913e1f40673cdc", + "sha256:87716c2d2a7121198ebcb7ce7cccf6ce5e9ba539041cfbaeecfb641dc0bf6acc" ], - "version": "==1.25.6" + "version": "==1.25.8" }, "werkzeug": { "hashes": [ - "sha256:7280924747b5733b246fe23972186c6b348f9ae29724135a6dfc1e53cea433e7", - "sha256:e5f4a1f98b52b18a93da705a7458e55afb26f32bff83ff5d19189f92462d65c4" + "sha256:169ba8a33788476292d04186ab33b01d6add475033dfc07215e6d219cc077096", + "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], - "version": "==0.16.0" + "version": "==1.0.0" } }, "develop": { @@ -244,6 +287,7 @@ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4", "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6" ], + "markers": "sys_platform == 'win32'", "version": "==1.3.0" }, "attrs": { @@ -253,6 +297,14 @@ ], "version": "==19.3.0" }, + "colorama": { + "hashes": [ + "sha256:7d73d2a99753107a36ac6b455ee49046802e59d9d076ef8e47b61499fa29afff", + "sha256:e96da0d330793e2cb9485e9ddfd918d456036c7149416295932478192f4436a1" + ], + "markers": "sys_platform == 'win32'", + "version": "==0.4.3" + }, "factory-boy": { "hashes": [ "sha256:728df59b372c9588b83153facf26d3d28947fc750e8e3c95cefa9bed0e6394ee", @@ -263,18 +315,18 @@ }, "faker": { "hashes": [ - 
"sha256:5902379d8df308a204fc11c4f621590ee83975805a6c7b2228203b9defa45250", - "sha256:5e8c755c619f332d5ec28b7586389665f136bcf528e165eb925e87c06a63eda7" + "sha256:047d4d1791bfb3756264da670d99df13d799bb36e7d88774b1585a82d05dbaec", + "sha256:1b1a58961683b30c574520d0c739c4443e0ef6a185c04382e8cc888273dbebed" ], - "version": "==2.0.3" + "version": "==4.0.0" }, "importlib-metadata": { "hashes": [ - "sha256:aa18d7378b00b40847790e7c27e11673d7fed219354109d0e7b9e5b25dc3ad26", - "sha256:d5f18a79777f3aa179c145737780282e27b508fc8fd688cb17c7a813e8bd39af" + "sha256:06f5b3a99029c7134207dd882428a66992a9de2bef7c2b699b5641f9886c3302", + "sha256:b97607a1a18a5100839aec1dc26a1ea17ee0d93b20b0f008d80a5a050afb200b" ], "markers": "python_version < '3.8'", - "version": "==0.23" + "version": "==1.5.0" }, "isort": { "hashes": [ @@ -318,62 +370,70 @@ }, "more-itertools": { "hashes": [ - "sha256:409cd48d4db7052af495b09dec721011634af3753ae1ef92d2b32f73a745f832", - "sha256:92b8c4b06dac4f0611c0729b2f2ede52b2e1bac1ab48f089c7ddc12e26bb60c4" + "sha256:5dd8bcf33e5f9513ffa06d5ad33d78f31e1931ac9a18f33d37e77a180d393a7c", + "sha256:b1ddb932186d8a6ac451e1d95844b382f55e12686d51ca0c68b6f61f2ab7a507" ], - "version": "==7.2.0" + "version": "==8.2.0" }, "packaging": { "hashes": [ - "sha256:28b924174df7a2fa32c1953825ff29c61e2f5e082343165438812f00d3a7fc47", - "sha256:d9551545c6d761f3def1677baf08ab2a3ca17c56879e70fecba2fc4dde4ed108" + "sha256:170748228214b70b672c581a3dd610ee51f733018650740e98c7df862a583f73", + "sha256:e665345f9eef0c621aa0bf2f8d78cf6d21904eef16a93f020240b704a57f1334" ], - "version": "==19.2" + "version": "==20.1" + }, + "pathlib2": { + "hashes": [ + "sha256:0ec8205a157c80d7acc301c0b18fbd5d44fe655968f5d947b6ecef5290fc35db", + "sha256:6cd9a47b597b37cc57de1c05e56fb1a1c9cc9fab04fe78c29acd090418529868" + ], + "markers": "python_version < '3.6'", + "version": "==2.3.5" }, "pluggy": { "hashes": [ - "sha256:0db4b7601aae1d35b4a033282da476845aa19185c1e6964b25cf324b5e4ec3e6", - "sha256:fa5fa1622fa6dd5c030e9cad086fa19ef6a0cf6d7a2d12318e10cb49d6d68f34" + "sha256:15b2acde666561e1298d71b523007ed7364de07029219b604cf808bfa1c765b0", + "sha256:966c145cd83c96502c3c3868f50408687b38434af77734af1e9ca461a4081d2d" ], - "version": "==0.13.0" + "version": "==0.13.1" }, "py": { "hashes": [ - "sha256:64f65755aee5b381cea27766a3a147c3f15b9b6b9ac88676de66ba2ae36793fa", - "sha256:dc639b046a6e2cff5bbe40194ad65936d6ba360b52b3c3fe1d08a82dd50b5e53" + "sha256:5e27081401262157467ad6e7f851b7aa402c5852dbcb3dae06768434de5752aa", + "sha256:c20fdd83a5dbc0af9efd622bee9a5564e278f6380fffcacc43ba6f43db2813b0" ], - "version": "==1.8.0" + "version": "==1.8.1" }, "pylint": { "hashes": [ - "sha256:7b76045426c650d2b0f02fc47c14d7934d17898779da95288a74c2a7ec440702", - "sha256:856476331f3e26598017290fd65bebe81c960e806776f324093a46b76fb2d1c0" + "sha256:3db5468ad013380e987410a8d6956226963aed94ecb5f9d3a28acca6d9ac36cd", + "sha256:886e6afc935ea2590b462664b161ca9a5e40168ea99e5300935f6591ad467df4" ], "index": "pypi", - "version": "==2.4.3" + "version": "==2.4.4" }, "pyparsing": { "hashes": [ - "sha256:4acadc9a2b96c19fe00932a38ca63e601180c39a189a696abce1eaab641447e1", - "sha256:61b5ed888beab19ddccab3478910e2076a6b5a0295dffc43021890e136edf764" + "sha256:4c830582a84fb022400b85429791bc551f1f4871c33f23e44f353119e92f969f", + "sha256:c342dccb5250c08d45fd6f8b4a559613ca603b57498511740e65cd11a2e7dcec" ], - "version": "==2.4.4" + "version": "==2.4.6" }, "pytest": { "hashes": [ - "sha256:27abc3fef618a01bebb1f0d6d303d2816a99aa87a5968ebc32fe971be91eb1e6", - 
"sha256:58cee9e09242937e136dbb3dab466116ba20d6b7828c7620f23947f37eb4dae4" + "sha256:0d5fe9189a148acc3c3eb2ac8e1ac0742cb7618c084f3d228baaec0c254b318d", + "sha256:ff615c761e25eb25df19edddc0b970302d2a9091fbce0e7213298d85fb61fef6" ], "index": "pypi", - "version": "==5.2.2" + "version": "==5.3.5" }, "pytest-mock": { "hashes": [ - "sha256:b3514caac35fe3f05555923eabd9546abce11571cc2ddf7d8615959d04f2c89e", - "sha256:ea502c3891599c26243a3a847ccf0b1d20556678c528f86c98e3cd6d40c5cf11" + "sha256:b35eb281e93aafed138db25c8772b95d3756108b601947f89af503f8c629413f", + "sha256:cb67402d87d5f53c579263d37971a164743dc33c159dfb4fb4a86f37c5552307" ], "index": "pypi", - "version": "==1.11.2" + "version": "==2.0.0" }, "python-dateutil": { "hashes": [ @@ -385,18 +445,18 @@ }, "python-dotenv": { "hashes": [ - "sha256:debd928b49dbc2bf68040566f55cdb3252458036464806f4094487244e2a4093", - "sha256:f157d71d5fec9d4bd5f51c82746b6344dffa680ee85217c123f4a0c8117c4544" + "sha256:8429f459fc041237d98c9ff32e1938e7e5535b5ff24388876315a098027c3a57", + "sha256:ca9f3debf2262170d6f46571ce4d6ca1add60bb93b69c3a29dcb3d1a00a65c93" ], "index": "pypi", - "version": "==0.10.3" + "version": "==0.11.0" }, "six": { "hashes": [ - "sha256:1f1b7d42e254082a9db6279deae68afb421ceba6158efa6131de7b3003ee93fd", - "sha256:30f610279e8b2578cab6db20741130331735c781b56053c59c4076da27f06b66" + "sha256:236bdbdce46e6e6a3d61a337c0f8b763ca1e8717c03b369e87a7ec7ce1319c0a", + "sha256:8f3cd2e254d8f793e7f3d6d9df77b92252b52637291d0f0da013c76ea2724b6c" ], - "version": "==1.13.0" + "version": "==1.14.0" }, "termcolor": { "hashes": [ @@ -414,36 +474,37 @@ }, "typed-ast": { "hashes": [ - "sha256:1170afa46a3799e18b4c977777ce137bb53c7485379d9706af8a59f2ea1aa161", - "sha256:18511a0b3e7922276346bcb47e2ef9f38fb90fd31cb9223eed42c85d1312344e", - "sha256:262c247a82d005e43b5b7f69aff746370538e176131c32dda9cb0f324d27141e", - "sha256:2b907eb046d049bcd9892e3076c7a6456c93a25bebfe554e931620c90e6a25b0", - "sha256:354c16e5babd09f5cb0ee000d54cfa38401d8b8891eefa878ac772f827181a3c", - "sha256:48e5b1e71f25cfdef98b013263a88d7145879fbb2d5185f2a0c79fa7ebbeae47", - "sha256:4e0b70c6fc4d010f8107726af5fd37921b666f5b31d9331f0bd24ad9a088e631", - "sha256:630968c5cdee51a11c05a30453f8cd65e0cc1d2ad0d9192819df9978984529f4", - "sha256:66480f95b8167c9c5c5c87f32cf437d585937970f3fc24386f313a4c97b44e34", - "sha256:71211d26ffd12d63a83e079ff258ac9d56a1376a25bc80b1cdcdf601b855b90b", - "sha256:7954560051331d003b4e2b3eb822d9dd2e376fa4f6d98fee32f452f52dd6ebb2", - "sha256:838997f4310012cf2e1ad3803bce2f3402e9ffb71ded61b5ee22617b3a7f6b6e", - "sha256:95bd11af7eafc16e829af2d3df510cecfd4387f6453355188342c3e79a2ec87a", - "sha256:bc6c7d3fa1325a0c6613512a093bc2a2a15aeec350451cbdf9e1d4bffe3e3233", - "sha256:cc34a6f5b426748a507dd5d1de4c1978f2eb5626d51326e43280941206c209e1", - "sha256:d755f03c1e4a51e9b24d899561fec4ccaf51f210d52abdf8c07ee2849b212a36", - "sha256:d7c45933b1bdfaf9f36c579671fec15d25b06c8398f113dab64c18ed1adda01d", - "sha256:d896919306dd0aa22d0132f62a1b78d11aaf4c9fc5b3410d3c666b818191630a", - "sha256:fdc1c9bbf79510b76408840e009ed65958feba92a88833cdceecff93ae8fff66", - "sha256:ffde2fbfad571af120fcbfbbc61c72469e72f550d676c3342492a9dfdefb8f12" + "sha256:0666aa36131496aed8f7be0410ff974562ab7eeac11ef351def9ea6fa28f6355", + "sha256:0c2c07682d61a629b68433afb159376e24e5b2fd4641d35424e462169c0a7919", + "sha256:249862707802d40f7f29f6e1aad8d84b5aa9e44552d2cc17384b209f091276aa", + "sha256:24995c843eb0ad11a4527b026b4dde3da70e1f2d8806c99b7b4a7cf491612652", + "sha256:269151951236b0f9a6f04015a9004084a5ab0d5f19b57de779f908621e7d8b75", + 
"sha256:4083861b0aa07990b619bd7ddc365eb7fa4b817e99cf5f8d9cf21a42780f6e01", + "sha256:498b0f36cc7054c1fead3d7fc59d2150f4d5c6c56ba7fb150c013fbc683a8d2d", + "sha256:4e3e5da80ccbebfff202a67bf900d081906c358ccc3d5e3c8aea42fdfdfd51c1", + "sha256:6daac9731f172c2a22ade6ed0c00197ee7cc1221aa84cfdf9c31defeb059a907", + "sha256:715ff2f2df46121071622063fc7543d9b1fd19ebfc4f5c8895af64a77a8c852c", + "sha256:73d785a950fc82dd2a25897d525d003f6378d1cb23ab305578394694202a58c3", + "sha256:8c8aaad94455178e3187ab22c8b01a3837f8ee50e09cf31f1ba129eb293ec30b", + "sha256:8ce678dbaf790dbdb3eba24056d5364fb45944f33553dd5869b7580cdbb83614", + "sha256:aaee9905aee35ba5905cfb3c62f3e83b3bec7b39413f0a7f19be4e547ea01ebb", + "sha256:bcd3b13b56ea479b3650b82cabd6b5343a625b0ced5429e4ccad28a8973f301b", + "sha256:c9e348e02e4d2b4a8b2eedb48210430658df6951fa484e59de33ff773fbd4b41", + "sha256:d205b1b46085271b4e15f670058ce182bd1199e56b317bf2ec004b6a44f911f6", + "sha256:d43943ef777f9a1c42bf4e552ba23ac77a6351de620aa9acf64ad54933ad4d34", + "sha256:d5d33e9e7af3b34a40dc05f498939f0ebf187f07c385fd58d591c533ad8562fe", + "sha256:fc0fea399acb12edbf8a628ba8d2312f583bdbdb3335635db062fa98cf71fca4", + "sha256:fe460b922ec15dd205595c9b5b99e2f056fd98ae8f9f56b888e7a17dc2b757e7" ], "markers": "implementation_name == 'cpython' and python_version < '3.8'", - "version": "==1.4.0" + "version": "==1.4.1" }, "wcwidth": { "hashes": [ - "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e", - "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c" + "sha256:8fd29383f539be45b20bd4df0dc29c20ba48654a41e661925e612311e9f3c603", + "sha256:f28b3e8a6483e5d49e7f8949ac1a78314e740333ae305b4ba5defd3e74fb37a8" ], - "version": "==0.1.7" + "version": "==0.1.8" }, "wrapt": { "hashes": [ @@ -453,10 +514,10 @@ }, "zipp": { "hashes": [ - "sha256:3718b1cbcd963c7d4c5511a8240812904164b7f381b647143a89d3b98f9bcd8e", - "sha256:f06903e9f1f43b12d371004b4ac7b06ab39a44adc747266928ae6debfa7b3335" + "sha256:15428d652e993b6ce86694c3cccf0d71aa7afdc6ef1807fa25a920e9444e0281", + "sha256:d9d2efe11d3a3fb9184da550d35bd1319dc8e30a63255927c82bb42fca1f4f7c" ], - "version": "==0.6.0" + "version": "==1.1.0" } } } diff --git a/docker-compose.yml b/docker-compose.yml index 09ed2a1..5c412aa 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -10,4 +10,4 @@ services: volumes: - ./:/home/app/webapp/ env_file: - - .env + - .env \ No newline at end of file diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 20bcdfb..fd3d424 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -347,7 +347,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? 
- result = frdr.get_metadata(identifier, db, user, password, server) + result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -384,14 +384,14 @@ def listRecords( # Get both a provider and client_id from the set provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = frdr.get_metadata_list( - query=search_query, - provider_id=provider_id, - client_id=client_id, - cursor=paging_cursor, server=config.POSTGRES_SERVER, db=config.POSTGRES_DB, user=config.POSTGRES_USER, - password=config.POSTGRES_PASSWORD + password=config.POSTGRES_PASSWORD, + query=search_query, + provider_id=provider_id, + client_id=client_id, + records_cursor=paging_cursor ) records = [] @@ -430,14 +430,14 @@ def listIdentifiers( provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = frdr.get_metadata_list( - query=search_query, - provider_id=provider_id, - client_id=client_id, - cursor=paging_cursor, server=config.POSTGRES_SERVER, db=config.POSTGRES_DB, user=config.POSTGRES_USER, - password=config.POSTGRES_PASSWORD + password=config.POSTGRES_PASSWORD, + query=search_query, + provider_id=provider_id, + client_id=client_id, + records_cursor=paging_cursor ) records = [] @@ -468,7 +468,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = frdr.get_sets() + results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/oai.py b/viringo/oai.py index 04c192e..9ef1b7e 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -16,6 +16,8 @@ from . import metadata from . 
import config +import sys + BP = Blueprint('oai', __name__) class XMLTreeServer(oaipmh.server.XMLTreeServer): diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py new file mode 100644 index 0000000..1ab7fb5 --- /dev/null +++ b/viringo/services/frdr.py @@ -0,0 +1,282 @@ +"""Handles DB queries for retrieving metadata""" + +import psycopg2 +import re +from datetime import datetime +import dateutil.parser +import dateutil.tz +from viringo import config + +class Metadata: + """Represents a DataCite metadata resultset""" + def __init__( + self, + identifier=None, + created_datetime=None, + updated_datetime=None, + xml=None, + metadata_version=None, + titles=None, + creators=None, + subjects=None, + descriptions=None, + publisher=None, + publication_year=None, + dates=None, + contributors=None, + resource_types=None, + funding_references=None, + geo_locations=None, + formats=None, + identifiers=None, + language=None, + relations=None, + rights=None, + sizes=None, + client=None, + active=True + ): + + self.identifier = identifier + self.created_datetime = created_datetime or datetime.min + self.updated_datetime = updated_datetime or datetime.min + self.xml = xml + self.metadata_version = metadata_version + self.titles = titles or [] + self.creators = creators or [] + self.subjects = subjects or [] + self.descriptions = descriptions or [] + self.publisher = publisher + self.publication_year = publication_year + self.dates = dates or [] + self.contributors = contributors or [] + self.resource_types = resource_types or [] + self.funding_references = funding_references or [] + self.geo_locations = geo_locations or [] + self.formats = formats or [] + self.identifiers = identifiers or [] + self.language = language + self.relations = relations or [] + self.rights = rights or [] + self.sizes = sizes or [] + self.client = client + self.active = active + + +def build_metadata(data): + """Parse single FRDR result into metadata object""" + result = Metadata() + + result.identifier = data['record_id'] + + # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely + # This is because OAI always works in UTC. 
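
A note on the parameterised lookups that assemble_record runs further down in this file: psycopg2 binds parameters with the %s (pyformat) placeholder rather than ?, so a parameterised query is written along these lines (the column values here are only examples):

    lookup_cur.execute(
        "SELECT description FROM descriptions WHERE record_id = %s AND language = %s",
        (record["record_id"], "en"),
    )
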
+ created = dateutil.parser.parse(data['pub_date']) + result.created_datetime = created.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + updated = dateutil.parser.parse(data['pub_date']) + result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) + + result.xml = None + result.metadata_version = None + result.titles = [data['title']] + result.creators = data['dc:contributor.author'] + result.subjects = data['dc:subject'] + result.descriptions = data['dc:description'] + result.publisher = data['dc:publisher'] + result.publication_year = dateutil.parser.parse(data['pub_date']).year + result.dates = [data['pub_date']] + result.contributors = data['dc:contributor'] + result.funding_references = [] + result.sizes = [] + result.geo_locations = data['frdr:geospatial'] + result.resource_types = [] + result.formats = [] + result.identifiers = [] + result.language = '' + result.relations = [] + result.rights = data['dc:rights'] + result.client = '' + result.active = True + + return result + + +def construct_local_url(record): + # Check if the local_identifier has already been turned into a url + if "http" in record["local_identifier"].lower(): + return record["local_identifier"] + + # Check for OAI format of identifier (oai:domain:id) + oai_id = None + oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) + if oai_search: + oai_id = oai_search.group(2) + oai_id = oai_id.replace("_", ":") + + # If given a pattern then substitue in the item ID and return it + if "item_url_pattern" in record and record["item_url_pattern"]: + if oai_id: + local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) + else: + local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) + return local_url + + # Check if the identifier is a DOI + doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) + if doi: + doi = doi.group(0).rstrip('\.') + local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + return local_url + + # If the item has a source URL, use it + if ('source_url' in record) and record['source_url']: + return record['source_url'] + + # URL is in the identifier + local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", + record["local_identifier"]) + if local_url: + return local_url.group(0) + + local_url = None + return local_url + + +def rows_to_dict(cursor): + newdict = [] + if cursor: + for r in cursor: + if r: + if isinstance(r, list): + newdict.append(r[0]) + else: + newdict.append(r) + return newdict + + +def assemble_record(record, db, user, password, server): + record["dc:source"] = construct_local_url(record) + if record["dc:source"] is None: + return None + + if int(record["deleted"]) == 1: + return None + + if (len(record['title']) == 0): + return None + + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with con: + lookup_cur = con.cursor(cursor_factory=None) + + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", + (record["record_id"],)) + geodata = lookup_cur.fetchall() + record["frdr:geospatial"] = [] + polycoordinates = [] + + try: + for coordinate in geodata: + if coordinate[0] == "Polygon": + polycoordinates.append([float(coordinate[1]), float(coordinate[2])]) + else: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), 
float(coordinate[2])]}}) + except: + pass + + if polycoordinates: + record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) + + with con: + lookup_cur = con.cursor(cursor_factory=DictCursor) + + # attach the other values to the dict + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor.author"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + record["dc:contributor"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + record["dc:subject"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + record["dc:publisher"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + record["dc:rights"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + record["dc:description"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + record["frdr:description_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + record["frdr:tags"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? 
and tags.language = 'fr' """, (record["record_id"],)) + record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + record["frdr:access"] = rows_to_dict(lookup_cur) + + return record + + +def get_metadata_list( + server, + db, + user, + password, + query=None, + provider_id=None, + client_id=None, + records_cursor=None + ): + + # TODO: support listing by set + if records_cursor is None: + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_cursor.execute(records_sql) + + record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] + for row in record_set: + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + if full_record is not None: + results.append(build_metadata(full_record)) + + return results, records_cursor.rowcount, records_cursor + + +def get_metadata(identifier, db, user, password, server): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with records_con: + records_cursor = records_con.cursor() + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_cursor.execute(records_sql) + row = records_cursor.fetchone() + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + + full_record = assemble_record(record, db, user, password, server) + return build_metadata(full_record) + + +def get_sets(db, user, password, server): + # TODO: this is returning the wrong number of parameters + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + with repos_con: + repos_cursor = repos_con.cursor() + + repos_cursor.execute("SELECT repository_name from repositories") + results = repos_cursor.fetchall() + return results, len(results) From 8d9c829103ae2f9558b7313f2aec12f91889fc4c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Fri, 14 Feb 2020 15:59:39 -0800 Subject: [PATCH 09/67] lots of recasting, postgres cleanups --- viringo/catalogs.py | 29 +++++------------------------ viringo/metadata.py | 2 +- viringo/services/frdr.py | 30 
+++++++++++++++--------------- 3 files changed, 21 insertions(+), 40 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index fd3d424..8b25894 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -489,14 +489,11 @@ def listSets( def build_header(self, result): """Construct a OAI-PMH record header""" - # Provider symbol can just be extracted from the client symbol - provider_symbol, _ = result.client.split(".") - return common.Header( None, - 'doi:' + result.identifier, + 'doi:' + str(result.identifier), result.updated_datetime, - setspec=[provider_symbol, result.client], + setspec=[result.client], deleted=not result.active ) @@ -510,18 +507,6 @@ def build_record(self, metadata): def build_metadata_map(self, result): """Construct a metadata map object for oai metadata writing""" - dates = [] - if result.publication_year: - dates.append(str(result.publication_year)) - dates.extend([date['type'] + ": " + str(date['date']) for date in result.dates]) - - rights = [] - for right in result.rights: - if right['statement']: - rights.append(right['statement']) - if right['uri']: - rights.append(right['uri']) - identifiers = [ identifier_to_string(identifier) for identifier in result.identifiers ] @@ -531,24 +516,20 @@ def build_metadata_map(self, result): for relation in result.relations ] - contributors = [ - contributor.get('name') for contributor in result.contributors - ] - metadata = { 'title': result.titles, 'creator': result.creators, 'subject': result.subjects, 'description': result.descriptions, 'publisher': [result.publisher] if result.publisher else [], - 'contributor': contributors, - 'date': dates, + 'contributor': result.contributors, + 'date': result.dates, 'type': result.resource_types, 'format': result.formats, 'identifier': identifiers, 'relation': relations, 'language': [result.language] if result.language else [], - 'rights': rights, + 'rights': result.rights, 'xml': result.xml, 'set': result.client, 'metadata_version': result.metadata_version diff --git a/viringo/metadata.py b/viringo/metadata.py index 27c7bfb..8b1bccb 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -45,7 +45,7 @@ def nsdc(name): new_element = etree.SubElement(e_dc, nsdc(name)) # The regular expression here is to filter only valid XML chars # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', value) + new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', str(value)) def datacite_writer(element: etree.Element, metadata): """Writer for writing data in a metadata object out into raw datacite format""" diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 1ab7fb5..af04a5d 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -95,7 +95,7 @@ def build_metadata(data): result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = '' + result.client = data['repository_name'] result.active = True return result @@ -169,8 +169,7 @@ def assemble_record(record, db, user, password, server): with con: lookup_cur = con.cursor(cursor_factory=None) - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=?", - (record["record_id"],)) + lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=%s", [record["record_id"]]) geodata = lookup_cur.fetchall() record["frdr:geospatial"] = [] 
polycoordinates = [] @@ -188,40 +187,41 @@ def assemble_record(record, db, user, password, server): record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) with con: + from psycopg2.extras import DictCursor lookup_cur = con.cursor(cursor_factory=DictCursor) # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", (record["record_id"],)) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_id"]]) record["dc:contributor.author"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=%s""", [record["record_id"]]) record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=? AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", (record["record_id"],)) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_id"]]) record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s""", [record["record_id"]]) record["dc:subject"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=%s""", [record["record_id"]]) record["dc:publisher"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=%s""", [record["record_id"]]) 
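(Editorial aside, not part of the patch: the rewritten execute() calls above and below switch from the DB-API "qmark" placeholders ("?", used by e.g. sqlite3) to psycopg2's "%s" style. A self-contained sketch of that style follows; the connection details and record id are hypothetical.)

# Sketch: psycopg2 expects "%s" placeholders with the parameters passed
# separately; values are never interpolated into the SQL string by hand.
import psycopg2

con = psycopg2.connect(dbname="frdr", user="viringo", password="secret",
                       host="localhost", port="5432")   # hypothetical credentials
with con:
    cur = con.cursor()
    cur.execute(
        "SELECT description FROM descriptions WHERE record_id = %s AND language = %s",
        ("1234", "en"))                                  # hypothetical record_id
    descriptions = [row[0] for row in cur.fetchall()]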
record["dc:rights"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='en' "), (record["record_id"],) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='en' ", [record["record_id"]]) record["dc:description"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=? and language='fr' "), (record["record_id"],) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='fr' ", [record["record_id"]]) record["frdr:description_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'en' """, (record["record_id"],)) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'en' """, [record["record_id"]]) record["frdr:tags"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=? and tags.language = 'fr' """, (record["record_id"],)) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'fr' """, [record["record_id"]]) record["frdr:tags_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=?""", (record["record_id"],)) + lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=%s""", [record["record_id"]]) record["frdr:access"] = rows_to_dict(lookup_cur) return record @@ -262,7 +262,7 @@ def get_metadata(identifier, db, user, password, server): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =?""", (identifier,)) + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =%s""", [identifier]) records_cursor.execute(records_sql) row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) From f1d876cbaf3e63b0950874d3bc5f3bc73799296c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Tue, 18 Feb 2020 13:35:57 -0800 Subject: [PATCH 10/67] properly 
support resumption token --- viringo/catalogs.py | 12 +++++++----- viringo/config.py | 4 +++- viringo/services/frdr.py | 38 ++++++++++++++++++++------------------ 3 files changed, 30 insertions(+), 24 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 8b25894..7f5da4c 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -347,7 +347,7 @@ def getRecord(self, metadataPrefix, identifier): # Should we implement this based on source_url and local_identifier the way we currently do for the harvester? - result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) + result = frdr.get_metadata(identifier, db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER, port=config.POSTGRES_PORT) if not result: raise error.IdDoesNotExistError( "\"%s\" is unknown or illegal in this repository" % identifier @@ -388,10 +388,11 @@ def listRecords( db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, + port=config.POSTGRES_PORT, query=search_query, provider_id=provider_id, client_id=client_id, - records_cursor=paging_cursor + cursor=paging_cursor ) records = [] @@ -421,7 +422,7 @@ def listIdentifiers( from_=None, until=None, set=None, - paging_cursor=None + cursor=None ): #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for list of identifiers""" @@ -434,10 +435,11 @@ def listIdentifiers( db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, + port=config.POSTGRES_PORT, query=search_query, provider_id=provider_id, client_id=client_id, - records_cursor=paging_cursor + cursor=paging_cursor ) records = [] @@ -468,7 +470,7 @@ def listSets( batch_size = 50 next_batch = paging_cursor + batch_size - results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER) + results, total_results = frdr.get_sets(db=config.POSTGRES_DB, user=config.POSTGRES_USER, password=config.POSTGRES_PASSWORD, server=config.POSTGRES_SERVER, port=config.POSTGRES_PORT) results = results[paging_cursor: next_batch] if len(results) < batch_size: diff --git a/viringo/config.py b/viringo/config.py index 4a1ec81..baddaa0 100644 --- a/viringo/config.py +++ b/viringo/config.py @@ -29,4 +29,6 @@ # FRDR Postgres user POSTGRES_USER = os.getenv('OAIPMH_POSTGRES_USER', '') # FRDR Postgres password -POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') \ No newline at end of file +POSTGRES_PASSWORD = os.getenv('OAIPMH_POSTGRES_PASSWORD', '') +# FRDR Postgres port +POSTGRES_PORT = os.getenv('OAIPMH_POSTGRES_PORT', '5432') \ No newline at end of file diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index af04a5d..3faf7a4 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -154,7 +154,7 @@ def rows_to_dict(cursor): return newdict -def assemble_record(record, db, user, password, server): +def assemble_record(record, db, user, password, server, port): record["dc:source"] = construct_local_url(record) if record["dc:source"] is None: return None @@ -165,7 +165,7 @@ def assemble_record(record, db, user, password, server): if (len(record['title']) == 0): return None - con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, 
server, port)) with con: lookup_cur = con.cursor(cursor_factory=None) @@ -232,34 +232,36 @@ def get_metadata_list( db, user, password, + port, query=None, provider_id=None, client_id=None, - records_cursor=None + cursor=None ): # TODO: support listing by set - if records_cursor is None: - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) - with records_con: - records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - records_cursor.execute(records_sql) - - record_set = records_cursor.fetchmany(config.RESULT_SET_SIZE) + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) + with records_con: + db_cursor = records_con.cursor() + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + if cursor is not None: + records_sql = records_sql + " OFFSET " + cursor + db_cursor.execute(records_sql) + + record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) results = [] for row in record_set: record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - full_record = assemble_record(record, db, user, password, server) + full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: results.append(build_metadata(full_record)) - return results, records_cursor.rowcount, records_cursor + return results, db_cursor.rowcount, len(record_set) -def get_metadata(identifier, db, user, password, server): - records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) +def get_metadata(identifier, db, user, password, server, port): + records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =%s""", [identifier]) @@ -267,13 +269,13 @@ def get_metadata(identifier, db, user, password, server): row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) - full_record = 
assemble_record(record, db, user, password, server) + full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) -def get_sets(db, user, password, server): +def get_sets(db, user, password, server, port): # TODO: this is returning the wrong number of parameters - repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s'" % (db, user, password, server)) + repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with repos_con: repos_cursor = repos_con.cursor() From e7e6a822df8021bab1c73ad5469356d91fa3ce63 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Wed, 19 Feb 2020 14:27:31 -0800 Subject: [PATCH 11/67] fix calls to get sets and identifiers --- viringo/catalogs.py | 19 +++++++------------ viringo/services/frdr.py | 17 ++++++++--------- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 7f5da4c..a937934 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -380,9 +380,6 @@ def listRecords( # If available get the search query from the set param search_query = set_to_search_query(set) - # From and until parameters aren't supported with FRDR - # Get both a provider and client_id from the set - provider_id, client_id = set_to_provider_client(set) results, total_records, paging_cursor = frdr.get_metadata_list( server=config.POSTGRES_SERVER, db=config.POSTGRES_DB, @@ -390,8 +387,7 @@ def listRecords( password=config.POSTGRES_PASSWORD, port=config.POSTGRES_PORT, query=search_query, - provider_id=provider_id, - client_id=client_id, + set=set, cursor=paging_cursor ) @@ -422,13 +418,13 @@ def listIdentifiers( from_=None, until=None, set=None, - cursor=None + paging_cursor=None ): #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for list of identifiers""" - # Get both a provider and client_id from the set - provider_id, client_id = set_to_provider_client(set) + # If available get the search query from the set param + search_query = set_to_search_query(set) results, total_records, paging_cursor = frdr.get_metadata_list( server=config.POSTGRES_SERVER, @@ -437,8 +433,7 @@ def listIdentifiers( password=config.POSTGRES_PASSWORD, port=config.POSTGRES_PORT, query=search_query, - provider_id=provider_id, - client_id=client_id, + set=set, cursor=paging_cursor ) @@ -482,7 +477,7 @@ def listSets( if results: for identifier, name in results: # Format of a set is setSpec, setName, setDescription - records.append((identifier.upper(), name, None)) + records.append((identifier.split('//')[1].split('/')[0], name, None)) # This differs from the pyoai implementation in that we have to return a cursor here # But this is okay as we have a custom server to handle it. 
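(Editorial aside on the listSets hunk above, not part of the patch: the setSpec is derived from the repository URL by splitting off the scheme and path; the same idea expressed with urllib.parse, using an invented URL, looks like this.)

# Sketch: reduce a repository URL to its hostname for use as an OAI setSpec,
# broadly equivalent to url.split('//')[1].split('/')[0] in the hunk above.
from urllib.parse import urlparse

def url_to_setspec(repository_url):
    return urlparse(repository_url).netloc

print(url_to_setspec("https://dataverse.example.org/some/path"))  # dataverse.example.org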
@@ -493,7 +488,7 @@ def build_header(self, result): return common.Header( None, - 'doi:' + str(result.identifier), + str(result.identifier), result.updated_datetime, setspec=[result.client], deleted=not result.active diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3faf7a4..9467d15 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -86,16 +86,16 @@ def build_metadata(data): result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] result.contributors = data['dc:contributor'] - result.funding_references = [] + result.funding_references = data['contact'] result.sizes = [] result.geo_locations = data['frdr:geospatial'] - result.resource_types = [] + result.resource_types = [data['frdr:tags']] result.formats = [] result.identifiers = [] result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = data['repository_name'] + result.client = data['repository_url'].split('//')[1].split('/')[0] result.active = True return result @@ -234,16 +234,16 @@ def get_metadata_list( password, port, query=None, - provider_id=None, - client_id=None, + set=None, cursor=None ): - # TODO: support listing by set records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + if set is not None: + records_sql = records_sql + " AND (repos.homepage_url='http://" + set + "/' OR repos.homepage_url='https://" + set + "/')" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) @@ -264,7 +264,7 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =%s""", [identifier]) + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier records_cursor.execute(records_sql) row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) @@ -274,11 +274,10 @@ def get_metadata(identifier, db, user, password, 
server, port): def get_sets(db, user, password, server, port): - # TODO: this is returning the wrong number of parameters repos_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with repos_con: repos_cursor = repos_con.cursor() - repos_cursor.execute("SELECT repository_name from repositories") + repos_cursor.execute("SELECT homepage_url, repository_name from repositories") results = repos_cursor.fetchall() return results, len(results) From 6352d5b576e4036d6083488e7a0bc0d276f98563 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 16 Apr 2020 14:33:21 -0700 Subject: [PATCH 12/67] Add new set for openaire_data Do not transform setSpec URL (keep as homepage_url) --- viringo/catalogs.py | 2 +- viringo/services/frdr.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index a937934..c103cef 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -477,7 +477,7 @@ def listSets( if results: for identifier, name in results: # Format of a set is setSpec, setName, setDescription - records.append((identifier.split('//')[1].split('/')[0], name, None)) + records.append((identifier, name, None)) # This differs from the pyoai implementation in that we have to return a cursor here # But this is okay as we have a custom server to handle it. diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 9467d15..3047c6d 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -95,7 +95,7 @@ def build_metadata(data): result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = data['repository_url'].split('//')[1].split('/')[0] + result.client = data['homepage_url'] result.active = True return result @@ -241,17 +241,18 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" - if set is not None: - records_sql = records_sql + " AND (repos.homepage_url='http://" + set + "/' OR repos.homepage_url='https://" + set + "/')" + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + if set is not None and set != 'openaire_data': + records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) + results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 
'last_crawl_timestamp'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -264,10 +265,10 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) @@ -280,4 +281,7 @@ def get_sets(db, user, password, server, port): repos_cursor.execute("SELECT homepage_url, repository_name from repositories") results = repos_cursor.fetchall() + + results.append(['openaire_data', 'OpenAIRE']) + return results, len(results) From 321abe00b32b96680259756e6627f2b9b40a6672 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 16 Apr 2020 18:04:20 -0700 Subject: [PATCH 13/67] Construct XML from FRDR metadata for oai_datacite --- viringo/services/frdr.py | 90 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3047c6d..852df97 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -6,6 +6,7 @@ import dateutil.parser import dateutil.tz from viringo import config +import xml.etree.cElementTree as ET class Metadata: """Represents a DataCite metadata resultset""" @@ -62,6 +63,93 @@ def __init__( self.client = client self.active = active +def construct_datacite_xml(data): + resource = ET.Element("resource") + resource.set("xmlns", "http://datacite.org/schema/kernel-4") + resource.set("xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance") + resource.set("xsi:schemaLocation", + "http://datacite.org/schema/kernel-4 
http://schema.datacite.org/meta/kernel-4/metadata.xsd") + + # Add resource URL as identifier + identifier = ET.SubElement(resource, "identifier") + identifier.set("identifierType", "URL") + identifier.text = data['source_url'] + if data['source_url'] == '': + if data['item_url_pattern'] != '' and "%id%" in data['item_url_pattern'] and data['local_identifier'] != '': + identifier.text = data['item_url_pattern'].replace("%id%", data['local_identifier']) + + # Add creators + creators = ET.SubElement(resource, "creators") + for creator_entry in data['dc:contributor.author']: + creator = ET.SubElement(creators, "creator") + creatorName = ET.SubElement(creator, "creatorName") + creatorName.text = creator_entry + + # Add title + titles = ET.SubElement(resource, "titles") + title = ET.SubElement(titles, "title") + title.text = data['title'] + + # Add publisher + publisher = ET.SubElement(resource, "publisher") + publisher.text = data['repository_name'] + + # Add publication year + publicationyear = ET.SubElement(resource, "publicationyear") + publicationyear.text = data['pub_date'][:4] + + # Add subjects + subject_and_tags = [] + subjects = ET.SubElement(resource, "subjects") + for subject_entry in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: + if subject_entry not in subject_and_tags: + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.text = subject_entry + + # Add dates + dates = ET.SubElement(resource, "dates") + date = ET.SubElement(dates, "date") + date.set("dateType", "Issued") + date.text = data['pub_date'] + + # Add resourceType + resourceType = ET.SubElement(resource, "resourceType") + resourceType.set("resourceTypeGeneral", "Dataset") + resourceType.text = "Dataset" + + # Add alternateIdentifiers + alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") + alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") + alternateIdentifier.set("alternateIdentifierType", "local") + alternateIdentifier.text = data['local_identifier'] + + # Add relatedIdentifiers (series) + if data['series'] != "": + relatedIdentifiers = ET.SubElement(resource, "relatedIdentifiers") + relatedIdentifier = ET.SubElement(relatedIdentifiers, "relatedIdentifier") + relatedIdentifier.set("relationType", "isPartOf") + relatedIdentifier.text = data['series'] + + # Add rightsList + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights']: + rights = ET.SubElement(rightsList, "rights") + rights.text = rights_entry + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() + + # Add description(s) + descriptions = ET.SubElement(resource, "descriptions") + for description_entry in data['dc:description'] + data['frdr:description_fr']: + if description_entry != "": + description = ET.SubElement(descriptions, "description") + description.set("descriptionType", "Abstract") + description.text = description_entry + xml_string = ET.tostring(resource) + print(xml_string) + return xml_string def build_metadata(data): """Parse single FRDR result into metadata object""" @@ -76,7 +164,7 @@ def build_metadata(data): updated = dateutil.parser.parse(data['pub_date']) result.updated_datetime = updated.astimezone(dateutil.tz.UTC).replace(tzinfo=None) - result.xml = None + result.xml = construct_datacite_xml(data) result.metadata_version = None result.titles = [data['title']] result.creators = 
data['dc:contributor.author'] From fbd7aae603acd350c57b456ff7d65f64931095be Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:42:36 -0700 Subject: [PATCH 14/67] Only include rightsList if there is a rights entry --- viringo/services/frdr.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 852df97..47e2f73 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -132,13 +132,14 @@ def construct_datacite_xml(data): relatedIdentifier.text = data['series'] # Add rightsList - rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights']: - rights = ET.SubElement(rightsList, "rights") - rights.text = rights_entry - if "http" in rights_entry: - rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + if len(data['dc:rights']) > 0: + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights']: + rights = ET.SubElement(rightsList, "rights") + rights.text = rights_entry + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() # Add description(s) descriptions = ET.SubElement(resource, "descriptions") From b39e5f3e675115b944a2e6e9a2c7917a27e988c1 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:43:04 -0700 Subject: [PATCH 15/67] Hardcode "Dataset" for resourcetypes (for Dublin Core) --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 47e2f73..a500acd 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -178,7 +178,7 @@ def build_metadata(data): result.funding_references = data['contact'] result.sizes = [] result.geo_locations = data['frdr:geospatial'] - result.resource_types = [data['frdr:tags']] + result.resource_types = ['Dataset'] result.formats = [] result.identifiers = [] result.language = '' From fde0bd8aee837ed833c3cffa6ff2e8e95eeee58f Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:45:20 -0700 Subject: [PATCH 16/67] Include subjects and tags in dc:subjects, deduplicate when needed --- viringo/services/frdr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index a500acd..7c617f9 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -169,7 +169,14 @@ def build_metadata(data): result.metadata_version = None result.titles = [data['title']] result.creators = data['dc:contributor.author'] - result.subjects = data['dc:subject'] + result.subjects = [] + + # De-duplicate subjects and tags + for subject in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: + if subject not in result.subjects: + result.subjects.append(subject) + + # TODO: Add French description result.descriptions = data['dc:description'] result.publisher = data['dc:publisher'] result.publication_year = dateutil.parser.parse(data['pub_date']).year From 6a074251177dd1719d342df2ea1805a78b57fd86 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:49:50 -0700 Subject: [PATCH 17/67] Use dateutil to parse year for oai_datacite --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 
7c617f9..0803f9e 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -96,7 +96,7 @@ def construct_datacite_xml(data): # Add publication year publicationyear = ET.SubElement(resource, "publicationyear") - publicationyear.text = data['pub_date'][:4] + publicationyear.text = dateutil.parser.parse(data['pub_date']).year # Add subjects subject_and_tags = [] From d9de944145688c34cbe0b549c291ce967451079d Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 09:50:55 -0700 Subject: [PATCH 18/67] Revert "Use dateutil to parse year for oai_datacite" This reverts commit 6a074251177dd1719d342df2ea1805a78b57fd86. --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 0803f9e..7c617f9 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -96,7 +96,7 @@ def construct_datacite_xml(data): # Add publication year publicationyear = ET.SubElement(resource, "publicationyear") - publicationyear.text = dateutil.parser.parse(data['pub_date']).year + publicationyear.text = data['pub_date'][:4] # Add subjects subject_and_tags = [] From 229ae49f774a061146d4ec328f38423f4478637c Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 14:34:37 -0700 Subject: [PATCH 19/67] Remove contact info (email) from funding_references and remove print for datacite XML --- viringo/services/frdr.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 7c617f9..3f56323 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -149,7 +149,6 @@ def construct_datacite_xml(data): description.set("descriptionType", "Abstract") description.text = description_entry xml_string = ET.tostring(resource) - print(xml_string) return xml_string def build_metadata(data): @@ -182,7 +181,7 @@ def build_metadata(data): result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] result.contributors = data['dc:contributor'] - result.funding_references = data['contact'] + result.funding_references = '' result.sizes = [] result.geo_locations = data['frdr:geospatial'] result.resource_types = ['Dataset'] From cf17a3a8ec7bca8aac6fe2e26fd0b4a8f4d8c611 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 15:41:09 -0700 Subject: [PATCH 20/67] Add FRDR as contributor with type HostingInstitution --- viringo/services/frdr.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3f56323..917980c 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -107,6 +107,13 @@ def construct_datacite_xml(data): subject = ET.SubElement(subjects, "subject") subject.text = subject_entry + # Add FRDR as HostingInstituton + contributors = ET.SubElement(resource, "contributors") + contributor = ET.SubElement(contributors, "contributor") + contributor.set("contributorType", "HostingInstitution") + contributorName = ET.SubElement(contributor, "contributorName") + contributorName.text = "Federated Research Data Repository / dépôt fédéré de données de recherche" + # Add dates dates = ET.SubElement(resource, "dates") date = ET.SubElement(dates, "date") From 1c9139258cce45666942f7d96bbdf56f782f34e1 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 17 Apr 2020 15:42:53 -0700 Subject: [PATCH 21/67] Include access in rightsList where applicable. 
Remove container elements (rightsList, descriptions) when empty --- viringo/services/frdr.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 917980c..c781826 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -139,14 +139,17 @@ def construct_datacite_xml(data): relatedIdentifier.text = data['series'] # Add rightsList - if len(data['dc:rights']) > 0: - rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights']: + rightsList = ET.SubElement(resource, "rightsList") + for rights_entry in data['dc:rights'] + data['frdr:access']: + if rights_entry != '': rights = ET.SubElement(rightsList, "rights") rights.text = rights_entry if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) rights.text = rights_entry[:rights_entry.find("http")].strip() + # If rightsList is empty, remove it + if len(rightsList) == 0: + resource.remove(rightsList) # Add description(s) descriptions = ET.SubElement(resource, "descriptions") @@ -155,6 +158,10 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.text = description_entry + # If descriptions is empty, remove it + if len(descriptions) == 0: + resource.remove(descriptions) + xml_string = ET.tostring(resource) return xml_string From fa46e9272c56623f508ed90a83b92c00f83009f7 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Mon, 20 Apr 2020 09:31:03 -0700 Subject: [PATCH 22/67] always use datacite unless frdr configuration provided --- viringo/oai.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/viringo/oai.py b/viringo/oai.py index 9ef1b7e..5e4fbc7 100644 --- a/viringo/oai.py +++ b/viringo/oai.py @@ -97,13 +97,10 @@ def handleVerb(self, verb, kw): def get_oai_server(): """Returns a pyoai server object that can process and return OAI requests""" if 'oai' not in g: - if config.CATALOG_SET == 'DateCite': - catalog_server = DataCiteOAIServer() - elif config.CATALOG_SET == 'FRDR': + if config.CATALOG_SET == 'FRDR': catalog_server = FRDROAIServer() else: - print('No valid metadata catalog configured') - sys.exit(1) + catalog_server = DataCiteOAIServer() metadata_registry = oaipmh.metadata.MetadataRegistry() metadata_registry.registerWriter('oai_dc', metadata.oai_dc_writer) From 6a49965f12c8da88bab52fd869995a551d2999ad Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Apr 2020 13:47:53 -0700 Subject: [PATCH 23/67] support from and until parameters for frdr --- viringo/catalogs.py | 4 ++++ viringo/services/frdr.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index c103cef..43b46be 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -388,6 +388,8 @@ def listRecords( port=config.POSTGRES_PORT, query=search_query, set=set, + from_datetime=from_, + until_datetime=until, cursor=paging_cursor ) @@ -434,6 +436,8 @@ def listIdentifiers( port=config.POSTGRES_PORT, query=search_query, set=set, + from_datetime=from_, + until_datetime=until, cursor=paging_cursor ) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index c781826..db37697 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -344,6 +344,8 @@ def get_metadata_list( port, query=None, set=None, + from_datetime=None, + until_datetime=None, cursor=None ): @@ -353,6 +355,10 @@ def get_metadata_list( records_sql = 
"""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" + if from_datetime is not None: + records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" + if until_datetime is not None: + records_sql = records_sql + " AND recs.pub_date<'" + until_datetime + "'" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) From 6ab32f16f676f6536a0fc2d93bc91ec7a527bde7 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 23 Apr 2020 13:50:37 -0700 Subject: [PATCH 24/67] Use URL for identifier, remove dx from DOIs --- viringo/services/frdr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index db37697..192b8ad 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -169,7 +169,7 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = data['record_id'] + result.identifier = data['dc:source'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. @@ -234,7 +234,7 @@ def construct_local_url(record): doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) if doi: doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://dx.doi.org/", doi) + local_url = re.sub("(doi|DOI):\s?", "https://doi.org/", doi) return local_url # If the item has a source URL, use it From 14a16ab56724bb75071560a0d70b9f39def95119 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 23 Apr 2020 13:54:26 -0700 Subject: [PATCH 25/67] update deletedRecord policy for frdr --- viringo/catalogs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 43b46be..e944bd7 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -294,7 +294,7 @@ def identify(self): protocolVersion="2.0", adminEmails=[config.OAIPMH_ADMIN_EMAIL], earliestDatestamp=datetime(2011, 1, 1), - deletedRecord='persistent', + deletedRecord='no', granularity='YYYY-MM-DDThh:mm:ssZ', compression=['gzip', 'deflate'], toolkit_description=False) From 369a478adcc3d8146ca439ce73780ba3ef239e1b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 7 May 2020 16:38:20 -0700 Subject: [PATCH 26/67] Use database pk for identifiers and add URL to the identifiers list (appears in oai_dc) --- viringo/catalogs.py | 4 +--- viringo/services/frdr.py | 4 +++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index e944bd7..4d708c3 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -508,9 +508,7 @@ def build_record(self, metadata): def build_metadata_map(self, result): """Construct a metadata map object for oai metadata writing""" - identifiers = [ - identifier_to_string(identifier) for identifier in result.identifiers - ] + identifiers = result.identifiers relations = [ identifier_to_string(relation) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 192b8ad..397fecf 100644 --- 
a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -169,7 +169,7 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = data['dc:source'] + result.identifier = data['record_id'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. @@ -207,6 +207,8 @@ def build_metadata(data): result.client = data['homepage_url'] result.active = True + result.identifiers.append(construct_local_url(data)) + return result From e2610cfeb09ea8bd752eaea7d28b4cd676920971 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 25 May 2020 17:48:24 -0700 Subject: [PATCH 27/67] Use item_url as OAI identifier with "oai:" prefix --- viringo/services/frdr.py | 66 ++++++++-------------------------------- 1 file changed, 13 insertions(+), 53 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 397fecf..0757764 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -73,10 +73,7 @@ def construct_datacite_xml(data): # Add resource URL as identifier identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "URL") - identifier.text = data['source_url'] - if data['source_url'] == '': - if data['item_url_pattern'] != '' and "%id%" in data['item_url_pattern'] and data['local_identifier'] != '': - identifier.text = data['item_url_pattern'].replace("%id%", data['local_identifier']) + identifier.text = data['item_url'] # Add creators creators = ET.SubElement(resource, "creators") @@ -169,7 +166,7 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = data['record_id'] + result.identifier = "oai:" + data['item_url'] # Add oai: to identifier URL # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. 
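(Editorial aside on the identifier hunk above, not part of the patch: records are now identified by their item URL with an "oai:" prefix; a tiny round-trip illustration with an invented URL is below.)

# Sketch: build an OAI identifier from an item URL and split it back apart.
item_url = "https://www.example.org/dataset/42"     # hypothetical item_url value
oai_identifier = "oai:" + item_url

prefix, recovered = oai_identifier.split(":", 1)    # split on the first ":" only
assert prefix == "oai" and recovered == item_url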
@@ -207,52 +204,11 @@ def build_metadata(data): result.client = data['homepage_url'] result.active = True - result.identifiers.append(construct_local_url(data)) + result.identifiers.append(data['item_url']) return result -def construct_local_url(record): - # Check if the local_identifier has already been turned into a url - if "http" in record["local_identifier"].lower(): - return record["local_identifier"] - - # Check for OAI format of identifier (oai:domain:id) - oai_id = None - oai_search = re.search("oai:(.+):(.+)", record["local_identifier"]) - if oai_search: - oai_id = oai_search.group(2) - oai_id = oai_id.replace("_", ":") - - # If given a pattern then substitue in the item ID and return it - if "item_url_pattern" in record and record["item_url_pattern"]: - if oai_id: - local_url = re.sub("(\%id\%)", oai_id, record["item_url_pattern"]) - else: - local_url = re.sub("(\%id\%)", record["local_identifier"], record["item_url_pattern"]) - return local_url - - # Check if the identifier is a DOI - doi = re.search("(doi|DOI):\s?\S+", record["local_identifier"]) - if doi: - doi = doi.group(0).rstrip('\.') - local_url = re.sub("(doi|DOI):\s?", "https://doi.org/", doi) - return local_url - - # If the item has a source URL, use it - if ('source_url' in record) and record['source_url']: - return record['source_url'] - - # URL is in the identifier - local_url = re.search("(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?", - record["local_identifier"]) - if local_url: - return local_url.group(0) - - local_url = None - return local_url - - def rows_to_dict(cursor): newdict = [] if cursor: @@ -266,8 +222,8 @@ def rows_to_dict(cursor): def assemble_record(record, db, user, password, server, port): - record["dc:source"] = construct_local_url(record) - if record["dc:source"] is None: + + if record["item_url"] is None: return None if int(record["deleted"]) == 1: @@ -354,7 +310,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" if from_datetime is not None: @@ -369,7 +325,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 
'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -382,10 +338,14 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.record_id =""" + identifier + records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, + recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, + repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, + repos.homepage_url FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id AND recs.item_url =\'""" + identifier[4:] + "\'") # use identifier substring excluding oai: prefix records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) From 36bb95b03b4ec069ba4af0c36e7e94c49b3bb7ce Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 27 May 2020 11:29:03 -0700 Subject: [PATCH 28/67] Capitalize publicationYear --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 0757764..d179360 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -92,7 +92,7 @@ def construct_datacite_xml(data): publisher.text = data['repository_name'] # Add publication year - publicationyear = ET.SubElement(resource, "publicationyear") + publicationyear = ET.SubElement(resource, "publicationYear") publicationyear.text = data['pub_date'][:4] # Add subjects From 174b8708ef5cea2877446c160eabaffb8edb5613 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 27 May 2020 11:29:29 -0700 Subject: [PATCH 29/67] Move series into description with SeriesInformation type --- viringo/services/frdr.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index d179360..dda7a35 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -128,13 +128,6 @@ def construct_datacite_xml(data): alternateIdentifier.set("alternateIdentifierType", 
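The GetRecord lookup above recovers the item URL by slicing off the first four characters of the OAI identifier and splicing the result into the SQL string. A short hedged alternative (helper name illustrative) that checks the prefix explicitly and leaves quoting to the driver:

    def item_url_from_identifier(identifier):
        """Return the item URL from an 'oai:<item_url>' identifier, or None if the prefix is missing."""
        prefix = "oai:"
        return identifier[len(prefix):] if identifier.startswith(prefix) else None

    # The lookup can then bind the value instead of concatenating it, e.g.:
    #   records_cursor.execute("SELECT ... WHERE recs.item_url = %s",
    #                          [item_url_from_identifier(identifier)])
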
"local") alternateIdentifier.text = data['local_identifier'] - # Add relatedIdentifiers (series) - if data['series'] != "": - relatedIdentifiers = ET.SubElement(resource, "relatedIdentifiers") - relatedIdentifier = ET.SubElement(relatedIdentifiers, "relatedIdentifier") - relatedIdentifier.set("relationType", "isPartOf") - relatedIdentifier.text = data['series'] - # Add rightsList rightsList = ET.SubElement(resource, "rightsList") for rights_entry in data['dc:rights'] + data['frdr:access']: @@ -155,6 +148,13 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.text = description_entry + + # Add series (series) + if data['series'] != "": + description_series = ET.SubElement(descriptions, "description") + description_series.set("descriptionType", "SeriesInformation") + description_series.text = data['series'] + # If descriptions is empty, remove it if len(descriptions) == 0: resource.remove(descriptions) From bc8143fcc0013fe1905cf9119745db6b35dae1c6 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 28 May 2020 16:06:52 -0700 Subject: [PATCH 30/67] Put openaire_data set first in list --- viringo/services/frdr.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index dda7a35..251593b 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -356,9 +356,10 @@ def get_sets(db, user, password, server, port): with repos_con: repos_cursor = repos_con.cursor() - repos_cursor.execute("SELECT homepage_url, repository_name from repositories") - results = repos_cursor.fetchall() - + results = [] results.append(['openaire_data', 'OpenAIRE']) - return results, len(results) + repos_cursor.execute("SELECT homepage_url, repository_name from repositories") + results.extend(repos_cursor.fetchall()) + + return results, len(results) \ No newline at end of file From 355c351a3275350a2d6dac56612fbed965c06fc3 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 2 Jul 2020 13:17:39 -0700 Subject: [PATCH 31/67] Only include rights entries that have a URL --- viringo/services/frdr.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 251593b..583bfca 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -130,13 +130,11 @@ def construct_datacite_xml(data): # Add rightsList rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights'] + data['frdr:access']: - if rights_entry != '': + for rights_entry in data['dc:rights']: + if rights_entry != '' and "http" in rights_entry: rights = ET.SubElement(rightsList, "rights") - rights.text = rights_entry - if "http" in rights_entry: - rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() # If rightsList is empty, remove it if len(rightsList) == 0: resource.remove(rightsList) From e28231ff0fd2ff66daba4c1030ef77b367379e7a Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 2 Jul 2020 13:18:28 -0700 Subject: [PATCH 32/67] Update contributor to have xml:lang attribute, separate en/fr --- viringo/services/frdr.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/viringo/services/frdr.py 
b/viringo/services/frdr.py index 583bfca..ee9bde0 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -106,10 +106,16 @@ def construct_datacite_xml(data): # Add FRDR as HostingInstituton contributors = ET.SubElement(resource, "contributors") - contributor = ET.SubElement(contributors, "contributor") - contributor.set("contributorType", "HostingInstitution") - contributorName = ET.SubElement(contributor, "contributorName") - contributorName.text = "Federated Research Data Repository / dépôt fédéré de données de recherche" + contributor_en = ET.SubElement(contributors, "contributor") + contributor_en.set("contributorType", "HostingInstitution") + contributor_en.set("xml:lang", "en") + contributorName_en = ET.SubElement(contributor_en, "contributorName") + contributorName_en.text = "Federated Research Data Repository" + contributor_fr = ET.SubElement(contributors, "contributor") + contributor_fr.set("contributorType", "HostingInstitution") + contributor_fr.set("xml:lang", "fr") + contributorName_fr = ET.SubElement(contributor_fr, "contributorName") + contributorName_fr.text = "Dépôt fédéré de données de recherche" # Add dates dates = ET.SubElement(resource, "dates") From 0e7e3fa6732bf1146e7439c203177d3f9007e21c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 2 Jul 2020 14:06:59 -0700 Subject: [PATCH 33/67] fix resetting cursor position after two pages --- viringo/services/frdr.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ee9bde0..1fea665 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -335,7 +335,10 @@ def get_metadata_list( if full_record is not None: results.append(build_metadata(full_record)) - return results, db_cursor.rowcount, len(record_set) + if cursor is not None: + return results, db_cursor.rowcount, (len(record_set) + cursor) + else: + return results, db_cursor.rowcount, len(record_set) def get_metadata(identifier, db, user, password, server, port): From 2eb82e6c409c14b9e3dbc52511cbc5623c65e402 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 13 Jul 2020 11:59:16 -0700 Subject: [PATCH 34/67] Include rights entries that do not have a URL --- viringo/services/frdr.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 1fea665..7fc9f53 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -136,11 +136,14 @@ def construct_datacite_xml(data): # Add rightsList rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights']: - if rights_entry != '' and "http" in rights_entry: + for rights_entry in data['dc:rights'] + data['frdr:access']: + if rights_entry != '': rights = ET.SubElement(rightsList, "rights") - rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + rights.text = rights_entry + if "http" in rights_entry: + rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) + rights.text = rights_entry[:rights_entry.find("http")].strip() + # If rightsList is empty, remove it if len(rightsList) == 0: resource.remove(rightsList) From daa5938f54f0d9c2f2bb6cc06125f50592e56e45 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 13 Jul 2020 11:59:43 -0700 Subject: [PATCH 35/67] Add xml:lang for description and subject --- viringo/services/frdr.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 
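The bilingual HostingInstitution entries above rely on setting an attribute literally named xml:lang; ElementTree writes the attribute name verbatim, which is valid XML because the xml prefix is predefined. A self-contained sketch of the same pattern:

    import xml.etree.ElementTree as ET

    contributors = ET.Element("contributors")
    for lang, name in [("en", "Federated Research Data Repository"),
                       ("fr", "Dépôt fédéré de données de recherche")]:
        contributor = ET.SubElement(contributors, "contributor")
        contributor.set("contributorType", "HostingInstitution")
        contributor.set("xml:lang", lang)
        ET.SubElement(contributor, "contributorName").text = name

    print(ET.tostring(contributors, encoding="unicode"))
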
deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 7fc9f53..b4b0586 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -71,6 +71,7 @@ def construct_datacite_xml(data): "http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd") # Add resource URL as identifier + # TODO: Check if the URL is a DOI, reformat and use identifierType="DOI" identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "URL") identifier.text = data['item_url'] @@ -98,10 +99,23 @@ def construct_datacite_xml(data): # Add subjects subject_and_tags = [] subjects = ET.SubElement(resource, "subjects") - for subject_entry in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: - if subject_entry not in subject_and_tags: + for subject_entry in data['dc:subject']: + if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "en") + subject.text = subject_entry + for subject_entry in data['frdr:tags']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "en") + subject.text = subject_entry + for subject_entry in data['frdr:tags_fr']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "fr") subject.text = subject_entry # Add FRDR as HostingInstituton @@ -150,10 +164,17 @@ def construct_datacite_xml(data): # Add description(s) descriptions = ET.SubElement(resource, "descriptions") - for description_entry in data['dc:description'] + data['frdr:description_fr']: + for description_entry in data['dc:description']: + if description_entry != "": + description = ET.SubElement(descriptions, "description") + description.set("descriptionType", "Abstract") + description.set("xml:lang", "en") + description.text = description_entry + for description_entry in data['frdr:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") + description.set("xml:lang", "fr") description.text = description_entry # Add series (series) From 66d8bfe627a5c66397e852f6732e50407bc73e1a Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Mon, 13 Jul 2020 13:35:31 -0700 Subject: [PATCH 36/67] fix parsing single-element list values, make sure cursor is cast as int on reusme --- .gitignore | 4 +++- viringo/metadata.py | 2 ++ viringo/services/frdr.py | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 8c5f805..923fb3b 100644 --- a/.gitignore +++ b/.gitignore @@ -119,4 +119,6 @@ dmypy.json .vscode # Env configs -*.env \ No newline at end of file +*.env + +.DS_Store diff --git a/viringo/metadata.py b/viringo/metadata.py index 4ea2ee8..ff123c6 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -43,6 +43,8 @@ def nsdc(name): ]: for value in _map.get(name, []): if value: + if isinstance(value, list) and len(value) == 1: + value = value[0] new_element = etree.SubElement(e_dc, nsdc(name)) # The regular expression here is to filter only valid XML chars # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index b4b0586..8f297af 100644 --- 
a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -360,7 +360,7 @@ def get_metadata_list( results.append(build_metadata(full_record)) if cursor is not None: - return results, db_cursor.rowcount, (len(record_set) + cursor) + return results, db_cursor.rowcount, (len(record_set) + int(cursor)) else: return results, db_cursor.rowcount, len(record_set) From 7699d722741c792e645f80b870350aad7426e4fc Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 20 Jul 2020 12:32:45 -0700 Subject: [PATCH 37/67] Only include subjects block if not empty --- viringo/services/frdr.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 8f297af..9bc764c 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -118,6 +118,10 @@ def construct_datacite_xml(data): subject.set("xml:lang", "fr") subject.text = subject_entry + # If subjects is empty, remove it + if len(subjects) == 0: + resource.remove(subjects) + # Add FRDR as HostingInstituton contributors = ET.SubElement(resource, "contributors") contributor_en = ET.SubElement(contributors, "contributor") From f6efb24a95f6a55513f03d417e7e84388ab9aafb Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 20 Jul 2020 12:34:38 -0700 Subject: [PATCH 38/67] Use OAI-compliant identifier and query database using local_identifier --- viringo/services/frdr.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 9bc764c..379b941 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -198,7 +198,11 @@ def build_metadata(data): """Parse single FRDR result into metadata object""" result = Metadata() - result.identifier = "oai:" + data['item_url'] # Add oai: to identifier URL + # Construct identifier compliant with OAI spec + namespace = data['homepage_url'].replace("https://", "").replace("www.", "").replace("http://", "") + if namespace[-1] == "/": + namespace = namespace[:-1] + result.identifier = "oai:" + namespace + ":" + data['local_identifier'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely # This is because OAI always works in UTC. 
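The cursor fixes above make the paging cursor an absolute row offset that each response advances by the size of the batch just returned; it comes back from the resumption token as a string, hence the int() cast. A harvesting-side sketch of that arithmetic, assuming a fetch_page(offset, size) callable that returns (batch, total); the helper is illustrative, not part of the codebase:

    def iterate_records(fetch_page, batch_size=50):
        """Walk every page; the cursor is the absolute offset into the full result set."""
        cursor = 0
        while True:
            batch, total = fetch_page(offset=cursor, size=batch_size)
            yield from batch
            cursor += len(batch)            # mirrors len(record_set) + int(cursor)
            if not batch or cursor >= total:
                break
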
@@ -229,15 +233,13 @@ def build_metadata(data): result.geo_locations = data['frdr:geospatial'] result.resource_types = ['Dataset'] result.formats = [] - result.identifiers = [] + result.identifiers = [data['item_url']] result.language = '' result.relations = [] result.rights = data['dc:rights'] result.client = data['homepage_url'] result.active = True - result.identifiers.append(data['item_url']) - return result @@ -370,6 +372,7 @@ def get_metadata_list( def get_metadata(identifier, db, user, password, server, port): + local_identifier = identifier.split(":")[len(identifier.split(":"))-1] # get local_identifier substring from identifier records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() @@ -377,7 +380,7 @@ def get_metadata(identifier, db, user, password, server, port): recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos - WHERE recs.repository_id = repos.repository_id AND recs.item_url =\'""" + identifier[4:] + "\'") # use identifier substring excluding oai: prefix + WHERE recs.repository_id = repos.repository_id AND recs.local_identifier =\'""" + local_identifier + "\'") # use local_identifier records_cursor.execute(records_sql) row = records_cursor.fetchone() record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) From f87f9746648699e56e1aecd97f339d0993e675ab Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 20 Jul 2020 13:00:47 -0700 Subject: [PATCH 39/67] Use info:eu-repo-Access-Terms vocabulary for access metadata (openAccess for Public) --- viringo/services/frdr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 379b941..ac5d035 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -154,13 +154,20 @@ def construct_datacite_xml(data): # Add rightsList rightsList = ET.SubElement(resource, "rightsList") - for rights_entry in data['dc:rights'] + data['frdr:access']: + for rights_entry in data['dc:rights']: if rights_entry != '': rights = ET.SubElement(rightsList, "rights") rights.text = rights_entry if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) rights.text = rights_entry[:rights_entry.find("http")].strip() + for access_entry in data["frdr:access"]: + rights = ET.SubElement(rightsList, "rights") + if access_entry == "Public": + rights.text = "info:eu-repo/semantics/openAccess" + else: + rights.text = "info:eu-repo/semantics/restrictedAccess" + # If rightsList is empty, remove it if len(rightsList) == 0: From c1dda75ad3a5a50a5058b61bd86f57f0dfd59fc3 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 14:15:14 -0700 Subject: [PATCH 40/67] Use repo_oai_name in identifier and for setSpec --- viringo/services/frdr.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ac5d035..5287119 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -206,9 
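The OAI-compliant identifier above is derived from the repository homepage URL: the scheme and "www." are stripped, a trailing slash removed, and the remainder used as the namespace in oai:<namespace>:<local_identifier>. A runnable sketch of that construction (the example URL and local identifier are illustrative):

    def oai_identifier(homepage_url, local_identifier):
        """Build an 'oai:<namespace>:<local_identifier>' identifier from a homepage URL."""
        namespace = (homepage_url.replace("https://", "")
                                 .replace("http://", "")
                                 .replace("www.", ""))
        if namespace.endswith("/"):
            namespace = namespace[:-1]
        return "oai:" + namespace + ":" + local_identifier

    # oai_identifier("https://www.example.org/", "handle/123") -> "oai:example.org:handle/123"

A later patch in this series replaces the derived namespace with repos.repo_oai_name.
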
+206,7 @@ def build_metadata(data): result = Metadata() # Construct identifier compliant with OAI spec - namespace = data['homepage_url'].replace("https://", "").replace("www.", "").replace("http://", "") - if namespace[-1] == "/": - namespace = namespace[:-1] + namespace = data['repo_oai_name'] result.identifier = "oai:" + namespace + ":" + data['local_identifier'] # Here we want to parse a ISO date but convert to UTC and then remove the TZinfo entirely @@ -244,7 +242,7 @@ def build_metadata(data): result.language = '' result.relations = [] result.rights = data['dc:rights'] - result.client = data['homepage_url'] + result.client = data['repo_oai_name'] result.active = True return result @@ -351,9 +349,9 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': - records_sql = records_sql + " AND (repos.homepage_url='" + set + "')" + records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" if until_datetime is not None: @@ -366,7 +364,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -379,18 +377,20 @@ def get_metadata_list( def get_metadata(identifier, db, user, password, server, port): - local_identifier = identifier.split(":")[len(identifier.split(":"))-1] # get local_identifier substring from identifier + namespace = identifier.split(":")[1] + local_identifier = identifier.split(":")[2] # get local_identifier substring from identifier records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.contact, recs.series, recs.source_url, + records_sql = ("""SELECT recs.record_id, 
recs.title, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, - repos.homepage_url FROM records recs, repositories repos - WHERE recs.repository_id = repos.repository_id AND recs.local_identifier =\'""" + local_identifier + "\'") # use local_identifier + repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id + AND recs.local_identifier =\'""" + local_identifier + "\'" + "AND repos.repo_oai_name=\'""" + namespace + "\'") records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'contact', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url'], row))) + record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) @@ -404,7 +404,7 @@ def get_sets(db, user, password, server, port): results = [] results.append(['openaire_data', 'OpenAIRE']) - repos_cursor.execute("SELECT homepage_url, repository_name from repositories") + repos_cursor.execute("SELECT repo_oai_name, repository_name from repositories") results.extend(repos_cursor.fetchall()) return results, len(results) \ No newline at end of file From b9750ab7365be491058ca1d832b1925df0f6274e Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 14:50:06 -0700 Subject: [PATCH 41/67] Add French titles, descriptions, and categories; subject/tags renamed to category/keywords --- viringo/services/frdr.py | 62 +++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 5287119..cd073c1 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -83,10 +83,18 @@ def construct_datacite_xml(data): creatorName = ET.SubElement(creator, "creatorName") creatorName.text = creator_entry - # Add title + # Add titles titles = ET.SubElement(resource, "titles") - title = ET.SubElement(titles, "title") - title.text = data['title'] + if data['title_en'] != "": + title = ET.SubElement(titles, "title") + title.text = data['title_en'] + title.set("xml:lang", "en") + if data['title_fr'] != "": + title = ET.SubElement(titles, "title") + title.text = data['title_fr'] + title.set("xml:lang", "fr") + if data['title_en'] != "": + title.set("titleType", "TranslatedTitle") # Add publisher publisher = ET.SubElement(resource, "publisher") @@ -99,19 +107,25 @@ def construct_datacite_xml(data): # Add subjects subject_and_tags = [] subjects = ET.SubElement(resource, "subjects") - for subject_entry in data['dc:subject']: + for subject_entry in data['frdr:category_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") subject.text = subject_entry - for subject_entry in data['frdr:tags']: + for subject_entry in 
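The lookup above splits the identifier on every colon and takes fixed positions, which loses anything after a second colon inside the local identifier; a later patch in the series switches to prefix stripping plus a single find(":") for exactly this reason. A sketch of an equivalent parse using str.partition (function name and example values illustrative):

    def parse_oai_identifier(identifier):
        """Split 'oai:<namespace>:<local_identifier>' into its two parts, allowing the
        local identifier itself to contain colons."""
        body = identifier[len("oai:"):] if identifier.startswith("oai:") else identifier
        namespace, _, local_identifier = body.partition(":")
        return namespace, local_identifier

    # parse_oai_identifier("oai:example.org:doi:10.1234/abcd")
    # -> ('example.org', 'doi:10.1234/abcd')
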
data['frdr:category_fr']: + if subject_entry not in subject_and_tags and subject_entry != "": + subject_and_tags.append(subject_entry) + subject = ET.SubElement(subjects, "subject") + subject.set("xml:lang", "fr") + subject.text = subject_entry + for subject_entry in data['frdr:keywords_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") subject.text = subject_entry - for subject_entry in data['frdr:tags_fr']: + for subject_entry in data['frdr:keywords_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") @@ -175,13 +189,13 @@ def construct_datacite_xml(data): # Add description(s) descriptions = ET.SubElement(resource, "descriptions") - for description_entry in data['dc:description']: + for description_entry in data['dc:description_en']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "en") description.text = description_entry - for description_entry in data['frdr:description_fr']: + for description_entry in data['dc:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") @@ -218,17 +232,16 @@ def build_metadata(data): result.xml = construct_datacite_xml(data) result.metadata_version = None - result.titles = [data['title']] + result.titles = [data['title_en'], data['title_fr']] result.creators = data['dc:contributor.author'] result.subjects = [] # De-duplicate subjects and tags - for subject in data['dc:subject'] + data['frdr:tags'] + data['frdr:tags_fr']: + for subject in data['frdr:category_en'] + data['frdr:category_fr'] + data['frdr:keywords_en'] + data['frdr:keywords_fr']: if subject not in result.subjects: result.subjects.append(subject) - # TODO: Add French description - result.descriptions = data['dc:description'] + result.descriptions = data['dc:description_en'] + data['dc:description_fr'] result.publisher = data['dc:publisher'] result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] @@ -268,7 +281,7 @@ def assemble_record(record, db, user, password, server, port): if int(record["deleted"]) == 1: return None - if (len(record['title']) == 0): + if (len(record['title_en']) == 0 and len(record['title_fr']) == 0): return None con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) @@ -306,8 +319,11 @@ def assemble_record(record, db, user, password, server, port): lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_id"]]) record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s""", [record["record_id"]]) - record["dc:subject"] = rows_to_dict(lookup_cur) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s and subjects.language = 
'en' """, [record["record_id"]]) + record["frdr:category_en"] = rows_to_dict(lookup_cur) + + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s and subjects.language = 'fr' """, [record["record_id"]]) + record["frdr:category_fr"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=%s""", [record["record_id"]]) record["dc:publisher"] = rows_to_dict(lookup_cur) @@ -316,16 +332,16 @@ def assemble_record(record, db, user, password, server, port): record["dc:rights"] = rows_to_dict(lookup_cur) lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='en' ", [record["record_id"]]) - record["dc:description"] = rows_to_dict(lookup_cur) + record["dc:description_en"] = rows_to_dict(lookup_cur) lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='fr' ", [record["record_id"]]) - record["frdr:description_fr"] = rows_to_dict(lookup_cur) + record["dc:description_fr"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'en' """, [record["record_id"]]) - record["frdr:tags"] = rows_to_dict(lookup_cur) + record["frdr:keywords_en"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'fr' """, [record["record_id"]]) - record["frdr:tags_fr"] = rows_to_dict(lookup_cur) + record["frdr:keywords_fr"] = rows_to_dict(lookup_cur) lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=%s""", [record["record_id"]]) record["frdr:access"] = rows_to_dict(lookup_cur) @@ -349,7 +365,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: @@ -364,7 +380,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 
'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: @@ -382,7 +398,7 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.pub_date, recs.series, recs.source_url, + records_sql = ("""SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos @@ -390,7 +406,7 @@ def get_metadata(identifier, db, user, password, server, port): AND recs.local_identifier =\'""" + local_identifier + "\'" + "AND repos.repo_oai_name=\'""" + namespace + "\'") records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record) From ec43f700342146bf892106b0a3c0a394cce2a623 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 15:04:31 -0700 Subject: [PATCH 42/67] Add DOI as identifier when available --- viringo/services/frdr.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index cd073c1..cb68fbe 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -71,10 +71,14 @@ def construct_datacite_xml(data): "http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd") # Add resource URL as identifier - # TODO: Check if the URL is a DOI, reformat and use identifierType="DOI" identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "URL") identifier.text = data['item_url'] + if "doi.org/" in data['item_url']: + identifier = ET.SubElement(resource, "identifier") + identifier.set("identifierType", "DOI") + identifier.text = data['item_url'].split("doi.org/")[1] + # Add creators creators = ET.SubElement(resource, "creators") From 6a7b5838f1e59aac16c18857bad55408b7c173ca Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 15:10:15 -0700 Subject: [PATCH 43/67] Only include one identifier 
(URL or DOI) --- viringo/services/frdr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index cb68fbe..153169c 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -72,12 +72,12 @@ def construct_datacite_xml(data): # Add resource URL as identifier identifier = ET.SubElement(resource, "identifier") - identifier.set("identifierType", "URL") - identifier.text = data['item_url'] if "doi.org/" in data['item_url']: - identifier = ET.SubElement(resource, "identifier") identifier.set("identifierType", "DOI") identifier.text = data['item_url'].split("doi.org/")[1] + else: + identifier.set("identifierType", "URL") + identifier.text = data['item_url'] # Add creators From 2666785aa940f43fb3746f782fbf839162eb46c4 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 16:04:27 -0700 Subject: [PATCH 44/67] Fix issue where local_identifiers with colons weren't working --- viringo/services/frdr.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 153169c..ad34362 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -397,8 +397,9 @@ def get_metadata_list( def get_metadata(identifier, db, user, password, server, port): - namespace = identifier.split(":")[1] - local_identifier = identifier.split(":")[2] # get local_identifier substring from identifier + identifier = identifier[4:] + namespace = identifier[:identifier.find(":")] + local_identifier = identifier[identifier.find(":")+1:] records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() From 6504bcefc5ba770ccd438c97c7066c3dd40494ab Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 16:10:25 -0700 Subject: [PATCH 45/67] Add contributors to XML metadata (type is unknown, use "Other") --- viringo/services/frdr.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ad34362..ba1ea15 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -140,8 +140,15 @@ def construct_datacite_xml(data): if len(subjects) == 0: resource.remove(subjects) - # Add FRDR as HostingInstituton + # Add contributors (contributorType "Other") contributors = ET.SubElement(resource, "contributors") + for contributor_entry in data["dc:contributor"]: + contributor = ET.SubElement(contributors, "contributor") + contributor.set("contributorType", "Other") + contributorName = ET.SubElement(contributor, "contributorName") + contributorName.text = contributor_entry + + # Add FRDR as HostingInstituton contributor_en = ET.SubElement(contributors, "contributor") contributor_en.set("contributorType", "HostingInstitution") contributor_en.set("xml:lang", "en") From ca5df37ced0830e9da6c7b8d658c3de4df5704f0 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Fri, 24 Jul 2020 16:22:23 -0700 Subject: [PATCH 46/67] Add openAccess statement for records without explicit access statement (these are public) --- viringo/services/frdr.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ba1ea15..6a44f3f 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -186,17 +186,16 @@ def construct_datacite_xml(data): if "http" in rights_entry: rights.set("rightsURI", 
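With the patches above, the DataCite record carries exactly one primary identifier: the DOI when the landing URL is a doi.org link, otherwise the URL itself. A sketch of that decision as a helper; the name and example values are illustrative:

    def datacite_identifier(item_url):
        """Return (identifierType, value) following the one-identifier rule above."""
        if "doi.org/" in item_url:
            return "DOI", item_url.split("doi.org/", 1)[1]
        return "URL", item_url

    # datacite_identifier("https://doi.org/10.1234/abcd") -> ('DOI', '10.1234/abcd')
    # datacite_identifier("https://example.org/item/42")  -> ('URL', 'https://example.org/item/42')
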
rights_entry[rights_entry.find("http"):].strip()) rights.text = rights_entry[:rights_entry.find("http")].strip() - for access_entry in data["frdr:access"]: + if len(data["frdr:access"]) > 0: + for access_entry in data["frdr:access"]: + rights = ET.SubElement(rightsList, "rights") + if access_entry == "Public": + rights.text = "info:eu-repo/semantics/openAccess" + else: + rights.text = "info:eu-repo/semantics/restrictedAccess" + else: # Assume Public/openAccess rights = ET.SubElement(rightsList, "rights") - if access_entry == "Public": - rights.text = "info:eu-repo/semantics/openAccess" - else: - rights.text = "info:eu-repo/semantics/restrictedAccess" - - - # If rightsList is empty, remove it - if len(rightsList) == 0: - resource.remove(rightsList) + rights.text = "info:eu-repo/semantics/openAccess" # Add description(s) descriptions = ET.SubElement(resource, "descriptions") From 35fab8f390c1ff1c6ea3eed4fa98edc0fd2ba38b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 4 Aug 2020 15:25:57 -0700 Subject: [PATCH 47/67] Use rightsURI for eu-repo/semantics terms --- viringo/services/frdr.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 6a44f3f..b3f25dd 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -190,12 +190,12 @@ def construct_datacite_xml(data): for access_entry in data["frdr:access"]: rights = ET.SubElement(rightsList, "rights") if access_entry == "Public": - rights.text = "info:eu-repo/semantics/openAccess" + rights.set("rightsURI", "info:eu-repo/semantics/openAccess") else: - rights.text = "info:eu-repo/semantics/restrictedAccess" + rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") else: # Assume Public/openAccess rights = ET.SubElement(rightsList, "rights") - rights.text = "info:eu-repo/semantics/openAccess" + rights.set("rightsURI", "info:eu-repo/semantics/openAccess") # Add description(s) descriptions = ET.SubElement(resource, "descriptions") From 1fee002f682a3eae9a1b9d3449fe47c849d877aa Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 10 Aug 2020 14:26:58 -0700 Subject: [PATCH 48/67] Only retrieve records with pub_date --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index b3f25dd..11b2868 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -375,7 +375,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND 
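The access handling above maps FRDR access values onto the info:eu-repo Access-Terms vocabulary, carried in rightsURI, with openAccess assumed when a record has no explicit access statement. A sketch of that mapping (function name illustrative):

    import xml.etree.ElementTree as ET

    def add_access_rights(rights_list, access_entries):
        """Map FRDR access values to info:eu-repo rightsURI terms; treat records
        without an access statement as Public/openAccess."""
        for entry in (access_entries or ["Public"]):
            rights = ET.SubElement(rights_list, "rights")
            uri = ("info:eu-repo/semantics/openAccess" if entry == "Public"
                   else "info:eu-repo/semantics/restrictedAccess")
            rights.set("rightsURI", uri)
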
recs.pub_date!=''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: From b07edd27ede51542a77499f9461574e1cda6a0b7 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 10 Sep 2020 11:05:25 -0700 Subject: [PATCH 49/67] fix null value handling and order by record ids to facilitate debugging --- Pipfile | 1 + viringo/metadata.py | 8 +++++--- viringo/services/frdr.py | 3 ++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Pipfile b/Pipfile index 679071b..ba09f38 100644 --- a/Pipfile +++ b/Pipfile @@ -13,6 +13,7 @@ python-dotenv = "*" [packages] flask = "*" +ftfy = "*" pyoai = "*" psycopg2-binary = "*" requests = "*" diff --git a/viringo/metadata.py b/viringo/metadata.py index ff123c6..ba195ee 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -1,6 +1,7 @@ """This module deals with handling the representation of metadata formats for OAI""" import re +import ftfy from lxml import etree NS_OAIPMH = 'http://www.openarchives.org/OAI/2.0/' @@ -46,9 +47,10 @@ def nsdc(name): if isinstance(value, list) and len(value) == 1: value = value[0] new_element = etree.SubElement(e_dc, nsdc(name)) - # The regular expression here is to filter only valid XML chars - # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] - new_element.text = re.sub(u'[^\u0020-\uD7FF\u0009\u000A\u000D\uE000-\uFFFD\U00010000-\U0010FFFF]+', '', value) + if value is not None: + new_element.text = ftfy.fix_text(value) + else: + new_element.text = '' def datacite_writer(element: etree.Element, metadata): """Writer for writing data in a metadata object out into raw datacite format""" diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 11b2868..dbca2aa 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -375,13 +375,14 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date!=''""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" if until_datetime is not None: records_sql = records_sql + " AND recs.pub_date<'" + until_datetime + "'" + records_sql = records_sql + " ORDER BY recs.record_id" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) From 
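The oai_dc writer above now runs every value through ftfy.fix_text, which repairs mojibake (text decoded with the wrong encoding somewhere upstream) rather than merely stripping characters that are invalid in XML, and the added None check keeps null metadata values from raising. A quick illustration of what fix_text repairs, using an assumed mis-decoded French string:

    import ftfy

    # UTF-8 bytes that were decoded as Latin-1 at some point upstream:
    print(ftfy.fix_text("dÃ©pÃ´t fÃ©dÃ©rÃ© de donnÃ©es de recherche"))
    # -> dépôt fédéré de données de recherche
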
6523551c077607df8b6a679fbe73477a5c55a17f Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Thu, 10 Sep 2020 11:57:17 -0700 Subject: [PATCH 50/67] fix reporting totals and paging at end of listrecords --- Pipfile.lock | 6 ++++++ viringo/catalogs.py | 5 +++++ viringo/services/frdr.py | 11 ++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/Pipfile.lock b/Pipfile.lock index 9bf8991..7d9a934 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -66,6 +66,12 @@ "index": "pypi", "version": "==1.1.1" }, + "ftfy": { + "hashes": [ + "sha256:51c7767f8c4b47d291fcef30b9625fb5341c06a31e6a3b627039c706c42f3720" + ], + "version": "==5.8" + }, "idna": { "hashes": [ "sha256:c357b3f628cf53ae2c4c05627ecc484553142ca23264e593d327bcde5e9c3407", diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 4d708c3..87664f4 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -374,6 +374,7 @@ def listRecords( set=None, paging_cursor=None ): + #pylint: disable=no-self-use,invalid-name """Returns pyoai data tuple for list of records""" @@ -393,6 +394,10 @@ def listRecords( cursor=paging_cursor ) + batch_size = 50 + if len(results) <= batch_size: + paging_cursor = None + records = [] if results: for result in results: diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index dbca2aa..180171e 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -375,7 +375,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: @@ -388,19 +388,24 @@ def get_metadata_list( db_cursor.execute(records_sql) record_set = db_cursor.fetchmany(config.RESULT_SET_SIZE) + full_count = 0 results = [] for row in record_set: record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + # This is goofy, but full_count isn't always returned for empty results + if int(row[-1]) != 0: + full_count = row[-1] + full_record = assemble_record(record, db, user, password, server, port) if full_record is not None: results.append(build_metadata(full_record)) if cursor is not None: - return results, db_cursor.rowcount, 
(len(record_set) + int(cursor)) + return results, full_count, (len(record_set) + int(cursor)) else: - return results, db_cursor.rowcount, len(record_set) + return results, full_count, len(record_set) def get_metadata(identifier, db, user, password, server, port): From d78621415df3b361dd7fd6061d42a925860f860c Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Fri, 11 Sep 2020 10:52:20 -0700 Subject: [PATCH 51/67] forgot to sideload ftfy dependencies into pipfile lock --- Pipfile.lock | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Pipfile.lock b/Pipfile.lock index 7d9a934..3a08a9b 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -278,6 +278,13 @@ "sha256:6dc65cf9091cf750012f56f2cad759fa9e879f511b5ff8685e456b4e3bf90d16" ], "version": "==1.0.0" + }, + "wcwidth": { + "hashes": [ + "sha256:c4d647b99872929fdb7bdcaa4fbe7f01413ed3d98077df798530e5b04f116c83", + "sha256:beb4802a9cebb9144e99086eff703a642a13d6a0052920003a230f3294bbe784" + ], + "version": "==0.2.5" } }, "develop": { From d11a5e0ddf947c955b8c528ddd3fe85517e6c430 Mon Sep 17 00:00:00 2001 From: Alex Garnett Date: Fri, 11 Sep 2020 11:08:25 -0700 Subject: [PATCH 52/67] revert less than or equals test for batch sizes --- viringo/catalogs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 87664f4..504e21a 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -395,7 +395,7 @@ def listRecords( ) batch_size = 50 - if len(results) <= batch_size: + if len(results) < batch_size: paging_cursor = None records = [] From 92fe12cb7b245bcdac2ac51b90d441a597c5ca5b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 16 Sep 2020 10:41:01 -0700 Subject: [PATCH 53/67] Continue iteration when there are fewer than 50 records per page; only stop when the total_records is exceeded --- viringo/catalogs.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 504e21a..cdc9c4b 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -394,8 +394,7 @@ def listRecords( cursor=paging_cursor ) - batch_size = 50 - if len(results) < batch_size: + if paging_cursor >= total_records: paging_cursor = None records = [] From e2636579f5305e74bc96bd944094ae352b70f900 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Sep 2020 15:48:19 -0700 Subject: [PATCH 54/67] Use ftfy.fix_text in XML for oai_datacite and datacite --- viringo/services/frdr.py | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 180171e..3c626f4 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -7,6 +7,7 @@ import dateutil.tz from viringo import config import xml.etree.cElementTree as ET +import ftfy class Metadata: """Represents a DataCite metadata resultset""" @@ -74,10 +75,10 @@ def construct_datacite_xml(data): identifier = ET.SubElement(resource, "identifier") if "doi.org/" in data['item_url']: identifier.set("identifierType", "DOI") - identifier.text = data['item_url'].split("doi.org/")[1] + identifier.text = ftfy.fix_text(data['item_url'].split("doi.org/")[1]) else: identifier.set("identifierType", "URL") - identifier.text = data['item_url'] + identifier.text = ftfy.fix_text(data['item_url']) # Add creators @@ -85,28 +86,28 @@ def construct_datacite_xml(data): for creator_entry in data['dc:contributor.author']: creator = ET.SubElement(creators, "creator") creatorName = ET.SubElement(creator, "creatorName") - 
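count(*) OVER() above attaches the total number of rows matching the WHERE clause to every row of the page; because window functions are evaluated before OFFSET, each page reports the same complete-list size, which db_cursor.rowcount could not provide consistently once an offset was applied. A hedged sketch of reading it back, assuming a psycopg2 connection and using a trimmed-down column list:

    def fetch_page_with_total(conn, offset=0, size=50):
        """Return one page of record ids plus the total matching-row count."""
        sql = ("SELECT recs.record_id, count(*) OVER() AS full_count "
               "FROM records recs WHERE recs.pub_date != '' "
               "ORDER BY recs.record_id OFFSET %s")
        with conn.cursor() as cur:
            cur.execute(sql, [offset])
            rows = cur.fetchmany(size)
        total = rows[0][-1] if rows else 0   # same value on every row of the page
        return rows, total
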
creatorName.text = creator_entry + creatorName.text = ftfy.fix_text(creator_entry) # Add titles titles = ET.SubElement(resource, "titles") if data['title_en'] != "": title = ET.SubElement(titles, "title") - title.text = data['title_en'] + title.text = ftfy.fix_text(data['title_en']) title.set("xml:lang", "en") if data['title_fr'] != "": title = ET.SubElement(titles, "title") - title.text = data['title_fr'] + title.text = ftfy.fix_text(data['title_fr']) title.set("xml:lang", "fr") if data['title_en'] != "": title.set("titleType", "TranslatedTitle") # Add publisher publisher = ET.SubElement(resource, "publisher") - publisher.text = data['repository_name'] + publisher.text = ftfy.fix_text(data['repository_name']) # Add publication year publicationyear = ET.SubElement(resource, "publicationYear") - publicationyear.text = data['pub_date'][:4] + publicationyear.text = ftfy.fix_text(data['pub_date'][:4]) # Add subjects subject_and_tags = [] @@ -116,25 +117,25 @@ def construct_datacite_xml(data): subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) for subject_entry in data['frdr:category_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) for subject_entry in data['frdr:keywords_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) for subject_entry in data['frdr:keywords_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = subject_entry + subject.text = ftfy.fix_text(subject_entry) # If subjects is empty, remove it if len(subjects) == 0: @@ -146,7 +147,7 @@ def construct_datacite_xml(data): contributor = ET.SubElement(contributors, "contributor") contributor.set("contributorType", "Other") contributorName = ET.SubElement(contributor, "contributorName") - contributorName.text = contributor_entry + contributorName.text = ftfy.fix_text(contributor_entry) # Add FRDR as HostingInstituton contributor_en = ET.SubElement(contributors, "contributor") @@ -164,7 +165,7 @@ def construct_datacite_xml(data): dates = ET.SubElement(resource, "dates") date = ET.SubElement(dates, "date") date.set("dateType", "Issued") - date.text = data['pub_date'] + date.text = ftfy.fix_text(data['pub_date']) # Add resourceType resourceType = ET.SubElement(resource, "resourceType") @@ -175,17 +176,17 @@ def construct_datacite_xml(data): alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") alternateIdentifier.set("alternateIdentifierType", "local") - alternateIdentifier.text = data['local_identifier'] + alternateIdentifier.text = ftfy.fix_text(data['local_identifier']) # Add rightsList rightsList = ET.SubElement(resource, "rightsList") for rights_entry in data['dc:rights']: if rights_entry != '': rights = ET.SubElement(rightsList, "rights") - rights.text = rights_entry + rights.text = ftfy.fix_text(rights_entry) if "http" in rights_entry: 
rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = rights_entry[:rights_entry.find("http")].strip() + rights.text = ftfy.fix_text(rights_entry[:rights_entry.find("http")].strip()) if len(data["frdr:access"]) > 0: for access_entry in data["frdr:access"]: rights = ET.SubElement(rightsList, "rights") @@ -204,19 +205,19 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "en") - description.text = description_entry + description.text = ftfy.fix_text(description_entry) for description_entry in data['dc:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "fr") - description.text = description_entry + description.text = ftfy.fix_text(description_entry) # Add series (series) if data['series'] != "": description_series = ET.SubElement(descriptions, "description") description_series.set("descriptionType", "SeriesInformation") - description_series.text = data['series'] + description_series.text = ftfy.fix_text(data['series']) # If descriptions is empty, remove it if len(descriptions) == 0: From 34a27796963b4dd8b2bca326f28b154b60d06a69 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Sep 2020 16:43:25 -0700 Subject: [PATCH 55/67] helper function for fixing xml to check if none/zero length --- viringo/services/frdr.py | 44 +++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 3c626f4..e5583c9 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -64,6 +64,12 @@ def __init__( self.client = client self.active = active +def xml_fix_text(text): + if isinstance(text, str) and len(text) > 0: + return ftfy.fix_text(text) + else: + return '' + def construct_datacite_xml(data): resource = ET.Element("resource") resource.set("xmlns", "http://datacite.org/schema/kernel-4") @@ -75,10 +81,10 @@ def construct_datacite_xml(data): identifier = ET.SubElement(resource, "identifier") if "doi.org/" in data['item_url']: identifier.set("identifierType", "DOI") - identifier.text = ftfy.fix_text(data['item_url'].split("doi.org/")[1]) + identifier.text = xml_fix_text(data['item_url'].split("doi.org/")[1]) else: identifier.set("identifierType", "URL") - identifier.text = ftfy.fix_text(data['item_url']) + identifier.text = xml_fix_text(data['item_url']) # Add creators @@ -86,28 +92,28 @@ def construct_datacite_xml(data): for creator_entry in data['dc:contributor.author']: creator = ET.SubElement(creators, "creator") creatorName = ET.SubElement(creator, "creatorName") - creatorName.text = ftfy.fix_text(creator_entry) + creatorName.text = xml_fix_text(creator_entry) # Add titles titles = ET.SubElement(resource, "titles") if data['title_en'] != "": title = ET.SubElement(titles, "title") - title.text = ftfy.fix_text(data['title_en']) + title.text = xml_fix_text(data['title_en']) title.set("xml:lang", "en") if data['title_fr'] != "": title = ET.SubElement(titles, "title") - title.text = ftfy.fix_text(data['title_fr']) + title.text = xml_fix_text(data['title_fr']) title.set("xml:lang", "fr") if data['title_en'] != "": title.set("titleType", "TranslatedTitle") # Add publisher publisher = ET.SubElement(resource, "publisher") - publisher.text = ftfy.fix_text(data['repository_name']) + publisher.text = 
xml_fix_text(data['repository_name']) # Add publication year publicationyear = ET.SubElement(resource, "publicationYear") - publicationyear.text = ftfy.fix_text(data['pub_date'][:4]) + publicationyear.text = xml_fix_text(data['pub_date'][:4]) # Add subjects subject_and_tags = [] @@ -117,25 +123,25 @@ def construct_datacite_xml(data): subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) for subject_entry in data['frdr:category_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) for subject_entry in data['frdr:keywords_en']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "en") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) for subject_entry in data['frdr:keywords_fr']: if subject_entry not in subject_and_tags and subject_entry != "": subject_and_tags.append(subject_entry) subject = ET.SubElement(subjects, "subject") subject.set("xml:lang", "fr") - subject.text = ftfy.fix_text(subject_entry) + subject.text = xml_fix_text(subject_entry) # If subjects is empty, remove it if len(subjects) == 0: @@ -147,7 +153,7 @@ def construct_datacite_xml(data): contributor = ET.SubElement(contributors, "contributor") contributor.set("contributorType", "Other") contributorName = ET.SubElement(contributor, "contributorName") - contributorName.text = ftfy.fix_text(contributor_entry) + contributorName.text = xml_fix_text(contributor_entry) # Add FRDR as HostingInstituton contributor_en = ET.SubElement(contributors, "contributor") @@ -165,7 +171,7 @@ def construct_datacite_xml(data): dates = ET.SubElement(resource, "dates") date = ET.SubElement(dates, "date") date.set("dateType", "Issued") - date.text = ftfy.fix_text(data['pub_date']) + date.text = xml_fix_text(data['pub_date']) # Add resourceType resourceType = ET.SubElement(resource, "resourceType") @@ -176,17 +182,17 @@ def construct_datacite_xml(data): alternateIdentifiers = ET.SubElement(resource, "alternateIdentifiers") alternateIdentifier = ET.SubElement(alternateIdentifiers, "alternateIdentifier") alternateIdentifier.set("alternateIdentifierType", "local") - alternateIdentifier.text = ftfy.fix_text(data['local_identifier']) + alternateIdentifier.text = xml_fix_text(data['local_identifier']) # Add rightsList rightsList = ET.SubElement(resource, "rightsList") for rights_entry in data['dc:rights']: if rights_entry != '': rights = ET.SubElement(rightsList, "rights") - rights.text = ftfy.fix_text(rights_entry) + rights.text = xml_fix_text(rights_entry) if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) - rights.text = ftfy.fix_text(rights_entry[:rights_entry.find("http")].strip()) + rights.text = xml_fix_text(rights_entry[:rights_entry.find("http")].strip()) if len(data["frdr:access"]) > 0: for access_entry in data["frdr:access"]: rights = ET.SubElement(rightsList, "rights") @@ -205,19 +211,19 @@ def construct_datacite_xml(data): description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "en") - 
description.text = ftfy.fix_text(description_entry) + description.text = xml_fix_text(description_entry) for description_entry in data['dc:description_fr']: if description_entry != "": description = ET.SubElement(descriptions, "description") description.set("descriptionType", "Abstract") description.set("xml:lang", "fr") - description.text = ftfy.fix_text(description_entry) + description.text = xml_fix_text(description_entry) # Add series (series) if data['series'] != "": description_series = ET.SubElement(descriptions, "description") description_series.set("descriptionType", "SeriesInformation") - description_series.text = ftfy.fix_text(data['series']) + description_series.text = xml_fix_text(data['series']) # If descriptions is empty, remove it if len(descriptions) == 0: From dafcbad7bd5aaa48dd92d0ab10ff251064320f95 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Wed, 23 Sep 2020 14:51:28 -0700 Subject: [PATCH 56/67] Add openAccess or restrictedAccess flag to oai_dc for Primo --- viringo/services/frdr.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index e5583c9..9bfd918 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -275,6 +275,20 @@ def build_metadata(data): result.client = data['repo_oai_name'] result.active = True + # Add openAccess or restrictedAccess indicator to dc:rights + if len(data["frdr:access"]) > 0: + for access_entry in data["frdr:access"]: + # If Public in frdr:access, use openAccess + if access_entry == "Public": + result.rights.append("openAccess") + break + if "openAccess" not in result.rights: + # If there are access values and none are Public, use restrictedAccess + result.rights.append("restrictedAccess") + else: + # If not indicated, assume Public/openAccess + result.rights.append("openAccess") + return result From ec2f12ba007a703b3d7445664635e05df5a838be Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 10:52:03 -0700 Subject: [PATCH 57/67] Replace form feed chars (\x0c) with space --- viringo/metadata.py | 11 +++++++++-- viringo/services/frdr.py | 1 + 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/viringo/metadata.py b/viringo/metadata.py index ba195ee..f8f323b 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -48,7 +48,11 @@ def nsdc(name): value = value[0] new_element = etree.SubElement(e_dc, nsdc(name)) if value is not None: - new_element.text = ftfy.fix_text(value) + try: + value = value.replace('\x0c', " ") + new_element.text = ftfy.fix_text(value) + except: + print(value) else: new_element.text = '' @@ -66,7 +70,10 @@ def oai_datacite_writer(element: etree.Element, metadata): _map = metadata.getMap() raw_xml = _map.get('xml', '') - xml_resource_element = etree.fromstring(raw_xml) + try: + xml_resource_element = etree.fromstring(raw_xml) + except: + print(raw_xml) e_oai_datacite = etree.SubElement( element, "oai_datacite", {'xmlns': 'http://schema.datacite.org/oai/oai-1.1/'}, diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index 9bfd918..b2478e7 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -66,6 +66,7 @@ def __init__( def xml_fix_text(text): if isinstance(text, str) and len(text) > 0: + text = text.replace('\x0c', " ") return ftfy.fix_text(text) else: return '' From edd4fe0958a4244c05ff8fb3afffddcb82e54eef Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 17:04:54 -0700 Subject: [PATCH 58/67] Use repository_name for publisher field in oai_dc (matches 
oai_datacite) --- viringo/services/frdr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index b2478e7..ee8ddb5 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -260,7 +260,7 @@ def build_metadata(data): result.subjects.append(subject) result.descriptions = data['dc:description_en'] + data['dc:description_fr'] - result.publisher = data['dc:publisher'] + result.publisher = data['repository_name'] result.publication_year = dateutil.parser.parse(data['pub_date']).year result.dates = [data['pub_date']] result.contributors = data['dc:contributor'] From 1404d231ab857c830915fcec49d84d2f3190e7c7 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 17:06:28 -0700 Subject: [PATCH 59/67] Ensure that only strings are passed to ftfy.fix_text and catch exceptions --- viringo/metadata.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/viringo/metadata.py b/viringo/metadata.py index f8f323b..5c2f6d0 100644 --- a/viringo/metadata.py +++ b/viringo/metadata.py @@ -44,15 +44,18 @@ def nsdc(name): ]: for value in _map.get(name, []): if value: - if isinstance(value, list) and len(value) == 1: - value = value[0] + if isinstance(value, list): + if len(value) == 1: + value = value[0] + else: + value = str(value) new_element = etree.SubElement(e_dc, nsdc(name)) - if value is not None: + if isinstance(value, str): try: value = value.replace('\x0c', " ") new_element.text = ftfy.fix_text(value) except: - print(value) + new_element.text = '' else: new_element.text = '' From 7e6ac1231c8db2f566d084f2ccbf8fbafd93c0c5 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 6 Oct 2020 17:08:56 -0700 Subject: [PATCH 60/67] Exclude deleted records and records without item_url from selection --- viringo/services/frdr.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ee8ddb5..ffaf422 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -397,7 +397,12 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos WHERE recs.repository_id = repos.repository_id AND recs.pub_date != ''""" + records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, + repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, + repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos + WHERE recs.repository_id = repos.repository_id AND recs.deleted!=1 AND recs.item_url!='' AND + recs.pub_date != ''""" if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: From 2a84e4ea2a8d0dd57bd703fdaaf131789bcd2448 Mon Sep 17 00:00:00 
2001 From: Kelly Stathis Date: Mon, 9 Nov 2020 08:57:59 -0800 Subject: [PATCH 61/67] Only set one info:eu-repo/semantics access statement (still needs testing) --- viringo/services/frdr.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ffaf422..cb68956 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -194,15 +194,19 @@ def construct_datacite_xml(data): if "http" in rights_entry: rights.set("rightsURI", rights_entry[rights_entry.find("http"):].strip()) rights.text = xml_fix_text(rights_entry[:rights_entry.find("http")].strip()) + # Add access statement + rights = ET.SubElement(rightsList, "rights") if len(data["frdr:access"]) > 0: for access_entry in data["frdr:access"]: - rights = ET.SubElement(rightsList, "rights") + # If Public in frdr:access, use openAccess if access_entry == "Public": rights.set("rightsURI", "info:eu-repo/semantics/openAccess") - else: - rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") - else: # Assume Public/openAccess - rights = ET.SubElement(rightsList, "rights") + break + if "rightsURI" not in rights.attrib: + # If there are access values and none are Public, use restrictedAccess + rights.set("rightsURI", "info:eu-repo/semantics/restrictedAccess") + else: + # If not indicated, assume Public/openAccess rights.set("rightsURI", "info:eu-repo/semantics/openAccess") # Add description(s) From 1cc56bd2cab9fd786445c781de1388a1986e4cfc Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 15 Dec 2020 14:52:56 -0800 Subject: [PATCH 62/67] Parse pub_date to YYYY-MM-DD format from datetime --- viringo/services/frdr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index cb68956..ea69699 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -410,9 +410,9 @@ def get_metadata_list( if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: - records_sql = records_sql + " AND recs.pub_date>='" + from_datetime + "'" + records_sql = records_sql + " AND recs.pub_date>='" + from_datetime.strftime('%Y-%M-%D') + "'" if until_datetime is not None: - records_sql = records_sql + " AND recs.pub_date<'" + until_datetime + "'" + records_sql = records_sql + " AND recs.pub_date<'" + until_datetime.strftime('%Y-%M-%D') + "'" records_sql = records_sql + " ORDER BY recs.record_id" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor From 75df50669a567bf1e063ee1863dba0b2bd36752b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 15 Dec 2020 14:59:39 -0800 Subject: [PATCH 63/67] Switch from pub_date to upstream_modified_timestamp --- viringo/services/frdr.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ea69699..ad0a19f 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -410,9 +410,11 @@ def get_metadata_list( if set is not None and set != 'openaire_data': records_sql = records_sql + " AND (repos.repo_oai_name='" + set + "')" if from_datetime is not None: - records_sql = records_sql + " AND recs.pub_date>='" + from_datetime.strftime('%Y-%M-%D') + "'" + from_timestamp = int(datetime.timestamp(from_datetime)) + records_sql = records_sql + " AND recs.upstream_modified_timestamp>=" + str(from_timestamp) if until_datetime is not None: - records_sql = records_sql + " 
AND recs.pub_date<'" + until_datetime.strftime('%Y-%M-%D') + "'" + until_timestamp = int(datetime.timestamp(until_datetime)) + records_sql = records_sql + " AND recs.upstream_modified_timestamp<" + str(until_timestamp) records_sql = records_sql + " ORDER BY recs.record_id" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor From 30b9a1a75dca12260efdbb9804f4f6e3f72d9062 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Dec 2020 18:18:51 -0800 Subject: [PATCH 64/67] Add GeoLocation metadata to oai_datacite format --- viringo/services/frdr.py | 90 +++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 19 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ad0a19f..ec09579 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -234,6 +234,41 @@ def construct_datacite_xml(data): if len(descriptions) == 0: resource.remove(descriptions) + # Add GeoLocation + geolocations = ET.SubElement(resource, "geoLocations") + if "geoLocationBox" in data["datacite_geoLocation"]: + for geobbox in data["datacite_geoLocation"]["geoLocationBox"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geolocationBox = ET.SubElement(geolocation, "geolocationBox") + geolocationBox.text = xml_fix_text(str(geobbox["southBoundLatitude"]) + " " + str(geobbox["westBoundLongitude"]) + " " + + str(geobbox["northBoundLatitude"]) + " " + str(geobbox["eastBoundLongitude"])) + + if "geoLocationPoint" in data["datacite_geoLocation"]: + for geopoint in data["datacite_geoLocation"]["geoLocationPoint"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geoLocationPoint = ET.SubElement(geolocation, "geoLocationPoint") + geoLocationPoint.text = xml_fix_text(str(geopoint["pointLatitude"]) + " " + str(geopoint["pointLongitude"])) + + if "geoLocationPlace" in data["datacite_geoLocation"]: + for geoplace in data["datacite_geoLocation"]["geoLocationPlace"]: + geolocation = ET.SubElement(geolocations, "geoLocation") + geoLocationPlace = ET.SubElement(geolocation, "geoLocationPlace") + components = [] + if geoplace["place_name"]: + components.append(geoplace["place_name"]) + if geoplace["additional"]: + components.append(geoplace["additional"]) + if geoplace["city"]: + components.append(geoplace["city"]) + if geoplace["province_state"]: + components.append(geoplace["province_state"]) + if geoplace["country"]: + components.append(geoplace["country"]) + geoLocationPlace.text = xml_fix_text("; ".join(components)) + + if len(geolocations) == 0: + resource.remove(geolocations) + xml_string = ET.tostring(resource) return xml_string @@ -270,7 +305,7 @@ def build_metadata(data): result.contributors = data['dc:contributor'] result.funding_references = '' result.sizes = [] - result.geo_locations = data['frdr:geospatial'] + result.geo_locations = [] result.resource_types = ['Dataset'] result.formats = [] result.identifiers = [data['item_url']] @@ -322,24 +357,41 @@ def assemble_record(record, db, user, password, server, port): con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with con: - lookup_cur = con.cursor(cursor_factory=None) - - lookup_cur.execute("SELECT coordinate_type, lat, lon FROM geospatial WHERE record_id=%s", [record["record_id"]]) - geodata = lookup_cur.fetchall() - record["frdr:geospatial"] = [] - polycoordinates = [] - - try: - for coordinate in geodata: - if coordinate[0] == "Polygon": - polycoordinates.append([float(coordinate[1]), 
float(coordinate[2])]) - else: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": coordinate[0], "frdr:geometry_coordinates": [float(coordinate[1]), float(coordinate[2])]}}) - except: - pass - - if polycoordinates: - record["frdr:geospatial"].append({"frdr:geospatial_type": "Feature", "frdr:geospatial_geometry": {"frdr:geometry_type": "Polygon", "frdr:geometry_coordinates": polycoordinates}}) + from psycopg2.extras import DictCursor + lookup_cur = con.cursor(cursor_factory=DictCursor) + + record["datacite_geoLocation"] = {} + lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, geobbox.northLat, geobbox.southLat + FROM geobbox WHERE geobbox.record_id=%s""", [record["record_id"]]) + geobboxes = lookup_cur.fetchall() + if len(geobboxes) > 0: + record["datacite_geoLocation"]["geoLocationBox"] = [] + for geobbox in geobboxes: + record["datacite_geoLocation"]["geoLocationBox"].append({"westBoundLongitude": geobbox["westlon"], + "eastBoundLongitude": geobbox["eastlon"], + "northBoundLatitude": geobbox["northlat"], + "southBoundLatitude": geobbox["southlat"]}) + lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_id=%s""", + [record["record_id"]]) + geopoints = lookup_cur.fetchall() + if len(geopoints) > 0: + record["datacite_geoLocation"]["geoLocationPoint"] = [] + for geopoint in geopoints: + record["datacite_geoLocation"]["geoLocationPoint"].append({"pointLatitude": geopoint["lat"], + "pointLongitude": geopoint["lon"]}) + + lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name + FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id + WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) + geoplaces = lookup_cur.fetchall() + if len(geoplaces) > 0: + record["datacite_geoLocation"]["geoLocationPlace"] = [] + for geoplace in geoplaces: + record["datacite_geoLocation"]["geoLocationPlace"].append({"country": geoplace["country"], + "province_state": geoplace["province_state"], + "city": geoplace["city"], + "additional": geoplace["other"], + "place_name": geoplace["place_name"]}) with con: from psycopg2.extras import DictCursor From e47e3554977d30c517abec28e94f6b1e3a27c81b Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Mon, 21 Dec 2020 18:25:50 -0800 Subject: [PATCH 65/67] Use the same dict cursor throughout; comments and spacing --- viringo/services/frdr.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ec09579..ce9a6e5 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -242,13 +242,11 @@ def construct_datacite_xml(data): geolocationBox = ET.SubElement(geolocation, "geolocationBox") geolocationBox.text = xml_fix_text(str(geobbox["southBoundLatitude"]) + " " + str(geobbox["westBoundLongitude"]) + " " + str(geobbox["northBoundLatitude"]) + " " + str(geobbox["eastBoundLongitude"])) - if "geoLocationPoint" in data["datacite_geoLocation"]: for geopoint in data["datacite_geoLocation"]["geoLocationPoint"]: geolocation = ET.SubElement(geolocations, "geoLocation") geoLocationPoint = ET.SubElement(geolocation, "geoLocationPoint") geoLocationPoint.text = xml_fix_text(str(geopoint["pointLatitude"]) + " " + str(geopoint["pointLongitude"])) - if "geoLocationPlace" in data["datacite_geoLocation"]: for geoplace in 
data["datacite_geoLocation"]["geoLocationPlace"]: geolocation = ET.SubElement(geolocations, "geoLocation") @@ -264,8 +262,10 @@ def construct_datacite_xml(data): components.append(geoplace["province_state"]) if geoplace["country"]: components.append(geoplace["country"]) + # Combine all components of the place name separated by "; " geoLocationPlace.text = xml_fix_text("; ".join(components)) + # If geolocations is empty, remove it if len(geolocations) == 0: resource.remove(geolocations) @@ -360,6 +360,7 @@ def assemble_record(record, db, user, password, server, port): from psycopg2.extras import DictCursor lookup_cur = con.cursor(cursor_factory=DictCursor) + # get geolocation metadata record["datacite_geoLocation"] = {} lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, geobbox.northLat, geobbox.southLat FROM geobbox WHERE geobbox.record_id=%s""", [record["record_id"]]) @@ -382,20 +383,16 @@ def assemble_record(record, db, user, password, server, port): lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id - WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) + WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) geoplaces = lookup_cur.fetchall() if len(geoplaces) > 0: record["datacite_geoLocation"]["geoLocationPlace"] = [] for geoplace in geoplaces: record["datacite_geoLocation"]["geoLocationPlace"].append({"country": geoplace["country"], - "province_state": geoplace["province_state"], - "city": geoplace["city"], - "additional": geoplace["other"], - "place_name": geoplace["place_name"]}) - - with con: - from psycopg2.extras import DictCursor - lookup_cur = con.cursor(cursor_factory=DictCursor) + "province_state": geoplace["province_state"], + "city": geoplace["city"], + "additional": geoplace["other"], + "place_name": geoplace["place_name"]}) # attach the other values to the dict lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_id"]]) From 44b3035a374c7c53b8077f6061402d9fdf595450 Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Thu, 11 Mar 2021 14:47:59 -0800 Subject: [PATCH 66/67] Stop ListIdentifiers iteration when total records is exceeded --- viringo/catalogs.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/viringo/catalogs.py b/viringo/catalogs.py index fad0bd8..78729aa 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -448,6 +448,9 @@ def listIdentifiers( cursor=paging_cursor ) + if paging_cursor >= total_records: + paging_cursor = None + records = [] if results: for result in results: From b9d5663f7904b33c03e2ddae61b8674d040e24bb Mon Sep 17 00:00:00 2001 From: Kelly Stathis Date: Tue, 4 Jan 2022 11:36:25 -0800 Subject: [PATCH 67/67] Change record_id to record_uuid --- viringo/services/frdr.py | 42 ++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/viringo/services/frdr.py b/viringo/services/frdr.py index ce9a6e5..eea4346 100644 --- a/viringo/services/frdr.py +++ b/viringo/services/frdr.py @@ -363,7 +363,7 @@ def assemble_record(record, db, user, password, server, port): # get geolocation metadata record["datacite_geoLocation"] = {} lookup_cur.execute("""SELECT geobbox.westLon, geobbox.eastLon, 
geobbox.northLat, geobbox.southLat - FROM geobbox WHERE geobbox.record_id=%s""", [record["record_id"]]) + FROM geobbox WHERE geobbox.record_uuid=%s""", [record["record_uuid"]]) geobboxes = lookup_cur.fetchall() if len(geobboxes) > 0: record["datacite_geoLocation"]["geoLocationBox"] = [] @@ -372,8 +372,8 @@ def assemble_record(record, db, user, password, server, port): "eastBoundLongitude": geobbox["eastlon"], "northBoundLatitude": geobbox["northlat"], "southBoundLatitude": geobbox["southlat"]}) - lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_id=%s""", - [record["record_id"]]) + lookup_cur.execute("""SELECT geopoint.lat, geopoint.lon FROM geopoint WHERE geopoint.record_uuid=%s""", + [record["record_uuid"]]) geopoints = lookup_cur.fetchall() if len(geopoints) > 0: record["datacite_geoLocation"]["geoLocationPoint"] = [] @@ -383,7 +383,7 @@ def assemble_record(record, db, user, password, server, port): lookup_cur.execute("""SELECT geoplace.country, geoplace.province_state, geoplace.city, geoplace.other, geoplace.place_name FROM geoplace JOIN records_x_geoplace on records_x_geoplace.geoplace_id = geoplace.geoplace_id - WHERE records_x_geoplace.record_id=%s""", [record["record_id"]]) + WHERE records_x_geoplace.record_uuid=%s""", [record["record_uuid"]]) geoplaces = lookup_cur.fetchall() if len(geoplaces) > 0: record["datacite_geoLocation"]["geoLocationPlace"] = [] @@ -395,40 +395,40 @@ def assemble_record(record, db, user, password, server, port): "place_name": geoplace["place_name"]}) # attach the other values to the dict - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_id"]]) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_uuid=%s AND records_x_creators.is_contributor=0 order by records_x_creators_id asc""", [record["record_uuid"]]) record["dc:contributor.author"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_id=%s""", [record["record_id"]]) + lookup_cur.execute("""SELECT affiliations.affiliation FROM affiliations JOIN records_x_affiliations on records_x_affiliations.affiliation_id = affiliations.affiliation_id WHERE records_x_affiliations.record_uuid=%s""", [record["record_uuid"]]) record["datacite:creatorAffiliation"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_id=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_id"]]) + lookup_cur.execute("""SELECT creators.creator FROM creators JOIN records_x_creators on records_x_creators.creator_id = creators.creator_id WHERE records_x_creators.record_uuid=%s AND records_x_creators.is_contributor=1 order by records_x_creators_id asc""", [record["record_uuid"]]) record["dc:contributor"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE 
records_x_subjects.record_id=%s and subjects.language = 'en' """, [record["record_id"]]) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_uuid=%s and subjects.language = 'en' """, [record["record_uuid"]]) record["frdr:category_en"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_id=%s and subjects.language = 'fr' """, [record["record_id"]]) + lookup_cur.execute("""SELECT subjects.subject FROM subjects JOIN records_x_subjects on records_x_subjects.subject_id = subjects.subject_id WHERE records_x_subjects.record_uuid=%s and subjects.language = 'fr' """, [record["record_uuid"]]) record["frdr:category_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_id=%s""", [record["record_id"]]) + lookup_cur.execute("""SELECT publishers.publisher FROM publishers JOIN records_x_publishers on records_x_publishers.publisher_id = publishers.publisher_id WHERE records_x_publishers.record_uuid=%s""", [record["record_uuid"]]) record["dc:publisher"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_id=%s""", [record["record_id"]]) + lookup_cur.execute("""SELECT rights.rights FROM rights JOIN records_x_rights on records_x_rights.rights_id = rights.rights_id WHERE records_x_rights.record_uuid=%s""", [record["record_uuid"]]) record["dc:rights"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='en' ", [record["record_id"]]) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_uuid=%s and language='en' ", [record["record_uuid"]]) record["dc:description_en"] = rows_to_dict(lookup_cur) - lookup_cur.execute("SELECT description FROM descriptions WHERE record_id=%s and language='fr' ", [record["record_id"]]) + lookup_cur.execute("SELECT description FROM descriptions WHERE record_uuid=%s and language='fr' ", [record["record_uuid"]]) record["dc:description_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'en' """, [record["record_id"]]) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_uuid=%s and tags.language = 'en' """, [record["record_uuid"]]) record["frdr:keywords_en"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_id=%s and tags.language = 'fr' """, [record["record_id"]]) + lookup_cur.execute("""SELECT tags.tag FROM tags JOIN records_x_tags on records_x_tags.tag_id = tags.tag_id WHERE records_x_tags.record_uuid=%s and tags.language = 'fr' """, [record["record_uuid"]]) record["frdr:keywords_fr"] = rows_to_dict(lookup_cur) - lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_id=%s""", [record["record_id"]]) + 
lookup_cur.execute("""SELECT access.access FROM access JOIN records_x_access on records_x_access.access_id = access.access_id WHERE records_x_access.record_uuid=%s""", [record["record_uuid"]]) record["frdr:access"] = rows_to_dict(lookup_cur) return record @@ -450,7 +450,7 @@ def get_metadata_list( records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: db_cursor = records_con.cursor() - records_sql = """SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + records_sql = """SELECT recs.record_uuid, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name, count(*) OVER() AS full_count FROM records recs, repositories repos @@ -464,7 +464,7 @@ def get_metadata_list( if until_datetime is not None: until_timestamp = int(datetime.timestamp(until_datetime)) records_sql = records_sql + " AND recs.upstream_modified_timestamp<" + str(until_timestamp) - records_sql = records_sql + " ORDER BY recs.record_id" + records_sql = records_sql + " ORDER BY recs.record_uuid" if cursor is not None: records_sql = records_sql + " OFFSET " + cursor db_cursor.execute(records_sql) @@ -474,7 +474,7 @@ def get_metadata_list( results = [] for row in record_set: - record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_uuid', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) # This is goofy, but full_count isn't always returned for empty results if int(row[-1]) != 0: @@ -497,7 +497,7 @@ def get_metadata(identifier, db, user, password, server, port): records_con = psycopg2.connect("dbname='%s' user='%s' password='%s' host='%s' port='%s'" % (db, user, password, server, port)) with records_con: records_cursor = records_con.cursor() - records_sql = ("""SELECT recs.record_id, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, + records_sql = ("""SELECT recs.record_uuid, recs.title, recs.title_fr, recs.pub_date, recs.series, recs.source_url, recs.item_url, recs.deleted, recs.local_identifier, recs.modified_timestamp, repos.repository_url, repos.repository_name, repos.repository_thumbnail, repos.item_url_pattern, repos.last_crawl_timestamp, repos.homepage_url, repos.repo_oai_name FROM records recs, repositories repos @@ -505,7 +505,7 @@ def get_metadata(identifier, db, user, password, server, port): AND recs.local_identifier =\'""" + local_identifier + "\'" + "AND repos.repo_oai_name=\'""" + namespace + "\'") records_cursor.execute(records_sql) row = records_cursor.fetchone() - record = (dict(zip(['record_id', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 
'homepage_url', 'repo_oai_name'], row))) + record = (dict(zip(['record_uuid', 'title_en', 'title_fr', 'pub_date', 'series', 'source_url', 'item_url', 'deleted', 'local_identifier', 'modified_timestamp', 'repository_url', 'repository_name', 'repository_thumbnail', 'item_url_pattern', 'last_crawl_timestamp', 'homepage_url', 'repo_oai_name'], row))) full_record = assemble_record(record, db, user, password, server, port) return build_metadata(full_record)
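
Two pieces make the paging fix in patches 50 and 53 hang together: count(*) OVER() AS full_count reports the total number of matching rows on every row returned, and the catalog clears the resumption cursor once the running offset reaches that total. Below is a minimal sketch of the same pattern using parameterized queries rather than the string concatenation in the patch; fetch_page and conn_params are illustrative names, not part of the project.

    import psycopg2

    def fetch_page(conn_params, batch_size, cursor_offset=0):
        """Return (rows, full_count, next_cursor) for one page of records (sketch)."""
        records_sql = """
            SELECT recs.record_uuid, recs.title, recs.modified_timestamp,
                   count(*) OVER() AS full_count  -- total rows matching the WHERE clause
            FROM records recs
            WHERE recs.pub_date != ''
            ORDER BY recs.record_uuid
            OFFSET %s
        """
        conn = psycopg2.connect(**conn_params)
        try:
            with conn.cursor() as cur:
                cur.execute(records_sql, (cursor_offset,))
                rows = cur.fetchmany(batch_size)
        finally:
            conn.close()

        full_count = rows[0][-1] if rows else 0   # full_count is absent when nothing matches
        next_cursor = cursor_offset + len(rows)
        if next_cursor >= full_count:             # end-of-list test, as in patch 53
            next_cursor = None
        return rows, full_count, next_cursor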
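Patches 54, 55 and 57 converge on one helper: only pass real, non-empty strings to ftfy.fix_text, and replace form-feed characters (illegal in XML 1.0) before serializing. A standalone sketch of that helper and how it is applied to an ElementTree text node; the example input string is invented for illustration.

    import xml.etree.cElementTree as ET
    import ftfy

    def xml_fix_text(text):
        """Return text safe for an XML element: mojibake repaired, form feeds removed."""
        if isinstance(text, str) and text:
            # \x0c is not a legal XML 1.0 character, so swap it for a space
            # before letting ftfy repair any mojibake.
            return ftfy.fix_text(text.replace('\x0c', ' '))
        return ''

    # Usage: guard every .text assignment the same way the patch does.
    title = ET.Element("title")
    title.text = xml_fix_text("CafÃ©\x0cstudy")   # mojibake fixed, form feed replaced
    print(ET.tostring(title).decode())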
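Patches 56 and 61 apply the same decision twice, once for dc:rights and once for the datacite rightsURI: "Public" anywhere in frdr:access means openAccess, any other non-empty access values mean restrictedAccess, and an empty list defaults to openAccess. Factored out, the rule looks roughly like this (the function name is illustrative, not the project's):

    def access_uri(access_values):
        """Map FRDR access values to an info:eu-repo semantics URI (sketch)."""
        if not access_values:
            # No access statement recorded: assume the item is public.
            return "info:eu-repo/semantics/openAccess"
        if "Public" in access_values:
            return "info:eu-repo/semantics/openAccess"
        return "info:eu-repo/semantics/restrictedAccess"

    assert access_uri([]) == "info:eu-repo/semantics/openAccess"
    assert access_uri(["Public", "Embargoed"]) == "info:eu-repo/semantics/openAccess"
    assert access_uri(["Restricted"]) == "info:eu-repo/semantics/restrictedAccess"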
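Patch 63 switches date filtering from string comparison on pub_date to integer comparison on upstream_modified_timestamp, so the OAI from/until datetimes must be converted to Unix epoch seconds first. A hedged sketch of that conversion, returning parameterized WHERE fragments instead of concatenating the values into the SQL string; the helper name is mine.

    from datetime import datetime, timezone

    def timestamp_bounds(from_datetime=None, until_datetime=None):
        """Return (clauses, params) for filtering an epoch-seconds column (sketch)."""
        clauses, params = [], []
        if from_datetime is not None:
            clauses.append("recs.upstream_modified_timestamp >= %s")
            params.append(int(from_datetime.timestamp()))
        if until_datetime is not None:
            clauses.append("recs.upstream_modified_timestamp < %s")
            params.append(int(until_datetime.timestamp()))
        return clauses, params

    clauses, params = timestamp_bounds(from_datetime=datetime(2020, 9, 1, tzinfo=timezone.utc))
    print(clauses, params)   # ['recs.upstream_modified_timestamp >= %s'] [1598918400]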
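Patch 64 flattens the bbox, point and place lookups into DataCite geoLocation elements, with a bounding box serialized as four space-separated coordinates in the order south, west, north, east. A minimal ElementTree sketch of that serialization; the input dict shape follows the patch, but this is not the project's function.

    import xml.etree.cElementTree as ET

    def add_geolocation_box(geolocations, box):
        """Append one <geoLocation><geoLocationBox> under an existing <geoLocations>."""
        geolocation = ET.SubElement(geolocations, "geoLocation")
        geo_box = ET.SubElement(geolocation, "geoLocationBox")
        # Order used in the patch: south lat, west lon, north lat, east lon.
        geo_box.text = " ".join(str(box[k]) for k in (
            "southBoundLatitude", "westBoundLongitude",
            "northBoundLatitude", "eastBoundLongitude"))
        return geolocation

    geolocations = ET.Element("geoLocations")
    add_geolocation_box(geolocations, {
        "southBoundLatitude": 49.0, "westBoundLongitude": -123.3,
        "northBoundLatitude": 49.4, "eastBoundLongitude": -122.5})
    print(ET.tostring(geolocations).decode())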