diff --git a/cds_migrator_kit/errors.py b/cds_migrator_kit/errors.py
index 3d28a338..ba6d2d4c 100644
--- a/cds_migrator_kit/errors.py
+++ b/cds_migrator_kit/errors.py
@@ -36,7 +36,7 @@ def __init__(
         self.exc = exc
         self.message = message
         self.priority = priority
-        super(CDSMigrationException, self).__init__(*args)
+        super(CDSMigrationException, self).__init__(message, *args)
 
 
 class RecordModelMissing(CDSMigrationException):
diff --git a/cds_migrator_kit/rdm/README.md b/cds_migrator_kit/rdm/README.md
index d580110d..0d5371fc 100644
--- a/cds_migrator_kit/rdm/README.md
+++ b/cds_migrator_kit/rdm/README.md
@@ -6,6 +6,8 @@
 
 https://gitlab.cern.ch/cds-team/production_scripts/-/blob/master/cds-rdm/migration/dump_users.py?ref_type=heads
 
+
+
 ## Dump a subset of records
 
 on legacy on webnode: `cds-migration-01`
diff --git a/cds_migrator_kit/rdm/cli.py b/cds_migrator_kit/rdm/cli.py
index 21103645..bcdf811d 100644
--- a/cds_migrator_kit/rdm/cli.py
+++ b/cds_migrator_kit/rdm/cli.py
@@ -59,8 +59,18 @@ def migration():
     "--keep-logs",
     is_flag=True,
 )
+@click.option(
+    "--workers",
+    type=int,
+    default=None,
+    help=(
+        "Number of threads for parallel record transformation. "
+        "Defaults to sequential (no threads). "
+        "Can also be set per-collection in streams.yaml under transform.workers."
+    ),
+)
 @with_appcontext
-def run(collection, dry_run=False, keep_logs=False):
+def run(collection, dry_run=False, keep_logs=False, workers=None):
     """Run."""
     stream_config = current_app.config["CDS_MIGRATOR_KIT_STREAM_CONFIG"]
     runner = Runner(
@@ -70,7 +80,9 @@ def run(collection, dry_run=False, keep_logs=False):
         dry_run=dry_run,
         collection=collection,
         keep_logs=keep_logs,
+        workers=workers,
     )
+
     runner.run()
diff --git a/cds_migrator_kit/rdm/data/courier/duplicated_pids.json b/cds_migrator_kit/rdm/data/courier/duplicated_pids.json
deleted file mode 100644
index 3c6fcce2..00000000
--- a/cds_migrator_kit/rdm/data/courier/duplicated_pids.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "1514615": 1734891,
-  "1631096": 1734905,
-  "1517538": 1734919,
-  "1537017": 1734933,
-  "1563198": 1734975,
-  "1603700": 1735007,
-  "1613779": 1735023,
-  "1544352": 1734947,
-  "1452195": 1734665,
-  "1550751": 1734960,
-  "1595531": 1734993
-}
diff --git a/cds_migrator_kit/rdm/data/hr/duplicated_pids.json b/cds_migrator_kit/rdm/data/hr/duplicated_pids.json
deleted file mode 100644
index 0967ef42..00000000
--- a/cds_migrator_kit/rdm/data/hr/duplicated_pids.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
diff --git a/cds_migrator_kit/rdm/data/it_dep/duplicated_pids.json b/cds_migrator_kit/rdm/data/it_dep/duplicated_pids.json
deleted file mode 100644
index 28f9789f..00000000
--- a/cds_migrator_kit/rdm/data/it_dep/duplicated_pids.json
+++ /dev/null
@@ -1,3 +0,0 @@
-{
-  "002828388": 378023
-}
diff --git a/cds_migrator_kit/rdm/data/thesis/.gitkeep b/cds_migrator_kit/rdm/data/thesis/.gitkeep
deleted file mode 100644
index e69de29b..00000000
diff --git a/cds_migrator_kit/rdm/data/thesis/README.md b/cds_migrator_kit/rdm/data/thesis/README.md
deleted file mode 100644
index 8e3dd41b..00000000
--- a/cds_migrator_kit/rdm/data/thesis/README.md
+++ /dev/null
@@ -1,63 +0,0 @@
-## Dump users
-
-! Attention If you need to dump the users from legacy DB or you need to process the people collection
-
-https://gitlab.cern.ch/cds-team/production_scripts/-/blob/master/cds-rdm/migration/dump_users.py?ref_type=heads
-
-(creates active_users.json and missing_users.json)
-
-## Dump latest people collection
-```
-inveniomigrator dump records -q '980__:"AUTHORITY" 980__:"PEOPLE"' --file-prefix peoples --chunk-size=1000
-```
-
-extract info from people collection (creates people.csv)
-```
-invenio migration users people-run --filepath cds_migrator_kit/rdm/data/users/people.csv --dirpath cds_migrator_kit/rdm/data/users/dump
-```
-
-add missing accounts (uses missing_users.json and people.csv)
-```
-invenio migration users submitters-run --dirpath /Users/kprzerwa/INVENIO/cds-migrator-kit/cds_migrator_kit/rdm/data/thesis/dump
-```
-
-first creates latest dump
-
-```bash
-inveniomigrator dump records -q '980:THESIS -980:DELETED -980:HIDDEN -980__c:MIGRATED -980__a:DUMMY' --file-prefix thesis --chunk-size=1000
-
-```
-
-
-
-
-
-invenio rdm-records add-to-fixture programmes
-invenio rdm-records add-to-fixture awards
-invenio rdm-records custom-fields init
-
-
-
-0-1. Adapt ILS to consume thesis
-0. Push all thesis to ILS
-1. Run affiliations
-2. Run users
-3. Run duplicates (and 981__b) mergers
-   3.1. add duplicated_pids.json file
-4. Identify UDC records
-5. Identify records with relations (2)
-6. irecords with comments, migrate comments
-
-
-next deployment
-
-change branch installed in migrator-kit from feature to master
-
-1. on worker pod
-2.
-invenio rdm-records add-to-fixture programmes
-invenio rdm-records add-to-fixture awards
-
-2. both on migration and worker pod
-invenio rdm-records custom-fields init
-invenio communities custom-fields init
diff --git a/cds_migrator_kit/rdm/data/thesis/duplicated_pids.json b/cds_migrator_kit/rdm/data/thesis/duplicated_pids.json
deleted file mode 100644
index 6ae87e65..00000000
--- a/cds_migrator_kit/rdm/data/thesis/duplicated_pids.json
+++ /dev/null
@@ -1,170 +0,0 @@
-{
-  "497102": 180909,
-  "478526": 1088337,
-  "467164": 1088337,
-  "1023608": 1020207,
-  "394030": 394754,
-  "1614506": 401686,
-  "784152": 461129,
-  "461500": 537176,
-  "2495252": 808056,
-  "2474408": 784079,
-  "1537906": 979591,
-  "275917": 176062,
-  "392196": 234154,
-  "2226487": 2225220,
-  "1538465": 1065732,
-  "1047106": 248290,
-  "461503": 1287915,
-  "1311189": 1311200,
-  "1511042": 1331120,
-  "1614446": 453260,
-  "498703": 456558,
-  "1361034": 1350478,
-  "1382401": 1363946,
-  "1386725": 1365797,
-  "117856": 577816,
-  "1386718": 1376696,
-  "1386719": 1377081,
-  "632636": 612005,
-  "1748479": 1390481,
-  "893980": 420209,
-  "1361037": 1295237,
-  "1390830": 1328959,
-  "1331811": 1390829,
-  "1360247": 1358627,
-  "1426609": 1426611,
-  "1438924": 1434377,
-  "1434696": 1434459,
-  "1384159": 1447135,
-  "2293840": 1455689,
-  "550679": 1046833,
-  "2202430": 1470948,
-  "1514521": 1474886,
-  "1390832": 1389914,
-  "1292268": 1389914,
-  "1341861": 1431512,
-  "1446562": 1311188,
-  "545841": 1045997,
-  "1516172": 1489195,
-  "1604208": 1517436,
-  "505160": 169208,
-  "1475436": 1519644,
-  "505525": 419856,
-  "2296805": 532587,
-  "1604209": 1537430,
-  "1517439": 1386717,
-  "117892": 109593,
-  "1713035": 1565941,
-  "1603328": 1599138,
-  "1538462": 1604471,
-  "1605423": 1557072,
-  "1516942": 1500849,
-  "1633346": 1633349,
-  "1609043": 1635658,
-  "1644799": 1644800,
-  "1644797": 1644800,
-  "1703737": 1645669,
-  "1709784": 1645669,
-  "1754912": 1645861,
-  "1754911": 1645862,
-  "2153622": 1647404,
-  "1706321": 1706292,
-  "1662728": 1742041,
-  "1703738": 1742041,
-  "1743909": 1743910,
-  "1743909": 1743911,
-  "1743910": 1743911,
-  "1965969": 1966046,
-  "1969481": 1966952,
-  "1696852": 1967364,
-  "1974144": 1974143,
-  "1703732": 1630887,
-  "1705517": 1630887,
-  "2105540": 1981300,
-  "2001548": 1999498,
-  "2005900": 2007142,
-  "1955402": 2008723,
-  "2027521": 2026861,
-  "2036037": 2036036,
-  "2104465": 2036210,
-  "2041440": 2041829,
-  "2048186": 2047049,
-  "2053089": 2050951,
-  "2058238": 2057123,
-  "2223431": 2094394,
-  "2119310": 2119232,
-  "2233648": 2124426,
-  "2155560": 2133096,
-  "2147862": 2137267,
-  "2147863": 2146676,
-  "1741552": 1742056,
-  "1741551": 1742051,
-  "2160739": 2161058,
-  "2226488": 2197561,
-  "401985": 2226011,
-  "2238561": 2237705,
-  "2239319": 2239318,
-  "2209601": 2241354,
-  "2291518": 2242179,
-  "2268993": 2268992,
-  "2271157": 2271160,
-  "2298574": 2283136,
-  "979732": 2284606,
-  "2286286": 2285515,
-  "1382409": 2285529,
-  "2681020": 2292301,
-  "2298313": 2298763,
-  "2301857": 2301704,
-  "2684748": 2316953,
-  "2633775": 2622604,
-  "2646359": 2646403,
-  "2648592": 2646850,
-  "2688587": 2688527,
-  "2710230": 2694020,
-  "2703981": 2703983,
-  "2745035": 2744896,
-  "2745727": 2745724,
-  "2775887": 2775885,
-  "2453635": 854795,
-  "2480497": 790529,
-  "2842791": 2791641,
-  "2798742": 2801732,
-  "2805367": 2805495,
-  "2841700": 2813591,
-  "2842788": 2823368,
-  "2836900": 2836903,
-  "2839827": 2839232,
-  "2840881": 2841697,
-  "2841148": 2841699,
-  "2843056": 2843748,
-  "2882389": 2882160,
-  "2882388": 2882116,
-  "2882472": 2883215,
-  "2882548": 2883217,
-  "2882554": 2883218,
-  "2896498": 2891746,
-  "2919820": 2919818,
-  "420033": 344990,
-  "2148589": 2148631,
-  "2238507": 2621052,
-  "2022949": 2022781,
-  "814998": 1387969,
-  "1981101": 1980680,
-  "1369286": 1350519,
-  "2215209": 2204912,
-  "1953884": 1757585,
-  "1400749": 1392392,
-  "691653": 640764,
-  "366331": 366331,
-  "1046489": 298073,
-  "2925387": 1323559,
-  "886464": 846441,
-  "1703737": 1709784,
-  "1504257": 1748203,
-  "2132790": 1999457,
-  "1446266": 1451438,
-  "1703736": 1705521,
-  "2636829": 2637337,
-  "2790950": 2683237
-}
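For context on the new `--workers` flag added to `cli.py` above: a minimal sketch (not the kit's actual `Runner` code; `transform_entry` and `entries` are illustrative placeholders) of how a worker count can switch the transform step between sequential and threaded execution.

```python
# Hedged sketch only; the real threading lives in the kit's Runner/transform
# classes. Names below are placeholders for illustration.
from concurrent.futures import ThreadPoolExecutor


def transform_all(entries, transform_entry, workers=None):
    """Run sequentially by default, or fan out to a thread pool."""
    if not workers:
        return [transform_entry(entry) for entry in entries]
    with ThreadPoolExecutor(max_workers=workers) as pool:
        return list(pool.map(transform_entry, entries))
```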
""" +from pathlib import Path + import json import os from datetime import datetime, timedelta @@ -86,6 +88,7 @@ def _(x): # needed to avoid start time failure with lazy strings # See https://flask-sqlalchemy.palletsprojects.com/en/2.x/config/ SQLALCHEMY_DATABASE_URI = "postgresql+psycopg2://cds-rdm:cds-rdm@localhost/cds-rdm" +SQLALCHEMY_ENGINE_OPTIONS = {"connect_args": {"options": "-c timezone=UTC"}} # Invenio-App # =========== @@ -342,9 +345,7 @@ def _(x): # needed to avoid start time failure with lazy strings RDM_NAMESPACES = {**NAMESPACES} RDM_CUSTOM_FIELDS = CUSTOM_FIELDS -import cds_migrator_kit - -base_path = os.path.dirname(os.path.realpath(cds_migrator_kit.__file__)) +base_path = Path(__file__).resolve().parent.parent logs_dir = os.path.join(base_path, "tmp/logs/") CDS_MIGRATOR_KIT_LOGS_PATH = ( os.environ.get("INVENIO_CDS_MIGRATOR_KIT_LOGS_PATH") or logs_dir @@ -385,6 +386,7 @@ def _(x): # needed to avoid start time failure with lazy strings "validator": schemes.is_indico, "datacite": "INDICO", }, + "hal": {"label": "HAL", "validator": schemes.is_hal, "datacite": "HAL"}, }, # keep internal identifiers' schemes for internal record relations **RDM_RECORDS_IDENTIFIERS_SCHEMES, @@ -464,6 +466,10 @@ def resolve_record_pid(pid): CDS_MIGRATOR_KIT_ENV = "local" +CDS_MIGRATOR_KIT_VOCABULARIES_DIR = None +"""Absolute path to the vocabularies directory. Defaults to +{instance_path}/app_data/vocabularies when None.""" + CDS_ACCESS_GROUP_MAPPINGS = { "SSO": ["cern-accounts-primary"], "ITDepRestrFile": ["it-dep"], diff --git a/cds_migrator_kit/rdm/records/load/load.py b/cds_migrator_kit/rdm/records/load/load.py index 894090e5..7763aa6a 100644 --- a/cds_migrator_kit/rdm/records/load/load.py +++ b/cds_migrator_kit/rdm/records/load/load.py @@ -14,11 +14,9 @@ import arrow from cds_rdm.clc_sync.models import CDSToCLCSyncModel -from cds_rdm.components import MintAlternateIdentifierComponent from cds_rdm.legacy.models import CDSMigrationLegacyRecord from cds_rdm.legacy.resolver import get_pid_by_legacy_recid from cds_rdm.minters import legacy_recid_minter -from cds_rdm.tasks import sync_alternate_identifiers from flask import current_app from invenio_access.permissions import system_identity from invenio_accounts.models import User @@ -30,6 +28,8 @@ from invenio_rdm_records.proxies import current_rdm_records_service from invenio_records.systemfields.relations import InvalidRelationValue from marshmallow import ValidationError +from sqlalchemy.exc import IntegrityError +from psycopg2.errors import UniqueViolation from cds_migrator_kit.errors import ( CDSMigrationException, @@ -79,7 +79,7 @@ def _prepare(self, entry): """Prepare the record.""" pass - def _load_files(self, draft, entry, version_files): + def _load_files(self, draft, entry, version_files, uow=None): """Load files to draft.""" recid = entry.get("record", {}).get("recid", {}) identity = system_identity # Should we create an identity for the migration? 
diff --git a/cds_migrator_kit/rdm/records/load/load.py b/cds_migrator_kit/rdm/records/load/load.py
index 894090e5..7763aa6a 100644
--- a/cds_migrator_kit/rdm/records/load/load.py
+++ b/cds_migrator_kit/rdm/records/load/load.py
@@ -14,11 +14,9 @@
 import arrow
 from cds_rdm.clc_sync.models import CDSToCLCSyncModel
-from cds_rdm.components import MintAlternateIdentifierComponent
 from cds_rdm.legacy.models import CDSMigrationLegacyRecord
 from cds_rdm.legacy.resolver import get_pid_by_legacy_recid
 from cds_rdm.minters import legacy_recid_minter
-from cds_rdm.tasks import sync_alternate_identifiers
 from flask import current_app
 from invenio_access.permissions import system_identity
 from invenio_accounts.models import User
@@ -30,6 +28,8 @@
 from invenio_rdm_records.proxies import current_rdm_records_service
 from invenio_records.systemfields.relations import InvalidRelationValue
 from marshmallow import ValidationError
+from sqlalchemy.exc import IntegrityError
+from psycopg2.errors import UniqueViolation
 
 from cds_migrator_kit.errors import (
     CDSMigrationException,
@@ -79,7 +79,7 @@ def _prepare(self, entry):
         """Prepare the record."""
         pass
 
-    def _load_files(self, draft, entry, version_files):
+    def _load_files(self, draft, entry, version_files, uow=None):
         """Load files to draft."""
         recid = entry.get("record", {}).get("recid", {})
         identity = system_identity  # Should we create an identity for the migration?
@@ -103,6 +103,7 @@ def _load_files(self, draft, entry, version_files):
                         "access": {"hidden": False},
                     }
                 ],
+                uow=uow,
             )
             # TODO change to eos move or xrootd command instead of going through the app
             # TODO leave the init part to pre-create the destination folder
@@ -127,9 +128,10 @@ def _load_files(self, draft, entry, version_files):
                     draft.id,
                     file_data["key"],
                     import_legacy_files(file_data["eos_tmp_path"]),
+                    uow=uow,
                 )
                 result = current_rdm_records_service.draft_files.commit_file(
-                    identity, draft.id, file_data["key"]
+                    identity, draft.id, file_data["key"], uow=uow
                 )
                 legacy_checksum = f"md5:{file_data['checksum']}"
                 new_checksum = result.to_dict()["checksum"]
@@ -159,29 +161,21 @@ def _load_files(self, draft, entry, version_files):
                 self.migration_logger.add_log(exc, record=entry)
                 raise e
 
-    def _load_parent_access(self, draft, entry):
-        """Load access rights."""
+    def _load_parent_access_and_communities(self, draft, entry):
+        """Load access rights and communities in a single parent commit."""
         parent = draft._record.parent
-        # Set parent access from entry data
-        access = entry["parent"]["json"]["access"]
-        parent.access = access
-
+        parent.access = entry["parent"]["json"]["access"]
+        communities = entry["parent"]["json"]["communities"]["ids"]
+        for community in communities:
+            parent.communities.add(community)
+        parent.communities.default = entry["parent"]["json"]["communities"]["default"]
         parent.commit()
 
     def _load_record_access(self, draft, access_dict):
         record = draft._record
-
         record.access = access_dict["access_obj"]
         record.commit()
 
-    def _load_communities(self, draft, entry):
-        parent = draft._record.parent
-        communities = entry["parent"]["json"]["communities"]["ids"]
-        for community in communities:
-            parent.communities.add(community)
-        parent.communities.default = entry["parent"]["json"]["communities"]["default"]
-        parent.commit()
-
     def _after_publish_update_dois(self, identity, record, entry, uow):
         """Update migrated DOIs post publish."""
         migrated_pids = entry["record"]["json"]["pids"]
@@ -391,7 +385,7 @@ def _after_publish_update_created(self, record, entry, version):
         )
         record._record.model.created = creation_date
-        record._record.commit()
+        db.session.add(record._record.model)
 
     def _after_publish_mint_recid(self, record, entry, version):
         """Mint legacy ids for redirections assigned to the parent."""
@@ -414,7 +408,7 @@ def _after_publish_update_files_created(self, record, entry, version):
             file.model.created = arrow.get(file_data["creation_date"]).datetime.replace(
                 tzinfo=None
             )
-            file.commit()
+            db.session.add(file.model)
 
     def _after_publish(self, identity, published_record, entry, version, uow):
         """Run fixes after record publish."""
@@ -470,11 +464,12 @@ def _pre_publish(self, identity, entry, version, draft, uow):
             # we decided to skip it and act normal
             try:
                 draft = current_rdm_records_service.create(
-                    identity, data=entry["record"]["json"]
+                    identity, data=entry["record"]["json"], uow=uow
                 )
                 self._assign_rep_numbers(draft)
+            except (UniqueViolation, IntegrityError) as e:
+                raise ManualImportRequired(message=str(e))
             except Exception as e:
-
                 raise ManualImportRequired(message=str(e))
 
             if draft.errors:
                 raise ManualImportRequired(
@@ -486,11 +481,11 @@ def _pre_publish(self, identity, entry, version, draft, uow):
                     value=draft._record.pid.pid_value,
                     subfield=None,
                 )
-            # TODO we can use unit of work when it is moved to invenio-db module
-            self._load_parent_access(draft, entry)
-            self._load_communities(draft, entry)
+            self._load_parent_access_and_communities(draft, entry)
         else:
-            draft = current_rdm_records_service.new_version(identity, draft["id"])
+            draft = current_rdm_records_service.new_version(
+                identity, draft["id"], uow=uow
+            )
             draft_dict = draft.to_dict()
             missing_data = {
                 **draft_dict,
@@ -503,11 +498,11 @@ def _pre_publish(self, identity, entry, version, draft, uow):
                 },
             }
             draft = current_rdm_records_service.update_draft(
-                identity, draft["id"], data=missing_data
+                identity, draft["id"], data=missing_data, uow=uow
             )
 
         self._load_record_access(draft, access)
-        self._load_files(draft, entry, files)
+        self._load_files(draft, entry, files, uow=uow)
 
         return draft
 
@@ -665,6 +660,10 @@ def _load(self, entry):
         if entry:
             recid = entry.get("record", {}).get("recid", {})
             if self._should_skip_recid(recid):
+                self.migration_logger.add_information(
+                    recid, state={"message": "Record already migrated", "value": recid}
+                )
+                self.migration_logger.finalise_record(recid)
                 return
 
             self.clc_sync = deepcopy(entry.get("_clc_sync", False))
@@ -689,7 +688,15 @@ def _load(self, entry):
             except GrantCreationError as e:
                 self.migration_logger.add_log(e, record=entry)
             except (CDSMigrationException, ValidationError, InvalidRelationValue) as e:
-
+                exc = ManualImportRequired(
+                    message=str(e),
+                    field="validation",
+                    stage="load",
+                    recid=recid,
+                    priority="warning",
+                )
+                self.migration_logger.add_log(exc, record=entry)
+            except Exception as e:
                 exc = ManualImportRequired(
                     message=str(e),
                     field="validation",
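The `load.py` changes above thread one `uow` through `create`, `new_version`, `update_draft` and the draft-files calls, so a record version commits as a single unit. A minimal sketch of that pattern with invenio's `UnitOfWork` (the `data` payload is a placeholder):

```python
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_rdm_records.proxies import current_rdm_records_service
from invenio_records_resources.services.uow import UnitOfWork

data = {}  # placeholder record payload

with UnitOfWork(db.session) as uow:
    # every service call registers its operations on the shared uow ...
    draft = current_rdm_records_service.create(system_identity, data=data, uow=uow)
    current_rdm_records_service.draft_files.init_files(
        system_identity, draft.id, data=[{"key": "file.pdf"}], uow=uow
    )
    # ... and all registered operations are flushed by the single commit here
    uow.commit()
```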
diff --git a/cds_migrator_kit/rdm/records/transform/config.py b/cds_migrator_kit/rdm/records/transform/config.py
index 7dc96085..4a0f7f3b 100644
--- a/cds_migrator_kit/rdm/records/transform/config.py
+++ b/cds_migrator_kit/rdm/records/transform/config.py
@@ -34,6 +34,12 @@
     "WAI01",
     "KEK",
     "ATLATL",
+    "HOLALE",
+    "UNCOVER",
+    "INTINT",
+    "KEKSCAN",
+    "ALIALI",
+    "AIS.FOUNDATION.EXPERIMENTS",
 ]
 
 IDENTIFIERS_VALUES_TO_DROP = "oai:arXiv.org"
@@ -51,6 +57,8 @@
     "eucard",
     "inspec",
     "desy",
+    "medline",
+    "aip",
 ]
 
 KEYWORD_SCHEMES_TO_DROP = ["proquest", "disxa"]
diff --git a/cds_migrator_kit/rdm/records/transform/models/research.py b/cds_migrator_kit/rdm/records/transform/models/research.py
new file mode 100644
index 00000000..92bb71b2
--- /dev/null
+++ b/cds_migrator_kit/rdm/records/transform/models/research.py
@@ -0,0 +1,106 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-RDM research collection model."""
+from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
+    rdm_base_publication_model,
+)
+from cds_migrator_kit.transform.overdo import CdsOverdo
+
+
+class ResearchModel(CdsOverdo):
+    """Translation model for research."""
+
+    __query__ = '980__:L3_Papers OR 980__:INTNOTEALEPHPRIV OR 980__:OPAL_Papers OR 980__:OPAL_Misc OR 980__:DELPHI_Misc OR 980__:DELPHI_Papers OR 980__:L3_Misc OR 693__.e:L3 OR 693__.e:DELPHI OR 693__.e:OPAL OR 693__.e:ALEPH OR 690C_.a:PUBLDELPHINOTE OR 690C_.a:PRIVDELPHINOTE OR 710__.g:"ALEPH Collaboration" OR 710__.g:"Aleph Collaboration" OR 980__.a:ALEPH_Papers OR 980__.a:ALEPHDRAFT OR 037__:CERN-ALEPH-PUB-* OR 037__:CERN-ALEPH-ARCH-DATA-* OR 980__:LCD-Notes OR 980__:LCD-NOTES OR 693__.e:"DAMPE RE29" OR 037__:DIRAC-NOTE* OR 037__:DIRAC-Note* OR 037__:DIRAC-CONF* OR 037__:DIRAC-DOC* OR 037__:DIRAC-PUB* OR 693__:UA2 OR 693__:UA4 OR 693__:UA5 OR 693__:UA8 OR 980__:ANTARESCERNTALK OR (980__.a:"POSTER" AND 693__.e:ANTARES) OR 980__:INTNOTEHARPCDPPUBL OR 980__:PRIVIMXGAM OR 980__:PRIVANTARES -980__:THESIS -037__:CERN-STUDENTS-Note-* -980__:DELETED -980__.c:MIGRATED -980__.a:DUMMY -690C_.a:SCICOM'
+
+    __ignore_keys__ = {
+        "0248_a",
+        "0248_p",
+        "0248_q",
+        "0247_9",  # provenance of the DOI
+        "030__a",  # TODO coden designation to drop?
+        "035__h",  # oai identifiers in 1215391
+        "035__d",  # oai identifiers in 1215391
+        "035__t",  # oai identifiers in 1215391
+        "035__u",  # oai identifiers in 1215391
+        "035__m",  # oai identifiers in 1215391
+        "035__z",  # oai identifiers in 1215391
+        "500__9",  # provenance of the note
+        "520__9",  # provenance of the description
+        "520__h",  # provenance of the description
+        "852__c",  # holdings will be taken separately
+        "852__h",
+        "037__c",  # arxiv subject
+        "100__m",  # email of contributor
+        "245__9",  # title provenance
+        "270__m",  # document contact email
+        "300__a",  # number of pages
+        "340__a",  # TODO ignore material?
+        "540__3",  # TODO still ignore the material of the license?
+        "542__3",  # TODO still ignore the material of the license?
+        "595__i",  # TODO ??
+        "695__e",  # some inspire tag
+        "700__m",  # email of contributor
+        "700__q",  # TODO ignore? alliteration of the name, used for searching
+        "700__v",  # TODO drop?
+        "773__x",  # INSPIRE publication note
+        "773__t",  # INSPIRE publication note
+        "773__0",  # from SIS: can be ignored
+        "8564_8",  # file id
+        "8564_s",  # bibdoc id
+        "8564_x",  # icon thumbnails sizes
+        # "8564_y",  # file description - done by files dump
+        "8564_w",  # system field
+        "913__y",  # citation
+        "913__v",  # citation
+        "913__t",  # citation
+        "913__a",  # citation
+        "913__c",  # citation
+        "916__y",  # year, redundant value
+        "937__c",  # last modified by
+        "937__s",  # last modification date
+        "960__a",  # base number
+        "961__c",  #
+        "961__h",  #
+        "961__l",  #
+        "961__x",  #
+        "964__a",  # TODO: ignore?
+        "981__a",  # duplicate record id
+        "999C50",
+        "999C52",  # https://cds.cern.ch/record/2640188/export/hm?ln=en
+        "999C59",  # https://cds.cern.ch/record/2284615/export/hm?ln=en
+        "999C5a",  # https://cds.cern.ch/record/2678429/export/hm?ln=en
+        "999C5c",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5h",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5i",  # https://cds.cern.ch/record/2284892/export/hm?ln=en
+        "999C5k",  # https://cds.cern.ch/record/2671914/export/hm?ln=en
+        "999C5l",  # https://cds.cern.ch/record/2283115/export/hm?ln=en
+        "999C5m",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5o",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5p",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5r",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5s",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5t",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5u",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5v",  # https://cds.cern.ch/record/2283088/export/hm?ln=en
+        "999C5x",  # https://cds.cern.ch/record/2710809/export/hm?ln=en
+        "999C5y",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5z",  # https://cds.cern.ch/record/2710809/export/hm?ln=en
+        "999C6a",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C6t",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C6v",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+    }
+
+    _default_fields = {
+        "custom_fields": {},
+    }
+
+
+research_model = ResearchModel(
+    bases=(rdm_base_publication_model,),
+    entry_point_group="cds_migrator_kit.migrator.rdm.rules.publication",
+)
diff --git a/cds_migrator_kit/rdm/records/transform/models/research_committee.py b/cds_migrator_kit/rdm/records/transform/models/research_committee.py
new file mode 100644
index 00000000..57bbfc75
--- /dev/null
+++ b/cds_migrator_kit/rdm/records/transform/models/research_committee.py
@@ -0,0 +1,105 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2025 CERN.
+#
+# CDS-RDM is free software; you can redistribute it and/or modify it under
+# the terms of the MIT License; see LICENSE file for more details.
+
+"""CDS-RDM research committee model."""
+from cds_migrator_kit.rdm.records.transform.models.base_publication_record import (
+    rdm_base_publication_model,
+)
+from cds_migrator_kit.transform.overdo import CdsOverdo
+
+
+class ResearchCommitteeModel(CdsOverdo):
+    """Translation model for research committees."""
+
+    __query__ = '980__:SCICOMMPUBLDRDC OR 980__:SCICOMMPUBLEEC OR 980__:SCICOMMPUBLEMC OR 980__:SCICOMMPUBLISC OR 980__:SCICOMMPUBLISRC OR 980__:SCICOMMPUBLISTC OR 980__:SCICOMMPUBLLEPC OR 980__:SCICOMMPUBLNPRC OR 980__:SCICOMMPUBLNSC OR 980__:SCICOMMPUBLPHI OR 980__:SCICOMMPUBLPHIII OR 980__:SCICOMMPUBLPSC OR 980__:SCICOMMPUBLPSCC OR 980__:SCICOMMPUBLSCC OR 980__.a:SC_and_PS_Advisory_Committee OR (980__:SCICOMMPUBLSPSC AND 260__.c:"0000"->"1990") OR 980__:SCICOMMPUBLSPSLC OR 980__:SCICOMMPUBLTCC -037__:CERN-STUDENTS-Note-* -980__:THESIS -980__:thesis -980__:Thesis -980__:DELETED -980__.c:MIGRATED -980__.a:DUMMY'
+
+    __ignore_keys__ = {
+        "0248_a",
+        "0248_p",
+        "0248_q",
+        "0247_9",  # provenance of the DOI
+        "030__a",  # TODO coden designation to drop?
+        "035__h",  # oai identifiers in 1215391
+        "035__d",  # oai identifiers in 1215391
+        "035__t",  # oai identifiers in 1215391
+        "035__u",  # oai identifiers in 1215391
+        "035__m",  # oai identifiers in 1215391
+        "035__z",  # oai identifiers in 1215391
+        "500__9",  # provenance of the note
+        "520__9",  # provenance of the description
+        "520__h",  # provenance of the description
+        "852__c",  # holdings will be taken separately
+        "852__h",
+        "037__c",  # arxiv subject
+        "100__m",  # email of contributor
+        "245__9",  # title provenance
+        "300__a",  # number of pages
+        "340__a",  # TODO ignore material?
+        "540__3",  # TODO still ignore the material of the license?
+        "542__3",  # TODO still ignore the material of the license?
+        "595__i",  # TODO ??
+        "695__e",  # some inspire tag
+        "700__m",  # email of contributor
+        "700__q",  # TODO ignore? alliteration of the name, used for searching
+        "700__v",  # TODO drop?
+        "773__x",  # INSPIRE publication note
+        "8564_8",  # file id
+        "8564_s",  # bibdoc id
+        "8564_x",  # icon thumbnails sizes
+        "8564_y",  # file description - done by files dump
+        "8564_w",  # system field
+        "913__y",  # citation
+        "913__v",  # citation
+        "913__t",  # citation
+        "913__a",  # citation
+        "913__c",  # citation
+        "916__y",  # year, redundant value
+        "937__c",  # last modified by
+        "937__s",  # last modification date
+        "960__a",  # base number
+        "961__c",  #
+        "961__h",  #
+        "961__l",  #
+        "961__x",  #
+        "964__a",  # TODO: ignore?
+        "981__a",  # duplicate record id
+        "995__a",  # INSPIRE as value
+        "999C50",
+        "999C52",  # https://cds.cern.ch/record/2640188/export/hm?ln=en
+        "999C59",  # https://cds.cern.ch/record/2284615/export/hm?ln=en
+        "999C5a",  # https://cds.cern.ch/record/2678429/export/hm?ln=en
+        "999C5c",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5h",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5i",  # https://cds.cern.ch/record/2284892/export/hm?ln=en
+        "999C5k",  # https://cds.cern.ch/record/2671914/export/hm?ln=en
+        "999C5l",  # https://cds.cern.ch/record/2283115/export/hm?ln=en
+        "999C5m",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5o",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5p",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5r",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5s",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5t",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5u",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5v",  # https://cds.cern.ch/record/2283088/export/hm?ln=en
+        "999C5x",  # https://cds.cern.ch/record/2710809/export/hm?ln=en
+        "999C5y",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C5z",  # https://cds.cern.ch/record/2710809/export/hm?ln=en
+        "999C6a",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C6t",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+        "999C6v",  # https://cds.cern.ch/record/2284606/export/hm?ln=en
+    }
+
+    _default_fields = {
+        "custom_fields": {},
+        "resource_type": {"id": "publication-other"},
+    }
+
+
+research_comm_model = ResearchCommitteeModel(
+    bases=(rdm_base_publication_model,),
+    entry_point_group="cds_migrator_kit.migrator.rdm.rules.publication",
+)
diff --git a/cds_migrator_kit/rdm/records/transform/transform.py b/cds_migrator_kit/rdm/records/transform/transform.py
index f2fb96bc..4fdad6a5 100644
--- a/cds_migrator_kit/rdm/records/transform/transform.py
+++ b/cds_migrator_kit/rdm/records/transform/transform.py
@@ -13,12 +13,12 @@
 from pathlib import Path
 
 import arrow
+import yaml
 from cds_rdm.legacy.models import CDSMigrationAffiliationMapping
 from dateutil.parser import ParserError, parse
 from flask import current_app
 from idutils import normalize_ror
 from idutils.validators import is_doi, is_ror
-from invenio_access.permissions import system_identity
 from invenio_accounts.models import User, UserIdentity
 from invenio_db import db
 from invenio_pidstore.models import PersistentIdentifier, PIDStatus
@@ -26,9 +26,8 @@
     RDMRecordEntry,
     RDMRecordTransform,
 )
-from invenio_records_resources.proxies import current_service_registry
+from invenio_vocabularies.contrib.affiliations.models import AffiliationsMetadata
 from invenio_vocabularies.contrib.names.models import NamesMetadata
-from opensearchpy import RequestError
 from sqlalchemy.exc import NoResultFound
 
 from cds_migrator_kit.errors import (
@@ -49,32 +48,73 @@
     PIDS_SCHEMES_ALLOWED,
     PIDS_SCHEMES_TO_DROP,
 )
-from cds_migrator_kit.reports.log import MigrationProgressLogger, RecordStateLogger
 from cds_migrator_kit.transform.dumper import CDSRecordDump
 from cds_migrator_kit.transform.errors import LossyConversion
 
 cli_logger = logging.getLogger("migrator")
 
+_VOCAB_FILENAMES = {
+    "experiments": "experiments.yaml",
+    "departments": "departments.yaml",
+    "programmes": "programmes.yaml",
+    "accelerators": "accelerators.yaml",
+    "beams": "beams.yaml",
+}
+
+
+class VocabularyCache:
+    """Vocabulary lookup cache loaded once from YAML files at startup."""
+
+    def __init__(self, vocab_dir):
+        """Load all vocabularies from the given directory into memory."""
+        self._cache = {}
+        vocab_dir = Path(vocab_dir)
+        for vocab_type, filename in _VOCAB_FILENAMES.items():
+            self._cache[vocab_type] = self._load(vocab_dir / filename)
+
+    @staticmethod
+    def _load(filepath):
+        """Build a case-insensitive term→id lookup from a vocabulary YAML."""
+        with open(filepath) as f:
+            entries = yaml.safe_load(f)
+        lookup = {}
+        for entry in entries:
+            entry_id = entry["id"]
+            lookup[entry_id.lower()] = entry_id
+            title = entry.get("title", {}).get("en", "")
+            if title and title.lower() != entry_id.lower():
+                lookup[title.lower()] = entry_id
+        return lookup
+
+    def get(self, term, vocab_type):
+        """Return {"id": vocab_id} if term matches, else None."""
+        entry_id = self._cache[vocab_type].get(term.strip().lower())
+        return {"id": entry_id} if entry_id else None
+
+
+_vocabulary_cache = None
+
+
+def _get_vocabulary_cache():
+    """Return the process-wide VocabularyCache, creating it on first use."""
+    global _vocabulary_cache
+    if _vocabulary_cache is None:
+        vocab_dir = current_app.config.get("CDS_MIGRATOR_KIT_VOCABULARIES_DIR")
+        if vocab_dir is None:
+            import cds_rdm
+
+            vocab_dir = Path(cds_rdm.__file__).parent / "app_data" / "vocabularies"
+        else:
+            vocab_dir = Path(vocab_dir)
+        _vocabulary_cache = VocabularyCache(vocab_dir)
+    return _vocabulary_cache
+
 
 def search_vocabulary(term, vocab_type):
-    """Search vocabulary utility function."""
-    service = current_service_registry.get("vocabularies")
-    if "/" in term:
-        # escape the slashes
-        term = f'"{term}"'
-    try:
-        vocabulary_result = service.search(
-            system_identity, type=vocab_type, q=f'"{term}"'
-        ).to_dict()
-        return vocabulary_result
-    except RequestError:
-        raise UnexpectedValue(
-            subfield="a",
-            value=term,
-            field=vocab_type,
-            message=f"Vocabulary {vocab_type} term {term} not valid search phrase.",
-            stage="vocabulary match",
-        )
+    """Look up a vocabulary term using the pre-loaded YAML cache.
+
+    Returns {"id": vocab_id} if found, else None.
+    """
+    return _get_vocabulary_cache().get(term, vocab_type)
 
 
 class CDSToRDMRecordEntry(RDMRecordEntry):
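The cache turns each vocabulary search into an in-memory dictionary lookup. Assuming an `experiments.yaml` shaped like the bundled vocabularies (each entry with an `id` and an English `title`), the matching rules are:

```python
# Assumes a directory containing all five YAML files from _VOCAB_FILENAMES;
# the entry below is illustrative, not a confirmed vocabulary record.
# experiments.yaml: [{"id": "ATLAS", "title": {"en": "ATLAS Experiment"}}]
cache = VocabularyCache("/path/to/vocabularies")

cache.get("atlas", "experiments")               # {'id': 'ATLAS'} (id match)
cache.get(" ATLAS Experiment ", "experiments")  # {'id': 'ATLAS'} (title match)
cache.get("unknown", "experiments")             # None: caller falls back to subjects
```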
@@ -150,8 +190,6 @@ def _media_files(self, entry):
         return {}
 
     def _pids(self, json_entry):
-        from flask import current_app
-
         DATACITE_PREFIX = current_app.config["DATACITE_PREFIX"]
 
         pids = json_entry.get("_pids", {})
@@ -195,7 +233,7 @@ def _files(self, record_dump):
         """Transform the files of a record."""
         record_dump.prepare_files()
         files = record_dump.files
-        return {"enabled": True if files else False}
+        return {"enabled": bool(files)}
 
     def _communities(self, json_entry):
         return json_entry.get("communities", [])
@@ -208,7 +246,7 @@ def _owner(self, json_entry):
             user = User.query.filter_by(email=email).one()
             return user.id
         except NoResultFound:
-            return UnexpectedValue(
+            raise UnexpectedValue(
                 message=f"{email} not found - did you run user migration?",
                 stage="transform",
                 recid=json_entry["legacy_recid"],
@@ -216,9 +254,24 @@ def _owner(self, json_entry):
                 priority="critical",
             )
 
-    def _match_affiliation(self, affiliation_name):
+    def _match_affiliation(self, affiliation_name, json_entry):
         """Match an affiliation against `CDSMigrationAffiliationMapping` db table."""
         if is_ror(affiliation_name):
+            ror = normalize_ror(affiliation_name)
+            name = AffiliationsMetadata.query.filter_by(pid=ror).one_or_none()
+            if name is None:
+                raise ManualImportRequired(
+                    message="Affiliation {ror} does not exist in the AffiliationsMetadata table".format(
+                        ror=ror
+                    ),
+                    field="validation",
+                    stage="transform",
+                    description="Add this affiliation",
+                    recid=json_entry["recid"],
+                    priority="critical",
+                    value=None,
+                    subfield=None,
+                )
             return {"id": ror}
         # Step 1: search in the affiliation mapping (ROR organizations)
         match = self.affiliations_mapping.query.filter_by(
@@ -229,10 +282,10 @@ def _match_affiliation(self, affiliation_name):
         if match.curated_affiliation:
             return match.curated_affiliation
         # Step 2: check if there is an exact match
-        elif match.ror_exact_match:
+        if match.ror_exact_match:
             return {"id": normalize_ror(match.ror_exact_match)}
         # Step 3: check if there is a non-exact match
-        elif match.ror_not_exact_match:
+        if match.ror_not_exact_match:
             _affiliation_ror_id = normalize_ror(match.ror_not_exact_match)
             raise RecordFlaggedCuration(
                 subfield="u",
@@ -241,24 +294,15 @@ def _match_affiliation(self, affiliation_name):
                 message=f"Affiliation {_affiliation_ror_id} not found as an exact match, ROR id should be checked.",
                 stage="vocabulary match",
             )
-        else:
-            # Step 4: set the originally inserted value from legacy
-            raise RecordFlaggedCuration(
-                subfield="u",
-                value={"name": affiliation_name},
-                field="author",
-                message=f"Affiliation {affiliation_name} not found as an exact match, custom value should be checked.",
-                stage="vocabulary match",
-            )
-        else:
-            # Step 4: set the originally inserted value from legacy
-            raise RecordFlaggedCuration(
-                subfield="u",
-                value={"name": affiliation_name},
-                field="author",
-                message=f"Affiliation {affiliation_name} not found as an exact match, custom value should be checked.",
-                stage="vocabulary match",
-            )
+        # Step 4: set the originally inserted value from legacy (no match, or match
+        # found but has no ROR id of any kind)
+        raise RecordFlaggedCuration(
+            subfield="u",
+            value={"name": affiliation_name},
+            field="author",
+            message=f"Affiliation {affiliation_name} not found as an exact match, custom value should be checked.",
+            stage="vocabulary match",
+        )
 
     def _metadata(self, json_entry, record_dump):
@@ -268,15 +312,18 @@ def creator_affiliations(creator):
             for affiliation_name in affiliations:
                 try:
-                    affiliation = self._match_affiliation(affiliation_name)
-                    transformed_aff.append(affiliation)
+                    affiliation = self._match_affiliation(affiliation_name, json_entry)
+                    if affiliation not in transformed_aff:
+                        transformed_aff.append(affiliation)
                 except RecordFlaggedCuration as exc:
                     # Save not exact match affiliation and reraise to flag the record
                     self.migration_logger.add_information(
                         json_entry["recid"],
                         {"message": exc.message, "value": exc.value},
                     )
-                    transformed_aff.append(exc.value)
+                    aff = {"name": affiliation_name}
+                    if aff not in transformed_aff:
+                        transformed_aff.append(aff)
             creator["affiliations"] = transformed_aff
 
         def creator_identifiers(creator):
@@ -419,6 +466,17 @@ def table_of_contents(json_entry):
                 json_entry["additional_descriptions"] = additional_desc
             json_entry.pop("table_of_content")
 
+        def subjects(json_entry):
+            _subjects = json_entry.get("subjects")
+            if _subjects:
+                for subject in reversed(_subjects):
+                    if subject.get("subject", "").lower() in ["xx", "talk"]:
+                        _subjects.remove(subject)
+                    elif subject.get("id", "").lower() in ["xx", "talk"]:
+                        _subjects.remove(subject)
+            return _subjects
+
+        _subjects = subjects(json_entry)
         table_of_contents(json_entry)
 
         metadata = {
@@ -428,7 +486,7 @@
             "description": json_entry.get("description"),
             "publication_date": _publication_date(json_entry, record_dump),
             "contributors": creators(json_entry, key="contributors"),
-            "subjects": json_entry.get("subjects"),
+            "subjects": _subjects,
             "publisher": json_entry.get("publisher"),
             "additional_descriptions": json_entry.get("additional_descriptions"),
             "additional_titles": json_entry.get("additional_titles"),
@@ -472,15 +530,12 @@ def field_experiments(record_json, custom_fields_dict):
                 "cern:experiments", []
             )
             for experiment in experiments:
-                if experiment.lower().strip() == "not applicable":
+                if experiment.lower().strip() in ["not applicable", "xx"]:
                     continue
                 result = search_vocabulary(experiment, "experiments")
-
-                if result["hits"]["total"]:
-                    custom_fields_dict["cern:experiments"].append(
-                        {"id": result["hits"]["hits"][0]["id"]}
-                    )
-                else:
+                if result and result not in custom_fields_dict["cern:experiments"]:
+                    custom_fields_dict["cern:experiments"].append(result)
+                elif not result:
                     subj = json_output["metadata"].get("subjects", [])
                     subj.append({"subject": experiment})
                     json_output["metadata"]["subjects"] = subj
@@ -496,9 +551,8 @@ def field_programmes(record_json):
             programme = record_json.get("custom_fields", {}).get("cern:programmes")
             if programme:
                 result = search_vocabulary(programme, "programmes")
-
-                if result["hits"]["total"]:
-                    return {"id": result["hits"]["hits"][0]["id"]}
+                if result:
+                    return result
                 else:
                     raise UnexpectedValue(
                         value=programme,
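The reworked `field_departments` in the next hunk first strips a hyphenated unit down to its prefix before the lookup; for example (the values are illustrative, not confirmed vocabulary entries):

```python
# "PH-EP" is an illustrative legacy value.
department = "PH-EP"
dep = department.split("-")[0]                  # "PH": only the prefix is matched
result = search_vocabulary(dep, "departments")  # {"id": "PH"} if present, else None
```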
json_output["metadata"]["subjects"] = subj + custom_fields_dict["cern:administrative_unit"] = department raise RecordFlaggedCuration( subfield="a", value=department, field="department", - message=f"Department {department} not found. added as subject", + message=f"Department {department} not found. " + f"Added as unit and subject", stage="vocabulary match", ) @@ -542,13 +601,9 @@ def field_accelerators(record_json, custom_fields_dict): if accelerator.lower().strip() in ["not applicable", "xx"]: continue result = search_vocabulary(accelerator, "accelerators") - if result["hits"]["total"]: - - custom_fields_dict["cern:accelerators"].append( - {"id": result["hits"]["hits"][0]["id"]} - ) - - else: + if result and result not in custom_fields_dict["cern:accelerators"]: + custom_fields_dict["cern:accelerators"].append(result) + elif not result: raise UnexpectedValue( subfield="a", value=accelerator, @@ -563,12 +618,9 @@ def field_beams(record_json, custom_fields_dict): if beam.lower().strip() == "not applicable": continue result = search_vocabulary(beam, "beams") - if result["hits"]["total"]: - custom_fields_dict["cern:beams"].append( - {"id": result["hits"]["hits"][0]["id"]} - ) - - else: + if result and result not in custom_fields_dict["cern:beams"]: + custom_fields_dict["cern:beams"].append(result) + elif not result: raise UnexpectedValue( subfield="a", value=beam, @@ -577,34 +629,24 @@ def field_beams(record_json, custom_fields_dict): stage="vocabulary match", ) + _cf = json_entry.get("custom_fields", {}) custom_fields = { "cern:experiments": [], "cern:departments": [], "cern:accelerators": [], - "cern:administrative_unit": json_entry.get("custom_fields", {}).get( - "cern:administrative_unit", [] - ), - "cern:projects": json_entry.get("custom_fields", {}).get( - "cern:projects", [] - ), - "cern:facilities": json_entry.get("custom_fields", {}).get( - "cern:facilities", [] - ), - "cern:studies": json_entry.get("custom_fields", {}).get("cern:studies", []), + "cern:administrative_unit": _cf.get("cern:administrative_unit", []), + "cern:projects": _cf.get("cern:projects", []), + "cern:facilities": _cf.get("cern:facilities", []), + "cern:studies": _cf.get("cern:studies", []), "cern:beams": [], "cern:programmes": field_programmes(json_entry), - "thesis:thesis": json_entry.get("custom_fields", {}).get( - "thesis:thesis", {} - ), - "journal:journal": json_entry.get("custom_fields", {}).get( - "journal:journal", {} - ), - "imprint:imprint": json_entry.get("custom_fields", {}).get( - "imprint:imprint", {} - ), - "meeting:meeting": json_entry.get("custom_fields", {}).get( - "meeting:meeting", {} - ), + "cern:committees": _cf.get("cern:committees"), + "cern:oa_level": _cf.get("cern:oa_level"), + "cern:oa_funding_model": _cf.get("cern:oa_funding_model"), + "thesis:thesis": _cf.get("thesis:thesis", {}), + "journal:journal": _cf.get("journal:journal", {}), + "imprint:imprint": _cf.get("imprint:imprint", {}), + "meeting:meeting": _cf.get("meeting:meeting", {}), } try: field_experiments(json_entry, custom_fields) @@ -868,57 +910,57 @@ def compute_access(file, record_access): def compute_files(file_dump, versions_dict): legacy_path_root = Path("/opt/cdsweb/var/data/files/") tmp_eos_root = Path(self.files_dump_dir) - full_path = Path(file["full_path"]) + full_path = Path(file_dump["full_path"]) - if file["subformat"] in FILE_SUBFORMATS_TO_DROP: + if file_dump["subformat"] in FILE_SUBFORMATS_TO_DROP: self.migration_logger.add_information( - str(file["recid"]), + str(file_dump["recid"]), { - "message": f"File 
subformat {file['subformat']} dropped.", - "value": file["full_name"], + "message": f"File subformat {file_dump['subformat']} dropped.", + "value": file_dump["full_name"], }, ) return - if not self.plots and file["type"] == "Plot": + if not self.plots and file_dump["type"] == "Plot": # skip figures if configuration says so self.migration_logger.add_information( - str(file["recid"]), + str(file_dump["recid"]), { "message": f"Plot file dropped.", - "value": file["full_name"], + "value": file_dump["full_name"], }, ) return - if file["hidden"]: + if file_dump["hidden"]: # skip hidden files self.migration_logger.add_information( - str(file["recid"]), + str(file_dump["recid"]), { "message": f"Hidden file dropped.", - "value": file["full_name"], + "value": file_dump["full_name"], }, ) versions_dict[file_dump["version"]]["files"].update( { - file["full_name"]: { + file_dump["full_name"]: { "eos_tmp_path": tmp_eos_root / full_path.relative_to(legacy_path_root), - "id_bibdoc": file["bibdocid"], - "key": file["full_name"], + "id_bibdoc": file_dump["bibdocid"], + "key": file_dump["full_name"], "metadata": { - "description": file["description"], - "name": file["name"], - "status": file["status"], - "comment": file["comment"], + "description": file_dump["description"], + "name": file_dump["name"], + "status": file_dump["status"], + "comment": file_dump["comment"], }, - "mimetype": file["mime"], - "checksum": file["checksum"], - "version": file["version"], - "access": file["status"], - "type": file["type"], - "creation_date": arrow.get(file["creation_date"]) + "mimetype": file_dump["mime"], + "checksum": file_dump["checksum"], + "version": file_dump["version"], + "access": file_dump["status"], + "type": file_dump["type"], + "creation_date": arrow.get(file_dump["creation_date"]) .replace(tzinfo=None) .date() .isoformat(), @@ -974,16 +1016,23 @@ def _record_files(self, entry, record): # TO implement if we decide not to go via draft publish return [] + def _load_migrated_recids(self): + """Load all already-migrated legacy record IDs into a set once.""" + return { + pid.pid_value + for pid in PersistentIdentifier.query.filter_by( + pid_type="lrecid", + status=PIDStatus.REGISTERED, + ).all() + } + def should_skip(self, entry): - pid = PersistentIdentifier.query.filter_by( - pid_type="lrecid", - pid_value=str(entry["recid"]), - status=PIDStatus.REGISTERED, - ).one_or_none() - return pid is not None + return str(entry["recid"]) in self._migrated_recids def run(self, entries): """Run transformation step.""" + self._migrated_recids = self._load_migrated_recids() + for entry in entries: if self.should_skip(entry): if current_app.config["CDS_MIGRATOR_KIT_ENV"] == "local": diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py index 9a743263..15224526 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/base.py @@ -172,6 +172,8 @@ def validate_subject_scheme(subject_scheme, subfield, key): raise IgnoreKey("subjects") else: subject_value = val_a.strip() + if subject_value.lower() == "xx": + raise IgnoreKey("subjects") _subjects = self.get("subjects", []) # invalid schema = euproject info scheme = scheme if validate_subject_scheme(scheme, subfield, key) == "eu": @@ -316,6 +318,9 @@ def report_number(self, key, value): raise IgnoreKey("identifiers") elif scheme.upper() == "CERN LIBRARY": raise IgnoreKey("identifiers") + elif 
scheme.startswith("SCOO"): + identifier = scheme + scheme = "other" else: raise UnexpectedValue("Missing ID value", field=key, value=value) new_id = {"scheme": scheme, "identifier": identifier} @@ -420,6 +425,8 @@ def identifiers(self, key, value): ): return rel_id else: + if "HOLALE" in id_value: + raise IgnoreKey("identifiers") raise UnexpectedValue( field=key, value=value, subfield="9", message="Invalid scheme" ) @@ -434,13 +441,18 @@ def _pids(self, key, value): scheme = value.get("2", "").lower() qualifier = value.get("q", "").lower().strip() identifier = value.get("a") + if not identifier: + identifier = value.get("A") + if not identifier: + raise UnexpectedValue( + "Missing identifier value", field=key, subfield="a", stage="transform" + ) if not scheme: scheme = value.get("9", "").lower() if not scheme: raise UnexpectedValue( "Missing identifier scheme", field=key, subfield="2", stage="transform" ) - is_doi_id = is_doi(identifier) is_handle_id = not is_doi_id and is_handle(identifier) if not is_doi_id and is_handle_id and (scheme == "doi" or scheme == "urn/hdl"): @@ -491,7 +503,7 @@ def _pids(self, key, value): raise IgnoreKey("_pids") -@model.over("contributors", "^710__") +@model.over("creators", "^710__") @for_each_value def corporate_author(self, key, value): """Translates corporate author.""" @@ -500,13 +512,13 @@ def corporate_author(self, key, value): if name.strip() == "CERN. Geneva": name = "CERN" + if "CERN. Geneva." in name: + name = name.replace("CERN. Geneva.", "CERN") contributor = { "person_or_org": { "type": "organizational", "name": StringValue(name).parse(), - "family_name": StringValue(name).parse(), }, - "role": {"id": "hostinginstitution"}, } return contributor if "5" in value: @@ -754,7 +766,12 @@ def yellow_reports(self, key, value): return new_id if scheme.lower() == "pacs": raise IgnoreKey("related_identifiers") - if not scheme and identifier.startswith("CERN-"): + if not scheme and ( + identifier.startswith("CERN-") + or identifier.startswith("EEC-") + or identifier.startswith("NPRC-") + or identifier.startswith("LEP-") + ): # report number new_id = { "identifier": identifier, @@ -933,7 +950,8 @@ def process(_note): _note = force_list(value.get("a", "")) _note_z = force_list(value.get("z", "")) - notes_list = _note_z + _note + _note_d = force_list(value.get("d", "")) + notes_list = _note_z + _note + _note_d _note_b = value.get("b", "") _note_c = value.get("c", "") @@ -966,6 +984,7 @@ def additional_titles(self, key, value): """Translates additional titles.""" additional_desc_text = value.get("p") + volume = value.get("n") if additional_desc_text: _additional_descriptions = self.get("additional_descriptions", []) _additional_descriptions.append( @@ -1018,6 +1037,7 @@ def additional_titles(self, key, value): "lang": {"id": "eng"}, } return _additional_title + raise IgnoreKey("additional_titles") diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py index fe02cd72..108a8c61 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/hr.py @@ -365,11 +365,7 @@ def title(self, key, value): self["additional_titles"] = alt_titles identifiers = self.get("identifiers", []) rep_num = next( - ( - identifier - for identifier in identifiers - if identifier["scheme"] == "cdsrn" - ), + (identifier for identifier in identifiers if identifier["scheme"] == "cdsrn"), {}, ).get("identifier") @@ -393,7 
+389,6 @@ def title(self, key, value): return title - @model.over("meeting_cf", "^773__") @for_each_value def meeting(self, key, value): diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py index 35218b63..d547d8dd 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/publications.py @@ -20,7 +20,35 @@ udc_pattern, ) from ...models.base_publication_record import rdm_base_publication_model as model +from .base import licenses as _base_licenses from .base import normalize +from .base import note as _base_note +from .base import urls as _base_urls + +# Unwrapped base functions (strip @for_each_value to avoid double-wrapping). +# licenses also has @filter_values beneath @for_each_value, so two levels deep. +_raw_licenses = ( + _base_licenses.__wrapped__ +) # filter_values(raw) — handles None filtering +_raw_note = _base_note.__wrapped__ # raw note function +_raw_urls = _base_urls.__wrapped__ # raw urls function + +_FUNDING_MODEL_MAP = { + "scoap3": "scoap3", + "collective": "collective", + "cern-rp": "cern-rp", + "cern-apc": "cern-apc", + "other": "other", +} + +# Lower number = higher priority +_OA_LEVEL_PRIORITY = {"gold": 0, "bronze": 1, "green": 2, "closed": 3} + + +def _sub(v, code): + """Return first string value of a MARC subfield, handling dojson tuple packing.""" + val = force_list(v.get(code)) + return val[0] if val else "" @model.over("isbns", "^020__") @@ -99,6 +127,29 @@ def udc(self, key, value): ) +@model.over("creators", "(^110__)") +@for_each_value +def corpo_author(self, key, value): + author = value.get("a", "").strip() + if not author: + raise UnexpectedValue(subfield="a", value=value, field=key) + author = {"person_or_org": {"type": "organizational", "name": author}} + if author not in self.get("creators", []): + return author + raise IgnoreKey("creators") + + +@model.over("imprint_info", "(^250__)") +@for_each_value +@require(["a"]) +def imprint(self, key, value): + """Translates additional description.""" + _custom_fields = self.setdefault("custom_fields", {}) + imprint = _custom_fields.setdefault("imprint:imprint", {}) + imprint["edition"] = StringValue(value.get("a")).parse() + raise IgnoreKey("imprint_info") + + @model.over("publication_date", "(^260__)", override=True) def imprint_info(self, key, value): """Translates imprint - WARNING - also publisher and publication_date. 
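The `__wrapped__` unwrapping above relies on `functools.wraps` recording the undecorated function on the wrapper. A minimal illustration of the mechanism (a toy decorator, not dojson's actual one):

```python
import functools


def for_each_value(f):
    """Toy stand-in for the dojson decorator."""

    @functools.wraps(f)  # sets wrapper.__wrapped__ = f
    def wrapper(self, key, values):
        return [f(self, key, v) for v in values]

    return wrapper


@for_each_value
def rule(self, key, value):
    return value.upper()


raw = rule.__wrapped__  # the single-value function, free of the list handling
assert raw(None, "540__", "gold") == "GOLD"
```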
@@ -186,6 +237,14 @@ def funding(self, key, value):
     raise IgnoreKey("funding")
 
 
+@model.over("_approval", "(^591__)")
+def status(self, key, value):
+    _status = value.get("b", "").lower().strip()
+    if _status == "approved":
+        raise IgnoreKey("_approval")
+    raise UnexpectedValue("Unexpected status value", field=key, value=value)
+
+
 @model.over("custom_fields", "(^773__)")
 def journal(self, key, value):
     _custom_fields = self.get("custom_fields", {})
@@ -200,10 +259,13 @@ def journal(self, key, value):
             break
 
     conference_cnum = value.get("w", "")
+    conference_acronym = value.get("q", "")
+    custom_meeting_fields = _custom_fields.get("meeting:meeting", {})
     if conference_cnum:
-        custom_meeting_fields = _custom_fields.get("meeting:meeting", {})
         identifiers = custom_meeting_fields.get("identifiers", [])
         identifiers.append({"scheme": "inspire", "identifier": conference_cnum})
+    if conference_acronym:
+        custom_meeting_fields["acronym"] = conference_acronym
 
     pub_date = self.get("publication_date")
     # if we only have 773 in the record and no other journal fields,
@@ -220,6 +282,135 @@ def journal(self, key, value):
     return _custom_fields
 
 
+@model.over("_oa_license", "^540__", override=True)
+def oa_level_from_license(self, key, value):
+    """Detect OA level and funding model; also runs base license logic for rights.
+
+    540__a: license identifier ('CC BY', 'CC-BY' → gold if 540__3='publication')
+    540__f: 'Bronze' → bronze OA level;
+            'SCOAP3'|'Collective'|'CERN-RP'|'CERN-APC'|'Other' → funding model
+    540__3: 'publication' required for gold; 'preprint' alone → green
+    """
+    _custom_fields = self.get("custom_fields", {})
+    rights = self.get("rights", [])
+
+    for v in force_list(value):
+        qualifier = _sub(v, "f").strip()
+        scope = _sub(v, "3").strip().lower()
+
+        # Check ALL 'a' subfields: dojson packs repeated subfields as a tuple.
+        a_vals = force_list(v.get("a")) or ()
+        is_cc_by = any(a.strip().lower() in ["cc by", "cc-by"] for a in a_vals)
+        is_bronze = qualifier.lower() == "bronze"
+        is_publication_scope = scope == "publication"
+        is_preprint_scope = scope == "preprint"
+
+        funding_model_id = _FUNDING_MODEL_MAP.get(qualifier.lower())
+
+        current_level = (_custom_fields.get("cern:oa_level") or {}).get("id")
+        current_priority = _OA_LEVEL_PRIORITY.get(current_level, 99)
+
+        new_level = None
+        if is_cc_by and is_publication_scope:
+            new_level = "gold"
+        elif is_bronze:
+            new_level = "bronze"
+        elif is_preprint_scope:
+            new_level = "green"
+
+        if new_level and _OA_LEVEL_PRIORITY[new_level] < current_priority:
+            _custom_fields["cern:oa_level"] = {"id": new_level}
+
+        if funding_model_id and not _custom_fields.get("cern:oa_funding_model"):
+            _custom_fields["cern:oa_funding_model"] = {"id": funding_model_id}
+
+        # Base license logic: expand repeated 'a' subfields into individual calls
+        # because clean_val raises UnexpectedValue for tuple values by default.
+        for a_val in a_vals:
+            if not a_val:
+                continue
+            license_result = _raw_licenses(self, key, dict(v, a=a_val))
+            if license_result and license_result not in rights:
+                rights.append(license_result)
+
+    self["custom_fields"] = _custom_fields
+    if rights:
+        self["rights"] = rights
+    raise IgnoreKey("_oa_license")
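Because all three OA rules (540, 595, 8564) compare against `_OA_LEVEL_PRIORITY` before writing, repeated fields can only raise the level, never lower it. A worked example of the shared upgrade logic:

```python
_OA_LEVEL_PRIORITY = {"gold": 0, "bronze": 1, "green": 2, "closed": 3}

level = None                                     # nothing detected yet
for candidate in ["closed", "green", "bronze"]:  # e.g. 595, 8564, 540, in any order
    current = _OA_LEVEL_PRIORITY.get(level, 99)
    if _OA_LEVEL_PRIORITY[candidate] < current:
        level = candidate                        # closed -> green -> bronze
assert level == "bronze"                         # a later "closed" cannot downgrade it
```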
+ The 8564 rule can still upgrade tentative 'closed' to green. + """ + for v in force_list(value): + note_text = _sub(v, "a").strip().lower() + if note_text == "for annual report": + _custom_fields = self.get("custom_fields", {}) + if not _custom_fields.get("cern:oa_level"): + _custom_fields["cern:oa_level"] = {"id": "closed"} + self["custom_fields"] = _custom_fields + + # Delegate base note logic — raises IgnoreKey("internal_notes") on success + try: + _raw_note(self, key, v) + except IgnoreKey: + pass + + raise IgnoreKey("_oa_annual_report") + + +@model.over("_oa_url", "^8564[1_]", override=True) +def oa_level_from_url(self, key, value): + """Detect green OA from preprint/manuscript file links; also runs base URL logic. + + 8564_y: 'preprint' or 'manuscript' → green OA level. + Overrides tentative 'closed' (from 595 rule) but not gold/bronze/green already set. + """ + rel_ids = self.get("related_identifiers", []) + + for v in force_list(value): + sub_y = _sub(v, "y").strip().lower() + if sub_y in ["preprint", "manuscript"]: + _custom_fields = self.get("custom_fields", {}) + current_level = (_custom_fields.get("cern:oa_level") or {}).get("id") + current_priority = _OA_LEVEL_PRIORITY.get(current_level, 99) + if _OA_LEVEL_PRIORITY["green"] < current_priority: + _custom_fields["cern:oa_level"] = {"id": "green"} + self["custom_fields"] = _custom_fields + + # Delegate base URL logic — requires self["recid"] which is always set + # in production (001 field), but may be absent in unit tests. + if "recid" in self: + try: + url_result = _raw_urls(self, key, v) + if url_result and url_result not in rel_ids: + rel_ids.append(url_result) + except IgnoreKey: + pass + + if rel_ids: + self["related_identifiers"] = rel_ids + raise IgnoreKey("_oa_url") + + +@model.over("access_grants", "^506[1_]_") +@for_each_value +def access_grants(self, key, value): + """Translates access permissions (by user email or group name).""" + raw_identifier = value.get("d") or value.get("m") or value.get("a") + subject_identifier = StringValue(raw_identifier).parse() + if not subject_identifier: + raise IgnoreKey("access_grants") + + permission_type = "view" + return {str(subject_identifier): permission_type} + + @model.over("internal_notes", "^562__") @for_each_value def internal_notes(self, key, value): @@ -241,6 +432,39 @@ def organisation(self, key, value): } +# @model.over("_approvals", "^903__") +# @for_each_value +# def organisation(self, key, value): +# contributor = value.get("u", "") +# return { +# "person_or_org": { +# "type": "organizational", +# "name": contributor, +# }, +# "role": {"id": "hostinginstitution"}, +# } + + +@model.over("dates", "^925__") +@for_each_value +def date(self, key, value): + """Translates dates.""" + dates = self.get("dates", []) + valid = value.get("a") + if valid: + date = { + "date": valid, + "type": {"id": "submitted"}, + } + dates.append(date) + withdrawn = value.get("b", "") + if withdrawn and "9999" not in withdrawn: + date = {"date": withdrawn, "type": {"id": "other"}, "description": "completed"} + dates.append(date) + self["dates"] = dates + raise IgnoreKey("dates") + + @model.over("related_identifiers", "^962__") @for_each_value def related_identifiers(self, key, value): @@ -279,3 +503,149 @@ def related_identifiers(self, key, value): if recid and new_id not in rel_ids: return new_id raise IgnoreKey("related_identifiers") + + +@model.over("resource_type", "(^980__)|(^697C_)", override=True) +def resource_type(self, key, value): + """Translates resource_type.""" + value_a = 
value.get("a", "")
+    value_b = value.get("b", "")
+
+    ignore_res_types = [
+        "publarda",
+        "aleph_misc",
+        "opal_misc",
+        "l3_misc",
+        "delphi_misc",
+        "l3_papers",
+        "delphi_papers",
+        "opal_papers",
+        "aleph_papers",
+        "ps212_papers",
+    ]
+
+    committees = {
+        "scicommpubldrdc": "DRDC",
+        "scicommpubleec": "EEC",
+        "scicommpublemc": "EmC",
+        "scicommpublisc": "ISC",
+        "scicommpublisrc": "ISRC",
+        "scicommpublistc": "ISTC",
+        "scicommpubllepc": "LEPC",
+        "scicommpublnprc": "NPRC",
+        "scicommpublnsc": "NSC",
+        "scicommpublphi": "PH-I",
+        "scicommpublphiii": "PH-III",
+        "scicommpublpsc": "PSC",
+        "scicommpublpscc": "PSCC",
+        "scicommpublscc": "SCC",
+        "sc_and_ps_advisory_committee": "SC and PS Advisory Committee",
+        "scicommpublspsc": "SPSC",
+        "scicommpublspslc": "SPSLC",
+        "scicommpubltcc": "TCC",
+    }
+
+    if (value_a and value_a.lower() in committees) or (
+        value_b and value_b.lower() in committees
+    ):
+        # Ensure custom_fields exists on the record before writing into it.
+        custom_fields = self.setdefault("custom_fields", {})
+        comm_cf = custom_fields.get("cern:committees", [])
+        if value_a:
+            comm_cf.append({"id": committees[value_a.lower()]})
+        if value_b:
+            comm_cf.append({"id": committees[value_b.lower()]})
+        self["custom_fields"]["cern:committees"] = comm_cf
+        raise IgnoreKey("resource_type")
+    if (value_a and value_a.lower() in ignore_res_types) or (
+        value_b and value_b.lower() in ignore_res_types
+    ):
+        raise IgnoreKey("resource_type")
+
+    # first has highest priority
+    priority = {
+        v: i
+        for i, v in enumerate(
+            [
+                "conferencepaper",
+                "bookchapter",
+                "itcerntalk",
+                "antarescerntalk", "slides",
+                "article",
+                "preprint",
+                "intnotetspubl",
+                "intnoteitpubl",
+                "intnotealephpriv",
+                "intnoteeppubl",
+                "intnotehsepubl",
+                "note",
+                "lcd-notes",
+                "software",
+            ]
+        )
+    }
+    current = self.get("resource_type")
+
+    # Normalize both values (lowercase if not None)
+    candidates = []
+    if value_a:
+        candidates.append(value_a.lower())
+    if value_b:
+        candidates.append(value_b.lower())
+
+    if not candidates:
+        raise IgnoreKey("resource_type")  # nothing to decide on
+
+    # Select the candidate with the highest priority (lowest rank)
+    best_value = min(candidates, key=lambda v: priority.get(v, float("inf")))
+    rank = priority.get(best_value, float("inf"))
+
+    mapping = {
+        "preprint": {"id": "publication-preprint"},
+        "conferencepaper": {"id": "publication-conferencepaper"},
+        "article": {"id": "publication-article"},
+        "note": {"id": "publication-technicalnote"},
+        "lcd-notes": {"id": "publication-technicalnote"},
+        "brochure": {"id": "publication-brochure"},
+        "itcerntalk": {"id": "presentation"},
+        "antarescerntalk": {"id": "presentation"},
+        "slides": {"id": "presentation"},
+        "peri": {"id": "publication-periodical"},
+        "intnoteitpubl": {"id": "publication-technicalnote"},
+        "intnotealephpriv": {"id": "publication-technicalnote"},
+        "intnotetspubl": {"id": "publication-technicalnote"},
+        "intnoteeppubl": {"id": "publication-technicalnote"},
+        "intnotehsepubl": {"id": "publication-technicalnote"},
+        "bookchapter": {"id": "publication-section"},
+        "cnlissue": {"id": "publication-periodicalissue"},
+        "cnlarticle": {"id": "publication-periodicalarticle"},
+        "report": {"id": "publication-report"},
+        "book": {"id": "publication-book"},
+        "progress report": {"id": "publication-report"},
+        "poster": {"id": "poster"},
+        "software": {"id": "software"},
+    }
+
+    try:
+        # Probe the mapping; unmapped types fall through to the KeyError branch.
+        mapping[best_value]
+    except KeyError:
+        if key == "697C_" and ("lexi" in value_b.lower() or "lexi" in value_a.lower()):
+            subjects = self.get("subjects", [])
+            subjects.append({"subject": value_a if value_a else 
value_b}) + self["subjects"] = subjects + raise IgnoreKey("resource_type") + raise UnexpectedValue( + "Unknown resource type (Publications)", value=best_value, field=key + ) + + if current: + current_key = next((k for k, v in mapping.items() if v == current), None) + current_rank = priority.get(current_key, float("inf")) + + if rank < current_rank: + return mapping[best_value] + else: + raise IgnoreKey("resource_type") + else: + return mapping[best_value] diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/summer_student_report.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/summer_student_report.py index 4ddcac00..ebf6f03d 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/summer_student_report.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/summer_student_report.py @@ -146,3 +146,9 @@ def additional_descriptions(self, key, value): if _additional_description: return _additional_description raise IgnoreKey("additional_descriptions") + + +@model.over("resource_type", "^980__", override=True) +def resource_type(self, key, value): + """Translates resource type.""" + raise IgnoreKey("resource_type") diff --git a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/thesis.py b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/thesis.py index a2f31749..5b732179 100644 --- a/cds_migrator_kit/rdm/records/transform/xml_processing/rules/thesis.py +++ b/cds_migrator_kit/rdm/records/transform/xml_processing/rules/thesis.py @@ -296,7 +296,7 @@ def rec_affiliation(self, key, value): raise IgnoreKey("affiliations") -@model.over("collection", "^980__") +@model.over("resource_type", "^980__", override=True) @for_each_value def collection(self, key, value): col = value.get("a", "") @@ -315,7 +315,7 @@ def collection(self, key, value): subjects = self.get("subjects", []) subjects.append({"subject": f"collection:{colb.upper()}"}) self["subjects"] = subjects - raise IgnoreKey("collection") + raise IgnoreKey("resource_type") @model.over("related_identifiers", "^962_") diff --git a/cds_migrator_kit/rdm/streams.yaml b/cds_migrator_kit/rdm/streams.yaml index 87e52a0a..f649456b 100644 --- a/cds_migrator_kit/rdm/streams.yaml +++ b/cds_migrator_kit/rdm/streams.yaml @@ -1,115 +1,123 @@ db_uri: postgresql://cds-rdm:cds-rdm@localhost:5432/cds-rdm records: - it: - data_dir: cds_migrator_kit/rdm/data/it_dep - tmp_dir: cds_migrator_kit/rdm/tmp/it_dep - log_dir: cds_migrator_kit/rdm/log/it_dep + aleph: + data_dir: cds_migrator_kit/rdm/data/lep_exp/aleph + tmp_dir: cds_migrator_kit/rdm/tmp/lep_exp/aleph + log_dir: cds_migrator_kit/rdm/log/lep_exp/aleph extract: - dirpath: cds_migrator_kit/rdm/data/it_dep/dump/ + dirpath: cds_migrator_kit/rdm/data/lep_exp/aleph/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/it_dep/files/ + files_dump_dir: cds_migrator_kit/rdm/data/lep_exp/aleph/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "42da91cf-d476-4d09-9193-7841e03e271c" + - "7c568753-550b-4b48-8d76-461181973100" + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/it_dep/duplicated_pids.json - it_meetings: - data_dir: cds_migrator_kit/rdm/data/it_meetings - tmp_dir: cds_migrator_kit/rdm/tmp/it_meetings - log_dir: cds_migrator_kit/rdm/log/it_meetings + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/lep_exp/aleph/duplicated_pids.json + aleph_drafts: + data_dir: cds_migrator_kit/rdm/data/lep_exp/aleph_drafts + tmp_dir: 
cds_migrator_kit/rdm/tmp/lep_exp/aleph_drafts + log_dir: cds_migrator_kit/rdm/log/lep_exp/aleph_drafts restricted: "True" extract: - dirpath: cds_migrator_kit/rdm/data/it_meetings/dump/ + dirpath: cds_migrator_kit/rdm/data/lep_exp/aleph_drafts/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/it_meetings/files/ + files_dump_dir: cds_migrator_kit/rdm/data/lep_exp/aleph_drafts/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "42da91cf-d476-4d09-9193-7841e03e271c" + - "7c568753-550b-4b48-8d76-461181973100" load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/it_meetings/duplicated_pids.json - hr: - data_dir: cds_migrator_kit/rdm/data/hr - tmp_dir: cds_migrator_kit/rdm/tmp/hr - log_dir: cds_migrator_kit/rdm/log/hr + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/lep_exp/aleph_drafts/duplicated_pids.json + l3: + data_dir: cds_migrator_kit/rdm/data/lep_exp/l3 + tmp_dir: cds_migrator_kit/rdm/tmp/lep_exp/l3 + log_dir: cds_migrator_kit/rdm/log/lep_exp/l3 extract: - dirpath: cds_migrator_kit/rdm/data/hr/dump/ + dirpath: cds_migrator_kit/rdm/data/lep_exp/l3/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/hr/files/ + files_dump_dir: cds_migrator_kit/rdm/data/lep_exp/l3/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "77617386-632c-4b86-8dd2-68de77ae0018" + - "bb0ac2d2-b90b-498e-bf64-1986791d1032" + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr/duplicated_pids.json - hr_ccp: - data_dir: cds_migrator_kit/rdm/data/hr_ccp - tmp_dir: cds_migrator_kit/rdm/tmp/hr_ccp - log_dir: cds_migrator_kit/rdm/log/hr_ccp - restricted: "True" + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/lep_exp/l3/duplicated_pids.json + opal: + data_dir: cds_migrator_kit/rdm/data/lep_exp/opal + tmp_dir: cds_migrator_kit/rdm/tmp/lep_exp/opal + log_dir: cds_migrator_kit/rdm/log/lep_exp/opal extract: - dirpath: cds_migrator_kit/rdm/data/hr_ccp/dump/ + dirpath: cds_migrator_kit/rdm/data/lep_exp/opal/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/hr_ccp/files/ + files_dump_dir: cds_migrator_kit/rdm/data/lep_exp/opal/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "9e17f7a0-e742-4cb3-be6a-8fa84c4756e2" + - "473e34c5-4fe1-44fc-a4c2-3305cf6adcba" + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_ccp/duplicated_pids.json - hr_admin_circ: - data_dir: cds_migrator_kit/rdm/data/hr_admin_circ - tmp_dir: cds_migrator_kit/rdm/tmp/hr_admin_circ - log_dir: cds_migrator_kit/rdm/log/hr_admin_circ + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/lep_exp/opal/duplicated_pids.json + delphi: + data_dir: cds_migrator_kit/rdm/data/lep_exp/delphi + tmp_dir: cds_migrator_kit/rdm/tmp/lep_exp/delphi + log_dir: cds_migrator_kit/rdm/log/lep_exp/delphi extract: - dirpath: cds_migrator_kit/rdm/data/hr_admin_circ/dump/ + dirpath: cds_migrator_kit/rdm/data/lep_exp/delphi/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/hr_admin_circ/files/ + files_dump_dir: cds_migrator_kit/rdm/data/lep_exp/delphi/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "847852d1-b9de-4ebc-b302-57664dc4b6e7" + - "b6553d89-ea62-4a7c-9f5b-e76b5bfdb733" + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_admin_circ/duplicated_pids.json - e_guide: - data_dir: cds_migrator_kit/rdm/data/e_guide - tmp_dir: cds_migrator_kit/rdm/tmp/e_guide - log_dir: 
cds_migrator_kit/rdm/log/e_guide + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/lep_exp/delphi/duplicated_pids.json + delphi_priv: + data_dir: cds_migrator_kit/rdm/data/lep_exp/delphi_priv + tmp_dir: cds_migrator_kit/rdm/tmp/lep_exp/delphi_priv + log_dir: cds_migrator_kit/rdm/log/lep_exp/delphi_priv + restricted: "True" extract: - dirpath: cds_migrator_kit/rdm/data/e_guide/dump/ + dirpath: cds_migrator_kit/rdm/data/lep_exp/delphi_priv/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/e_guide/files/ + files_dump_dir: cds_migrator_kit/rdm/data/lep_exp/delphi_priv/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "d881ff3b-3a47-4cb3-a802-deaf577e2e35" + - "b6553d89-ea62-4a7c-9f5b-e76b5bfdb733" load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/e_guide/duplicated_pids.json - e_guide_restr: - data_dir: cds_migrator_kit/rdm/data/e_guide_restr - tmp_dir: cds_migrator_kit/rdm/tmp/e_guide_restr - log_dir: cds_migrator_kit/rdm/log/e_guide_restr - restricted: "True" - access_grants_view: - - cern-personnel + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/lep_exp/delphi_priv/duplicated_pids.json + committees: + data_dir: cds_migrator_kit/rdm/data/committees + tmp_dir: cds_migrator_kit/rdm/tmp/committees + log_dir: cds_migrator_kit/rdm/log/committees extract: - dirpath: cds_migrator_kit/rdm/data/e_guide_restr/dump/ + dirpath: cds_migrator_kit/rdm/data/committees/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/e_guide_restr/files/ + files_dump_dir: cds_migrator_kit/rdm/data/committees/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "d881ff3b-3a47-4cb3-a802-deaf577e2e35" - load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/e_guide_restr/duplicated_pids.json - hr_restricted: - data_dir: cds_migrator_kit/rdm/data/hr_restricted - tmp_dir: cds_migrator_kit/rdm/tmp/hr_restricted - log_dir: cds_migrator_kit/rdm/log/hr_restricted + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" + former_exp: + data_dir: cds_migrator_kit/rdm/data/former_experiments/public + tmp_dir: cds_migrator_kit/rdm/tmp/former_experiments/public + log_dir: cds_migrator_kit/rdm/log/former_experiments/public + extract: + dirpath: cds_migrator_kit/rdm/data/former_experiments/public/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/former_experiments/public/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" + former_exp_restr: + data_dir: cds_migrator_kit/rdm/data/former_experiments/restricted + tmp_dir: cds_migrator_kit/rdm/tmp/former_experiments/restricted + log_dir: cds_migrator_kit/rdm/log/former_experiments/restricted restricted: "True" extract: - dirpath: cds_migrator_kit/rdm/data/hr_restricted/dump/ + dirpath: cds_migrator_kit/rdm/data/former_experiments/restricted/dump/ transform: - files_dump_dir: cds_migrator_kit/rdm/data/hr_restricted/files/ + files_dump_dir: cds_migrator_kit/rdm/data/former_experiments/restricted/files/ missing_users: cds_migrator_kit/rdm/data/users communities_ids: - - "77617386-632c-4b86-8dd2-68de77ae0018" - load: - legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_restricted/duplicated_pids.json + - "c2c46ab3-5fb4-4d86-83c6-5d9dc8392d6f" fap: data_dir: cds_migrator_kit/rdm/data/fap tmp_dir: cds_migrator_kit/rdm/tmp/fap diff --git a/cds_migrator_kit/rdm/streams_done.yaml b/cds_migrator_kit/rdm/streams_done.yaml index fddf9c77..204ba337 100644 --- a/cds_migrator_kit/rdm/streams_done.yaml +++ b/cds_migrator_kit/rdm/streams_done.yaml @@ 
-53,3 +53,113 @@ records: - 35699521-ba4f-4214-bf47-da5df659af16 load: legacy_pids_to_redirect: cds_migrator_kit/rdm/data/bulletin_art/duplicated_pids.json + it: + data_dir: cds_migrator_kit/rdm/data/it_dep + tmp_dir: cds_migrator_kit/rdm/tmp/it_dep + log_dir: cds_migrator_kit/rdm/log/it_dep + extract: + dirpath: cds_migrator_kit/rdm/data/it_dep/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/it_dep/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "42da91cf-d476-4d09-9193-7841e03e271c" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/it_dep/duplicated_pids.json + it_meetings: + data_dir: cds_migrator_kit/rdm/data/it_meetings + tmp_dir: cds_migrator_kit/rdm/tmp/it_meetings + log_dir: cds_migrator_kit/rdm/log/it_meetings + restricted: "True" + extract: + dirpath: cds_migrator_kit/rdm/data/it_meetings/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/it_meetings/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "42da91cf-d476-4d09-9193-7841e03e271c" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/it_meetings/duplicated_pids.json + hr: + data_dir: cds_migrator_kit/rdm/data/hr + tmp_dir: cds_migrator_kit/rdm/tmp/hr + log_dir: cds_migrator_kit/rdm/log/hr + extract: + dirpath: cds_migrator_kit/rdm/data/hr/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/hr/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "77617386-632c-4b86-8dd2-68de77ae0018" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr/duplicated_pids.json + hr_ccp: + data_dir: cds_migrator_kit/rdm/data/hr_ccp + tmp_dir: cds_migrator_kit/rdm/tmp/hr_ccp + log_dir: cds_migrator_kit/rdm/log/hr_ccp + restricted: "True" + extract: + dirpath: cds_migrator_kit/rdm/data/hr_ccp/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/hr_ccp/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "9e17f7a0-e742-4cb3-be6a-8fa84c4756e2" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_ccp/duplicated_pids.json + hr_admin_circ: + data_dir: cds_migrator_kit/rdm/data/hr_admin_circ + tmp_dir: cds_migrator_kit/rdm/tmp/hr_admin_circ + log_dir: cds_migrator_kit/rdm/log/hr_admin_circ + extract: + dirpath: cds_migrator_kit/rdm/data/hr_admin_circ/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/hr_admin_circ/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "847852d1-b9de-4ebc-b302-57664dc4b6e7" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_admin_circ/duplicated_pids.json + e_guide: + data_dir: cds_migrator_kit/rdm/data/e_guide + tmp_dir: cds_migrator_kit/rdm/tmp/e_guide + log_dir: cds_migrator_kit/rdm/log/e_guide + extract: + dirpath: cds_migrator_kit/rdm/data/e_guide/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/e_guide/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "d881ff3b-3a47-4cb3-a802-deaf577e2e35" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/e_guide/duplicated_pids.json + e_guide_restr: + data_dir: cds_migrator_kit/rdm/data/e_guide_restr + tmp_dir: cds_migrator_kit/rdm/tmp/e_guide_restr + log_dir: cds_migrator_kit/rdm/log/e_guide_restr + restricted: "True" + access_grants_view: + - cern-personnel + extract: + dirpath: cds_migrator_kit/rdm/data/e_guide_restr/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/e_guide_restr/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - 
"d881ff3b-3a47-4cb3-a802-deaf577e2e35" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/e_guide_restr/duplicated_pids.json + hr_restricted: + data_dir: cds_migrator_kit/rdm/data/hr_restricted + tmp_dir: cds_migrator_kit/rdm/tmp/hr_restricted + log_dir: cds_migrator_kit/rdm/log/hr_restricted + restricted: "True" + extract: + dirpath: cds_migrator_kit/rdm/data/hr_restricted/dump/ + transform: + files_dump_dir: cds_migrator_kit/rdm/data/hr_restricted/files/ + missing_users: cds_migrator_kit/rdm/data/users + communities_ids: + - "77617386-632c-4b86-8dd2-68de77ae0018" + load: + legacy_pids_to_redirect: cds_migrator_kit/rdm/data/hr_restricted/duplicated_pids.json diff --git a/cds_migrator_kit/reports/log.py b/cds_migrator_kit/reports/log.py index df7597c1..c2d17645 100644 --- a/cds_migrator_kit/reports/log.py +++ b/cds_migrator_kit/reports/log.py @@ -117,6 +117,7 @@ def start_log(self): def read_log(self): """Read error log file.""" + csv.field_size_limit(10 * 1024 * 1024) # 10 MB with open(self.PROGRESS_LOG_FILEPATH, "r", newline="") as f: reader = csv.DictReader(f) for row in reader: @@ -142,7 +143,7 @@ def add_log(self, exc, record=None, key=None, value=None): "field": f"{getattr(exc, 'field', key)} {subfield}", "value": getattr(exc, "value", value), "stage": getattr(exc, "stage", None), - "message": getattr(exc, "message", str(exc)), + "message": getattr(exc, "message", None) or str(exc), "priority": getattr(exc, "priority", None), "clean": False, } diff --git a/cds_migrator_kit/runner/runner.py b/cds_migrator_kit/runner/runner.py index c2c71006..efe4bba9 100644 --- a/cds_migrator_kit/runner/runner.py +++ b/cds_migrator_kit/runner/runner.py @@ -32,7 +32,13 @@ def _read_config(self, filepath): return yaml.safe_load(f) def __init__( - self, stream_definitions, config_filepath, dry_run, collection, keep_logs + self, + stream_definitions, + config_filepath, + dry_run, + collection, + keep_logs, + workers=None, ): """Constructor.""" config = self._read_config(config_filepath) @@ -79,10 +85,16 @@ def __init__( **stream_config[collection].get("extract", {}) ) if definition.transform_cls: + transform_config = dict( + stream_config[collection].get("transform", {}) + ) + # CLI --workers takes precedence over streams.yaml workers + effective_workers = workers or transform_config.pop("workers", None) transform = definition.transform_cls( + workers=effective_workers, dry_run=dry_run, collection=collection, - **stream_config[collection].get("transform", {}), + **transform_config, restricted=self.restricted, access_grants_view=self.access_grants_view, migration_logger=self.migration_logger, @@ -109,6 +121,7 @@ def __init__( def run(self): """Run ETL streams.""" + self.migration_logger.start_log() self.record_state_logger.start_log() for stream in self.streams: diff --git a/cds_migrator_kit/transform/dumper.py b/cds_migrator_kit/transform/dumper.py index edc86c79..3cf09faa 100644 --- a/cds_migrator_kit/transform/dumper.py +++ b/cds_migrator_kit/transform/dumper.py @@ -43,7 +43,7 @@ def first_created(self): # modification datetime of first revision is the creation date of the whole record # this assumption is based on the hstRECORD dump from invenio-migrator module # for older records first revision is not the creation of record - # so we added creation_date field to dump and it's getting it from bibrec + # so we added creation_date field to dump and it's getting it from bibrec # https://github.com/inveniosoftware/invenio-migrator/blob/master/invenio_migrator/legacy/records.py#L216 return 
self.data["creation_date"] diff --git a/cds_migrator_kit/transform/xml_processing/quality/parsers.py b/cds_migrator_kit/transform/xml_processing/quality/parsers.py index e62e18bd..67a89bac 100644 --- a/cds_migrator_kit/transform/xml_processing/quality/parsers.py +++ b/cds_migrator_kit/transform/xml_processing/quality/parsers.py @@ -167,9 +167,17 @@ def _clean(value_to_clean): else: raise NotImplementedError except ValueError: - raise UnexpectedValue(subfield=subfield) + raise UnexpectedValue( + f"Cannot clean value to {var_type.__name__}", + subfield=subfield, + value=value_to_clean, + ) except TypeError: - raise UnexpectedValue(subfield=subfield) + raise UnexpectedValue( + f"Type error while cleaning value to {var_type.__name__}", + subfield=subfield, + value=value_to_clean, + ) except (UnexpectedValue, MissingRequiredField) as e: e.subfield = subfield e.message += str(force_list(value)) @@ -179,7 +187,11 @@ def _clean(value_to_clean): is_tuple = type(to_clean) is tuple if is_tuple and not multiple_values: - raise UnexpectedValue(subfield=subfield) + raise UnexpectedValue( + "Multiple values not allowed in subfield", + subfield=subfield, + value=to_clean, + ) if multiple_values: if is_tuple: diff --git a/cds_migrator_kit/transform/xml_processing/rules/base.py b/cds_migrator_kit/transform/xml_processing/rules/base.py index 56bae44d..f4de40e3 100644 --- a/cds_migrator_kit/transform/xml_processing/rules/base.py +++ b/cds_migrator_kit/transform/xml_processing/rules/base.py @@ -146,12 +146,12 @@ def process_contributors(key, value, orcid_subfield="k"): return contributor -@model.over("creators", "^100__") +@model.over("creators", "(^100__)|(^720__)") @for_each_value @require(["a"]) def creators(self, key, value): """Translates the creators field.""" - return process_contributors(key, value) + return process_contributors(key, value, orcid_subfield="j") @model.over("contributors", "^700__") @@ -159,7 +159,7 @@ def creators(self, key, value): @require(["a"]) def contributors(self, key, value): """Translates contributors.""" - return process_contributors(key, value) + return process_contributors(key, value, orcid_subfield="j") @model.over("submitter", "(^859__)") diff --git a/cds_migrator_kit/videos/scripts/run_failed_tasks.py b/cds_migrator_kit/videos/scripts/run_failed_tasks.py index 9bdf8a05..7960721f 100644 --- a/cds_migrator_kit/videos/scripts/run_failed_tasks.py +++ b/cds_migrator_kit/videos/scripts/run_failed_tasks.py @@ -94,10 +94,12 @@ def find_succeded_tasks(deposit_id): def run_metatadata_task(failed_tasks, flow, deposit_id, record_id): task_names = [task[0] for task in failed_tasks] deposit = deposit_video_resolver(deposit_id) - + if ExtractMetadataTask.name not in task_names: if deposit["_deposit"]["status"] == "draft": - log_success(f"ExtractMetadataTask not failed and deposit already in draft for record {record_id}.") + log_success( + f"ExtractMetadataTask not failed and deposit already in draft for record {record_id}." 
+ ) return True return False @@ -242,7 +244,7 @@ def rerun_chapters_task(deposit_id, record_id, flow_id): flow = FlowMetadata.get_by_deposit(deposit_id) flow_id = flow.id payload = flow.payload.copy() - + task = next(t for t in flow.tasks if t.name == ExtractChapterFramesTask.name) # Determine if ExtractChapterFramesTask needs to run @@ -297,6 +299,7 @@ def load_record_ids(record_states_file_path): # MAIN METHODS # --------------------------- + # WEBLECTURES AYNC TASKS RUNNER def weblectures_tasks_runner(): global SUCCESS_LOG_PATH, ERROR_LOG_PATH @@ -319,11 +322,11 @@ def weblectures_tasks_runner(): # Records states file created during migration record_states_file_path = "rdm_records_state.json" - + all_record_ids = load_record_ids(record_states_file_path) - + # Run in batches - record_ids = all_record_ids[:1000] # any subset + record_ids = all_record_ids[:1000] # any subset total = len(record_ids) deposits_to_republish = [] @@ -338,11 +341,13 @@ def weblectures_tasks_runner(): log_success(f"No failed tasks found for record {record_id}.") else: task_names = [task[0] for task in failed_tasks] - republish_needed = run_metatadata_task(failed_tasks, flow, deposit_id, record_id) + republish_needed = run_metatadata_task( + failed_tasks, flow, deposit_id, record_id + ) if republish_needed: deposits_to_republish.append((deposit_id, record_id)) - # !! Make sure metadata tasks finished!! Run remaining tasks (frames/transcoding) async + # !! Make sure metadata tasks finished!! Run remaining tasks (frames/transcoding) async for i, record_id in enumerate(record_ids, start=1): log_success(f"Processing {i}/{total} record: {record_id}") record = record_video_resolver(record_id) @@ -351,15 +356,24 @@ def weblectures_tasks_runner(): flow, failed_tasks = find_failed_tasks(deposit_id) task_names = [task[0] for task in failed_tasks] if ExtractMetadataTask.name in task_names: - log_error(f"ERROR: Record: {record_id} still has failed ExtractMetadataTask. Skipping further processing.") + log_error( + f"ERROR: Record: {record_id} still has failed ExtractMetadataTask. Skipping further processing." + ) continue if not failed_tasks: log_success(f"No failed tasks found for record {record_id}.") else: log_success(f"Re-running failed tasks: {task_names}") - republish_needed = run_failed_tasks(failed_tasks, flow, deposit_id, record_id) - if republish_needed and (deposit_id, record_id) not in deposits_to_republish: - print(f"Adding deposit {deposit_id} for record {record_id} to republish list.") + republish_needed = run_failed_tasks( + failed_tasks, flow, deposit_id, record_id + ) + if ( + republish_needed + and (deposit_id, record_id) not in deposits_to_republish + ): + print( + f"Adding deposit {deposit_id} for record {record_id} to republish list." + ) deposits_to_republish.append((deposit_id, record_id)) # Re-publish records @@ -368,7 +382,9 @@ def weblectures_tasks_runner(): log_success(f"Processing publish {i}/{total_publish} record: {record_id}") deposit = deposit_video_resolver(deposit_id) if deposit["_deposit"]["status"] == "published": - log_success(f"Deposit already published for record {record_id}. Skipping publish.") + log_success( + f"Deposit already published for record {record_id}. Skipping publish." 
+ ) # continue flow, succeded_tasks = find_succeded_tasks(deposit_id) task_names = [task[0] for task in succeded_tasks] @@ -382,12 +398,14 @@ def weblectures_tasks_runner(): src_bucket=deposit.bucket, dst_bucket=record["_buckets"]["record"], ) - + if metadata_task_succeded and frames_task_succeded: deposit.publish(extract_chapters=False).commit() log_success(f"Deposit {deposit_id} published for record {record_id}.") else: - log_error(f"ERROR: Record: {record_id} has failed tasks: ExtractMetadataTask: {metadata_task_succeded} ExtractFramesTask: {frames_task_succeded}") + log_error( + f"ERROR: Record: {record_id} has failed tasks: ExtractMetadataTask: {metadata_task_succeded} ExtractFramesTask: {frames_task_succeded}" + ) deposit.publish(extract_chapters=False).commit() db.session.commit() @@ -414,11 +432,11 @@ def weblectures_chapters_async_runner(): # Records states file created during migration record_states_file_path = "rdm_records_state.json" - + all_record_ids = load_record_ids(record_states_file_path) # Run in batches - record_ids = all_record_ids[:2000] # any subset + record_ids = all_record_ids[:2000] # any subset total = len(record_ids) # Run chapters tasks async @@ -455,7 +473,7 @@ def weblectures_check_task_status(): "/tmp/weblectures_6_rdm_records_state.json", "/tmp/weblectures_7_rdm_records_state.json", "/tmp/last_lectures_rdm_records_state.json", - ] + ] all_record_ids = [] for record_states_file_path in record_states_file_paths: @@ -465,9 +483,9 @@ def weblectures_check_task_status(): record_ids = all_record_ids total = len(record_ids) - failed_deposits = [] # stores (record_id, [task_names]) - failed_chapters_records = [] # stores record_ids with failed chapters task - draft_deposits = [] # stores (record_id, deposit_id) + failed_deposits = [] # stores (record_id, [task_names]) + failed_chapters_records = [] # stores record_ids with failed chapters task + draft_deposits = [] # stores (record_id, deposit_id) for i, record_id in enumerate(record_ids, start=1): if i % 100 == 0: @@ -519,7 +537,7 @@ def weblectures_check_task_status(): log_success(f"Republished record {record_id}.") db.session.commit() - # run chapters again + # run chapters again total = len(failed_chapters_records) for i, record_id in enumerate(failed_chapters_records, start=1): log_success(f"Processing {i}/{total} record: {record_id}") diff --git a/setup.cfg b/setup.cfg index 493dceca..07732861 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ python_requires = >=3.9 zip_safe = False install_requires = sentry-sdk>=1.45,<2.0.0 - cds-dojson>=0.12.0 + cds-dojson @ git+https://github.com/CERNDocumentServer/cds-dojson@master#egg=cds-dojson invenio-rdm-migrator>=5.0.0 lxml>=4.6.5 ipython!=8.1.0 @@ -31,7 +31,7 @@ install_requires = [options.extras_require] rdm = - invenio-app-rdm[opensearch2]==14.0.0b8.dev0 + invenio-app-rdm[opensearch2]==14.0.0b10.dev4 cds-rdm @ git+https://github.com/CERNDocumentServer/cds-rdm@master#egg=cds-rdm&subdirectory=site invenio-preservation-sync==0.3.0 invenio-cern-sync @ git+https://github.com/cerndocumentserver/invenio-cern-sync@v0.6.0#egg=invenio-cern-sync @@ -66,6 +66,9 @@ cds_migrator_kit.migrator.users.model = cds_migrator_kit.migrator.submitter.model = submitter = cds_migrator_kit.rdm.users.transform.xml_processing.models.submitter:submitter_model cds_migrator_kit.migrator.models = + lep_exp = cds_migrator_kit.rdm.records.transform.models.research:research_model + research_comm_model = cds_migrator_kit.rdm.records.transform.models.research_committee:research_comm_model + fap 
= cds_migrator_kit.rdm.records.transform.models.fap:fap_model ssn = cds_migrator_kit.rdm.records.transform.models.summer_student_report:sspn_model thesis = cds_migrator_kit.rdm.records.transform.models.thesis:thesis_model cms_note = cds_migrator_kit.rdm.records.transform.models.note:cms_note_model @@ -82,7 +85,7 @@ cds_migrator_kit.migrator.models = te = cds_migrator_kit.rdm.records.transform.models.te:te_model en = cds_migrator_kit.rdm.records.transform.models.en:en_model annual_rep = cds_migrator_kit.rdm.records.transform.models.annual_report:annual_rep_model - fap = cds_migrator_kit.rdm.records.transform.models.fap:fap_model + cds_migrator_kit.migrator.rules.base = base = cds_migrator_kit.transform.xml_processing.rules.base cds_migrator_kit.migrator.rdm.rules.base = diff --git a/tests/cds-rdm/conftest.py b/tests/cds-rdm/conftest.py index 68f96dcd..60b38905 100644 --- a/tests/cds-rdm/conftest.py +++ b/tests/cds-rdm/conftest.py @@ -878,6 +878,28 @@ def experiments_v(app, exp_type): "type": "experiments", }, ) + vocab = vocabulary_service.create( + system_identity, + { + "id": "ALICE", + "title": { + "en": "ALICE", + }, + "props": {"link": "http://lhcb.web.cern.ch/lhcb/"}, + "type": "experiments", + }, + ) + vocab = vocabulary_service.create( + system_identity, + { + "id": "ATLAS", + "title": { + "en": "ATLAS", + }, + "props": {"link": "http://lhcb.web.cern.ch/lhcb/"}, + "type": "experiments", + }, + ) return vocab diff --git a/tests/cds-rdm/data/thesis/dump/test_records.json b/tests/cds-rdm/data/thesis/dump/test_records.json index 496515a7..70903741 100644 --- a/tests/cds-rdm/data/thesis/dump/test_records.json +++ b/tests/cds-rdm/data/thesis/dump/test_records.json @@ -42,7 +42,7 @@ "creation_date": "2025-04-29 08:25:01", "record": [ { - "marcxml": "\n 2742366\n SzGeCERN\n 20250429102501.0\n \n 9783030903756\n print version\n \n \n 9783030903763\n electronic version\n \n \n oai:cds.cern.ch:2742366\n cerncds:FULLTEXT\n cerncds:THESES\n cerncds:CERN:FULLTEXT\n INIS\n cerncds:CERN\n \n \n DOI\n 10.1007/978-3-030-90376-3\n publication\n \n \n DOI\n 10.3204/PUBDB-2020-02655\n \n \n Inspire\n 1807850\n \n \n DESY-THESIS-2020-016\n \n \n eng\n \n \n CMS-TS-2020-019\n \n \n CERN-THESIS-2020-148\n \n \n Defranchis, Matteo M.\n XX\n YY\n Hamburg U.\n \n \n First measurement of the running of the top quark mass\n \n \n Cham\n Springer\n 2020\n \n \n 189 p\n \n \n presented on 30 Apr 2020\n \n \n PhD\n 2020\n \n \n In this thesis, the first experimental determination of the running of the top quarkmass is presented. The running is extracted from a measurement of the differential top quark-antiquark ($t$$\\bar{t}$) production cross section as a function of the invariant mass of the $t$$\\bar{t}$ system, $m$$_{t\\bar{t}}$. The analysis is performed using proton-proton collision data recorded by the CMS detector at the CERN LHC in 2016, corresponding to an integrated luminosity of 35.9 fb$^{\u22121}$ . Candidate $t$$\\bar{t}$ events are selected in the final state with an electron and a muon of opposite charge, and the differential cross section d$\\sigma$$_{t\\bar{t}}$/d$m$$_{t\\bar{t}}$ is determined at the parton level by means of a maximum-likelihood fit to multidifferential final-state distributions. 
The value of the top quark mass in the modified minimal subtraction ($\\overline{MS}$) renormalization scheme, $m$$_{t}$($\\mu$), is determined as a function of the scale $\\mu$ = $m$$_{t\\bar{t}}$ by comparing the measured d$\\sigma$$_{t\\bar{t}}$/d$m$$_{t\\bar{t}}$ to theoretical predictions at next-to-leading order, and the resulting scale dependence is interpreted as the running of the top quark mass. The observed running is found to be compatible with the one-loop solution of the corresponding renormalization group equation, up to a scale of the order of 1 TeV.\n \n \n Springer\n n this thesis, the first measurement of the running of the top quark mass is presented. This is a fundamental quantum effect that had never been studied before. Any deviation from the expected behaviour can be interpreted as a hint of the presence of physics beyond the Standard Model. All relevant aspects of the analysis are extensively described and documented. This thesis also describes a simultaneous measurement of the inclusive top quark-antiquark production cross section and the top quark mass in the simulation. The measured cross section is also used to precisely determine the values of the top quark mass and the strong coupling constant by comparing to state-of-the-art theoretical predictions. All the theoretical and experimental aspects relevant to the results presented in this thesis are discussed in the initial chapters in a concise but complete way, which makes the material accessible to a wider audience.\n \n \n CERN EDS\n \n \n ILSLINK\n \n \n SzGeCERN\n Detectors and Experimental Techniques\n \n \n CMS\n TOP\n \n \n THESIS\n \n \n CERN\n \n \n PUBLCMS\n \n \n CERN LHC\n CMS\n \n \n Lipka, Ekaterina\n dir.\n INSPIRE-00185340\n CCID-721143\n DESY\n \n \n PH\n \n \n 2254427\n 22128655\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.pdf\n Fulltext\n \n \n 2254427\n 31881\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.jpg?subformat=icon-700\n icon-700\n Fulltext\n \n \n 2254427\n 3645\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.gif?subformat=icon\n icon\n Fulltext\n \n \n 2254427\n 4604\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.jpg?subformat=icon-180\n icon-180\n Fulltext\n \n \n n\n 202043\n \n \n 14\n \n \n PUBLIC\n \n \n THESIS\n \n", + "marcxml": "\n 2742366\n SzGeCERN\n 20250429102501.0\n \n 9783030903756\n print version\n \n \n 9783030903763\n electronic version\n \n \n oai:cds.cern.ch:2742366\n cerncds:FULLTEXT\n cerncds:THESES\n cerncds:CERN:FULLTEXT\n INIS\n cerncds:CERN\n \n \n DOI\n 10.1007/978-3-030-90376-3\n publication\n \n \n DOI\n 10.3204/PUBDB-2020-02655\n \n \n Inspire\n 1807850\n \n \n DESY-THESIS-2020-016\n \n \n eng\n \n \n CMS-TS-2020-019\n \n \n CERN-THESIS-2020-148\n \n \n Defranchis, Matteo M.\n XX\n Hamburg U.\n \n \n First measurement of the running of the top quark mass\n \n \n Cham\n Springer\n 2020\n \n \n 189 p\n \n \n presented on 30 Apr 2020\n \n \n PhD\n 2020\n \n \n In this thesis, the first experimental determination of the running of the top quarkmass is presented. The running is extracted from a measurement of the differential top quark-antiquark ($t$$\\bar{t}$) production cross section as a function of the invariant mass of the $t$$\\bar{t}$ system, $m$$_{t\\bar{t}}$. The analysis is performed using proton-proton collision data recorded by the CMS detector at the CERN LHC in 2016, corresponding to an integrated luminosity of 35.9 fb$^{\u22121}$ . 
Candidate $t$$\\bar{t}$ events are selected in the final state with an electron and a muon of opposite charge, and the differential cross section d$\\sigma$$_{t\\bar{t}}$/d$m$$_{t\\bar{t}}$ is determined at the parton level by means of a maximum-likelihood fit to multidifferential final-state distributions. The value of the top quark mass in the modified minimal subtraction ($\\overline{MS}$) renormalization scheme, $m$$_{t}$($\\mu$), is determined as a function of the scale $\\mu$ = $m$$_{t\\bar{t}}$ by comparing the measured d$\\sigma$$_{t\\bar{t}}$/d$m$$_{t\\bar{t}}$ to theoretical predictions at next-to-leading order, and the resulting scale dependence is interpreted as the running of the top quark mass. The observed running is found to be compatible with the one-loop solution of the corresponding renormalization group equation, up to a scale of the order of 1 TeV.\n \n \n Springer\n n this thesis, the first measurement of the running of the top quark mass is presented. This is a fundamental quantum effect that had never been studied before. Any deviation from the expected behaviour can be interpreted as a hint of the presence of physics beyond the Standard Model. All relevant aspects of the analysis are extensively described and documented. This thesis also describes a simultaneous measurement of the inclusive top quark-antiquark production cross section and the top quark mass in the simulation. The measured cross section is also used to precisely determine the values of the top quark mass and the strong coupling constant by comparing to state-of-the-art theoretical predictions. All the theoretical and experimental aspects relevant to the results presented in this thesis are discussed in the initial chapters in a concise but complete way, which makes the material accessible to a wider audience.\n \n \n CERN EDS\n \n \n ILSLINK\n \n \n SzGeCERN\n Detectors and Experimental Techniques\n \n \n CMS\n TOP\n \n \n THESIS\n \n \n CERN\n \n \n PUBLCMS\n \n \n CERN LHC\n CMS\n \n \n Lipka, Ekaterina\n dir.\n INSPIRE-00185340\n CCID-721143\n DESY\n \n \n PH\n \n \n 2254427\n 22128655\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.pdf\n Fulltext\n \n \n 2254427\n 31881\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.jpg?subformat=icon-700\n icon-700\n Fulltext\n \n \n 2254427\n 3645\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.gif?subformat=icon\n icon\n Fulltext\n \n \n 2254427\n 4604\n http://cds.cern.ch/record/2742366/files/CERN THESIS 2020 148.jpg?subformat=icon-180\n icon-180\n Fulltext\n \n \n n\n 202043\n \n \n 14\n \n \n PUBLIC\n \n \n THESIS\n \n", "json": null, "modification_datetime": "2025-04-29T08:25:01+00:00" } @@ -241,7 +241,7 @@ "creation_date": "2013-07-10 03:31:16", "record": [ { - "marcxml": "\n 2316709\n SzGeCERN\n 20190603170930.0\n \n oai:cds.cern.ch:2316709\n cerncds:FULLTEXT\n cerncds:THESES\n cerncds:CERN:FULLTEXT\n INIS\n cerncds:CERN\n \n \n AIDA-2020-THESIS-2018-001\n \n \n eng\n \n \n CERN-THESIS-2018-042\n \n \n Paulitsch, Peter\n Vienna, Tech. U.\n \n \n Performance Studies of Silicon Strip Sensors for the Phase-II Upgrade of the CMS Tracker\n \n \n Vienna\n Vienna, Tech. U.\n 2018\n \n \n 72 p\n \n \n Presented 27 Apr 2018\n \n \n Diploma\n Vienna, Tech. U.\n 2018\n \n \n In 2025, the LHC (Large Hadron Collider) will be upgraded to the High-Luminosity\nLHC. The luminosity will be enhanced by a factor of 5 to 10, up to 10^35 cm^-2s^-1. 
This\nleads to new challenges for experiments such as the Compact Muon Solenoid (CMS),\nwhich is already afflicted by aging effects (radiation damages). Therefore the currently\ninstalled silicon sensors of the track detector (\"Tracker\") have to be replaced, furthermore\nto carry out higher radiation doses (through raised collision rates) and increased\ndata rates. The prototypes of the new sensors are provided by the vendors Infineon and\nHamamatsu. These have to be qualified for application by institutes like the Institute\nof High Energy Physics (HEPHY).\nFor this diploma thesis, I did testbeam measurements on these sensors using protons (64\nto 252MeV) at MedAustron and electrons (5.6 GeV) at Deutsches Elektronen-Synchrotron\n(DESY), analyzed the data and utilized performance and quality evaluation. These\nmethods include IV characteristics, noise contribution, cluster analysis, beam profile\nmeasurement, efficiency and energy measurements. In preparation for the testbeams,\nI tested new trigger scintillators to determine dark rates and efficiency and the strip\nsensor system using a radioactive source and a laser test stand at the HEPHY.\nAt the MedAustron\u2019s first testbeam, high particle rates (up to 10^10/s) exceeded the sensor\nsystem\u2019s processing rate. Occupancy and pile-up effects dominated the signal and\ndistorted measured energy depositions. During the testbeam, the bias voltage supply\nof the strip sensor showed compliance, leading to voltage drops. After changes made\nto the accelerator by MedAustron staff, lower particle rates (10^5/s) were available at\nthe second testbeam. These actions, complemented by optimizations in the setup, lead\nto stable power supply and analysis showed excellent conformity of measured stopping\npower to reference data.\nProspective testbeams require extensive preparations in terms of functionality tests,\nstandardization and simulation in advance to identify design flaws. For achieving better\nenergy resolution in future, well-defined particle rate control by MedAustron is essential,\nas well as high time-resolved monitoring the current consumption of the sensor. If\nthere is a demand for low-energy testbeams, it is essential to analyze the non-linear gain\nbehavior in the upper energy deposition range of the ALiBaVa system. Based on that,\none may eventually extend the analysis software algorithm. Further procedures should\ncover protection against electromagnetic interference. Perhaps it will be possible to find\nan appropriate model to characterize electronic noise contribution to improve SNR.\n \n \n FP7\n 654168\n AIDA-2020\n openAccess\n \n \n CERN Invenio WebSubmit\n GENEU\n 0.1\n \n \n SzGeCERN\n Detectors and Experimental Techniques\n \n \n AIDA-2020\n 2: Innovation and outreach\n WP\n \n \n AIDA-2020\n \n \n AIDA-2020THESIS\n \n \n CERN\n \n \n CERN HL-LHC\n CMS\n \n \n Schwanda, Christoph\n dir.\n Vienna, Tech. U.\n \n \n Bergauer, Thomas\n dir.\n Vienna, Tech. 
U.\n \n \n 1401872\n 7644879\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.pdf\n \n \n 1401872\n 10988\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.jpg?subformat=icon-\n icon-\n \n \n 1401872\n 11025101\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.pdf?subformat=pdfa\n pdfa\n \n \n 1401872\n 7400\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.gif?subformat=icon\n icon\n \n \n livia.lapadatescu@cern.ch\n \n \n n\n 201817\n Y\n \n \n PUBLIC\n \n \n THESIS\n AIDA-2020\n \n", + "marcxml": "\n 2316709\n SzGeCERN\n 20190603170930.0\n \n oai:cds.cern.ch:2316709\n cerncds:FULLTEXT\n cerncds:THESES\n cerncds:CERN:FULLTEXT\n INIS\n cerncds:CERN\n \n \n AIDA-2020-THESIS-2018-001\n \n \n eng\n \n \n CERN-THESIS-2018-042\n \n \n Paulitsch, Peter\n Vienna, Tech. U.\n \n \n Performance Studies of Silicon Strip Sensors for the Phase-II Upgrade of the CMS Tracker\n \n \n Vienna\n Vienna, Tech. U.\n 2018\n \n \n 72 p\n \n \n Presented 27 Apr 2018\n \n \n Diploma\n Vienna, Tech. U.\n 2018\n \n \n In 2025, the LHC (Large Hadron Collider) will be upgraded to the High-Luminosity\nLHC. The luminosity will be enhanced by a factor of 5 to 10, up to 10^35 cm^-2s^-1. This\nleads to new challenges for experiments such as the Compact Muon Solenoid (CMS),\nwhich is already afflicted by aging effects (radiation damages). Therefore the currently\ninstalled silicon sensors of the track detector (\"Tracker\") have to be replaced, furthermore\nto carry out higher radiation doses (through raised collision rates) and increased\ndata rates. The prototypes of the new sensors are provided by the vendors Infineon and\nHamamatsu. These have to be qualified for application by institutes like the Institute\nof High Energy Physics (HEPHY).\nFor this diploma thesis, I did testbeam measurements on these sensors using protons (64\nto 252MeV) at MedAustron and electrons (5.6 GeV) at Deutsches Elektronen-Synchrotron\n(DESY), analyzed the data and utilized performance and quality evaluation. These\nmethods include IV characteristics, noise contribution, cluster analysis, beam profile\nmeasurement, efficiency and energy measurements. In preparation for the testbeams,\nI tested new trigger scintillators to determine dark rates and efficiency and the strip\nsensor system using a radioactive source and a laser test stand at the HEPHY.\nAt the MedAustron\u2019s first testbeam, high particle rates (up to 10^10/s) exceeded the sensor\nsystem\u2019s processing rate. Occupancy and pile-up effects dominated the signal and\ndistorted measured energy depositions. During the testbeam, the bias voltage supply\nof the strip sensor showed compliance, leading to voltage drops. After changes made\nto the accelerator by MedAustron staff, lower particle rates (10^5/s) were available at\nthe second testbeam. These actions, complemented by optimizations in the setup, lead\nto stable power supply and analysis showed excellent conformity of measured stopping\npower to reference data.\nProspective testbeams require extensive preparations in terms of functionality tests,\nstandardization and simulation in advance to identify design flaws. For achieving better\nenergy resolution in future, well-defined particle rate control by MedAustron is essential,\nas well as high time-resolved monitoring the current consumption of the sensor. 
If\nthere is a demand for low-energy testbeams, it is essential to analyze the non-linear gain\nbehavior in the upper energy deposition range of the ALiBaVa system. Based on that,\none may eventually extend the analysis software algorithm. Further procedures should\ncover protection against electromagnetic interference. Perhaps it will be possible to find\nan appropriate model to characterize electronic noise contribution to improve SNR.\n \n \n FP7\n 654168\n AIDA-2020\n openAccess\n \n \n CERN Invenio WebSubmit\n GENEU\n 0.1\n \n \n SzGeCERN\n Detectors and Experimental Techniques\n \n \n AIDA-2020\n 2: Innovation and outreach\n WP\n \n \n AIDA-2020\n \n \n AIDA-2020THESIS\n \n \n CERN\n \n \n CERN HL-LHC\n CMS\n \n \n Schwanda, Christoph\n dir.\n Vienna, Tech. U.\n \n \n Bergauer, Thomas\n dir.\n Vienna, Tech. U.\n \n \n 1401872\n 7644879\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.pdf\n \n \n 1401872\n 10988\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.jpg?subformat=icon-\n icon-\n \n \n 1401872\n 11025101\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.pdf?subformat=pdfa\n pdfa\n \n \n 1401872\n 7400\n http://cds.cern.ch/record/2316709/files/AIDA-2020-THESIS-2018-001.gif?subformat=icon\n icon\n \n \n submitter10@gmail.com\n \n \n n\n 201817\n Y\n \n \n PUBLIC\n \n \n THESIS\n AIDA-2020\n \n", "json": null, "modification_datetime": "2019-06-03T15:09:30+00:00" } diff --git a/tests/cds-rdm/test_base_migration.py b/tests/cds-rdm/test_base_migration.py index 4aff7228..d48e6ac2 100644 --- a/tests/cds-rdm/test_base_migration.py +++ b/tests/cds-rdm/test_base_migration.py @@ -173,9 +173,7 @@ def test_corporate_author_basic(self): "person_or_org": { "type": "organizational", "name": "CERN", - "family_name": "CERN", }, - "role": {"id": "hostinginstitution"}, } ] diff --git a/tests/cds-rdm/test_comments_migration.py b/tests/cds-rdm/test_comments_migration.py index 3c2e80e5..bffce35c 100644 --- a/tests/cds-rdm/test_comments_migration.py +++ b/tests/cds-rdm/test_comments_migration.py @@ -296,7 +296,7 @@ def test_migrate_comments_from_metadata( assert request_result.total == 4 -def test_migrate_comments_dry_run(temp_dir): +def test_migrate_comments_dry_run(temp_dir, test_app, db): """Test migrating comments in dry-run mode.""" # Create directory structure for attached files comments_dir = os.path.join(os.path.dirname(__file__), "data", "comments") diff --git a/tests/cds-rdm/test_full_migration.py b/tests/cds-rdm/test_full_migration.py index 1dd620ed..2971e5f1 100644 --- a/tests/cds-rdm/test_full_migration.py +++ b/tests/cds-rdm/test_full_migration.py @@ -85,7 +85,13 @@ def suite_multi_field(record): "family_name": "Juste", "identifiers": [{"identifier": "2675934", "scheme": "cds"}], } - } + }, + { + "person_or_org": { + "name": "RP collaboration", + "type": "organizational", + }, + }, ] assert dict_rec["metadata"]["publication_date"] == "2018-08-02" assert ( @@ -142,13 +148,6 @@ def suite_multi_field(record): "role": {"id": "other", "title": {"en": "Other"}}, "affiliations": [{"name": "CERN"}], }, - { - "person_or_org": {"type": "organizational", "name": "RP collaboration"}, - "role": { - "id": "hostinginstitution", - "title": {"en": "Hosting institution"}, - }, - }, { "person_or_org": { "type": "personal", @@ -195,7 +194,13 @@ def orcid_id(record, orcid_name_data): {"identifier": "0009-0007-7638-4652", "scheme": "orcid"}, ], } - } + }, + { + "person_or_org": { + "name": "CERN EP Department", + "type": "organizational", + }, + }, ] 
name_from_db = NamesMetadata.query.filter_by(pid=orcid_name_data["id"]).one() @@ -347,9 +352,10 @@ def custom_affiliation(record): """2041388.""" dict_rec = record.to_dict() for creator in dict_rec["metadata"]["creators"]: - assert "affiliations" in creator - for affiliation in creator["affiliations"]: - assert "ror" != affiliation.get("scheme", None) + if creator["person_or_org"]["type"] == "personal": + assert "affiliations" in creator + for affiliation in creator["affiliations"]: + assert "ror" != affiliation.get("scheme", None) def contains_aleph(record): @@ -406,18 +412,6 @@ def author_with_inspire(record): {"name": "Universitat Autonoma de Barcelona ES"}, ], }, - { - "person_or_org": { - "name": "CERN. Geneva. EP Department", - "type": "organizational", - }, - "role": { - "id": "hostinginstitution", - "title": { - "en": "Hosting institution", - }, - }, - }, ] diff --git a/tests/cds-rdm/test_json_translation_rules.py b/tests/cds-rdm/test_json_translation_rules.py index b3f3cfdf..38ce09f2 100644 --- a/tests/cds-rdm/test_json_translation_rules.py +++ b/tests/cds-rdm/test_json_translation_rules.py @@ -42,7 +42,13 @@ def test_migrate_sspn_record(datadir, base_app): {"identifier": "1111", "scheme": "cern"}, ], } - } + }, + { + "person_or_org": { + "type": "organizational", + "name": "CERN EP Department", + } + }, ], "title": "Deep Learning Methods for Particle Reconstruction in the HGCal", "publisher": "CERN", @@ -71,14 +77,6 @@ def test_migrate_sspn_record(datadir, base_app): }, }, "contributors": [ - { - "person_or_org": { - "type": "organizational", - "name": "CERN. Geneva. EP Department", - "family_name": "CERN. Geneva. EP Department", - }, - "role": {"id": "hostinginstitution"}, - }, { "person_or_org": { "type": "personal", @@ -140,7 +138,13 @@ def test_migrate_record_all_fields(datadir, base_app): {"identifier": "81111", "scheme": "cern"}, ], } - } + }, + { + "person_or_org": { + "type": "organizational", + "name": "RP collaboration", + } + }, ], "title": "FLUKA and ActiWiz benchmark on BDF materials", "additional_descriptions": [ @@ -222,14 +226,6 @@ def test_migrate_record_all_fields(datadir, base_app): "role": {"id": "other"}, "affiliations": ["CERN"], }, - { - "person_or_org": { - "type": "organizational", - "name": "RP collaboration", - "family_name": "RP collaboration", - }, - "role": {"id": "hostinginstitution"}, - }, { "person_or_org": { "type": "personal", diff --git a/tests/cds-rdm/test_publications_rules.py b/tests/cds-rdm/test_publications_rules.py index 261e98f8..1df91c85 100644 --- a/tests/cds-rdm/test_publications_rules.py +++ b/tests/cds-rdm/test_publications_rules.py @@ -17,6 +17,9 @@ isbn, issn, journal, + oa_level_from_annual_report, + oa_level_from_license, + oa_level_from_url, udc, ) @@ -356,3 +359,292 @@ def test_journal_issue_without_volume(self): journal_info = result.get("journal:journal", {}) assert journal_info["title"] == "Journal" assert journal_info["issue"] == "5" + + +class TestOaLevelFromLicense: + """Tests for oa_level_from_license (540__ rule).""" + + def _cf(self, record): + return record.get("custom_fields", {}) + + # --- Gold --- + + def test_gold_cc_by_with_publication_scope(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"a": "CC BY", "3": "publication"}) + assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + + def test_gold_cc_hyphen_by_with_publication_scope(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, 
"540__", {"a": "CC-BY", "3": "publication"}) + assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + + def test_cc_by_without_publication_scope_is_not_gold(self): + """CC BY alone (no 540__3='publication') should not set gold.""" + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"a": "CC BY"}) + assert "cern:oa_level" not in self._cf(record) + + def test_cc_by_with_preprint_scope_is_not_gold(self): + """CC BY with 540__3='preprint' → green, not gold.""" + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"a": "CC BY", "3": "preprint"}) + assert self._cf(record).get("cern:oa_level") == {"id": "green"} + + def test_gold_takes_priority_over_bronze(self): + """Gold in second 540 tag overrides bronze already set.""" + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license( + record, + "540__", + [{"f": "Bronze"}, {"a": "CC BY", "3": "publication"}], + ) + assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + + # --- Bronze --- + + def test_bronze(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "Bronze"}) + assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} + + def test_bronze_case_insensitive(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "bronze"}) + assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} + + def test_bronze_does_not_override_gold(self): + record = {"custom_fields": {"cern:oa_level": {"id": "gold"}}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "Bronze"}) + assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + + # --- Green --- + + def test_green_from_preprint_scope(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"3": "Preprint"}) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_green_preprint_scope_case_insensitive(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"3": "preprint"}) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_green_does_not_override_bronze(self): + record = {"custom_fields": {"cern:oa_level": {"id": "bronze"}}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"3": "preprint"}) + assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} + + # --- No OA marker --- + + def test_no_oa_marker_sets_nothing(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"a": "Some other license"}) + assert "cern:oa_level" not in self._cf(record) + + # --- Funding model --- + + def test_funding_model_scoap3(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "SCOAP3"}) + assert self._cf(record)["cern:oa_funding_model"] == {"id": "scoap3"} + + def test_funding_model_collective(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "Collective"}) + assert self._cf(record)["cern:oa_funding_model"] == {"id": "collective"} + + def test_funding_model_cern_rp(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "CERN-RP"}) + assert 
self._cf(record)["cern:oa_funding_model"] == {"id": "cern-rp"} + + def test_funding_model_cern_apc(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "CERN-APC"}) + assert self._cf(record)["cern:oa_funding_model"] == {"id": "cern-apc"} + + def test_funding_model_other(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "Other"}) + assert self._cf(record)["cern:oa_funding_model"] == {"id": "other"} + + def test_bronze_does_not_set_funding_model(self): + """Bronze is an OA level, not a funding model.""" + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license(record, "540__", {"f": "Bronze"}) + assert "cern:oa_funding_model" not in self._cf(record) + + def test_funding_model_not_overwritten_by_second_tag(self): + """First funding model found wins.""" + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_license( + record, "540__", [{"f": "SCOAP3"}, {"f": "Collective"}] + ) + assert self._cf(record)["cern:oa_funding_model"] == {"id": "scoap3"} + + +class TestOaLevelFromAnnualReport: + """Tests for oa_level_from_annual_report (595__ rule).""" + + def _cf(self, record): + return record.get("custom_fields", {}) + + def test_for_annual_report_sets_closed(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) + assert self._cf(record)["cern:oa_level"] == {"id": "closed"} + + def test_for_annual_report_case_insensitive(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "for annual report"}) + assert self._cf(record)["cern:oa_level"] == {"id": "closed"} + + def test_does_not_override_existing_gold(self): + record = {"custom_fields": {"cern:oa_level": {"id": "gold"}}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) + assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + + def test_does_not_override_existing_bronze(self): + record = {"custom_fields": {"cern:oa_level": {"id": "bronze"}}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) + assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} + + def test_does_not_override_existing_green(self): + record = {"custom_fields": {"cern:oa_level": {"id": "green"}}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "For annual report"}) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_other_595_note_sets_nothing(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "Not for annual report"}) + assert "cern:oa_level" not in self._cf(record) + + def test_unrelated_note_sets_nothing(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_annual_report(record, "595__", {"a": "Some random note"}) + assert "cern:oa_level" not in self._cf(record) + + +class TestOaLevelFromUrl: + """Tests for oa_level_from_url (8564 rule).""" + + def _cf(self, record): + return record.get("custom_fields", {}) + + def test_preprint_url_sets_green(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, + "8564_", + {"y": "preprint", "u": "http://example.com/preprint.pdf"}, + ) + assert 
self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_manuscript_url_sets_green(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, "8564_", {"y": "manuscript", "u": "http://example.com/ms.pdf"} + ) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_preprint_url_case_insensitive(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, + "8564_", + {"y": "Preprint", "u": "http://example.com/preprint.pdf"}, + ) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_preprint_overrides_closed(self): + """A preprint link should upgrade a tentative 'closed' to green.""" + record = {"custom_fields": {"cern:oa_level": {"id": "closed"}}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, + "8564_", + {"y": "preprint", "u": "http://example.com/preprint.pdf"}, + ) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_manuscript_overrides_closed(self): + record = {"custom_fields": {"cern:oa_level": {"id": "closed"}}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, "8564_", {"y": "manuscript", "u": "http://example.com/ms.pdf"} + ) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_preprint_does_not_override_gold(self): + record = {"custom_fields": {"cern:oa_level": {"id": "gold"}}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, + "8564_", + {"y": "preprint", "u": "http://example.com/preprint.pdf"}, + ) + assert self._cf(record)["cern:oa_level"] == {"id": "gold"} + + def test_preprint_does_not_override_bronze(self): + record = {"custom_fields": {"cern:oa_level": {"id": "bronze"}}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, + "8564_", + {"y": "preprint", "u": "http://example.com/preprint.pdf"}, + ) + assert self._cf(record)["cern:oa_level"] == {"id": "bronze"} + + def test_preprint_does_not_override_green(self): + record = {"custom_fields": {"cern:oa_level": {"id": "green"}}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, + "8564_", + {"y": "preprint", "u": "http://example.com/preprint.pdf"}, + ) + assert self._cf(record)["cern:oa_level"] == {"id": "green"} + + def test_non_oa_url_label_sets_nothing(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_url( + record, "8564_", {"y": "fulltext", "u": "http://example.com/paper.pdf"} + ) + assert "cern:oa_level" not in self._cf(record) + + def test_no_y_subfield_sets_nothing(self): + record = {"custom_fields": {}} + with pytest.raises(IgnoreKey): + oa_level_from_url(record, "8564_", {"u": "http://example.com/paper.pdf"}) + assert "cern:oa_level" not in self._cf(record)