-
Notifications
You must be signed in to change notification settings - Fork 10
add(eco): new model and transformation rules #521
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| # -*- coding: utf-8 -*- | ||
| # | ||
| # Copyright (C) 2026 CERN. | ||
| # | ||
| # CDS-RDM is free software; you can redistribute it and/or modify it under | ||
| # the terms of the MIT License; see LICENSE file for more details. | ||
|
|
||
| """CDS-RDM ECO model.""" | ||
|
|
||
| from cds_migrator_kit.rdm.records.transform.models.base_record import ( | ||
| rdm_base_record_model, | ||
| ) | ||
| from cds_migrator_kit.transform.overdo import CdsOverdo | ||
|
|
||
|
|
||
class ECOModel(CdsOverdo):
    """Translation model for ECO records.

    Holds the search query for this collection (``__query__``), the MARC
    keys listed in ``__ignore_keys__`` and the ``_default_fields`` mapping.
    """

    # Posters, official/experiment brochures, CMS outreach brochures and IR
    # notes; press material, LHCb_Misc and PRIVATLAS records are excluded.
    __query__ = """
        (
            980__:POSTER
            OR (980__:BROCHURE AND 690C_:CERNOFFICIALPRESSBROCHURE)
            OR (
                (980__:BROCHURE AND 690C_:CERNEXPERIMENTBROCHURE)
                OR (
                    980__:CMSOUTREACH
                    AND (
                        6531_.a:Brochure
                        OR 6531_.a:brochure
                        OR 6531_a:Brochure
                        OR 6531_a:brochure
                    )
                )
            )
            OR (980__:NOTE AND 710__.5:IR)
        )
        AND -595__a:Press
        AND -980__:LHCb_Misc
        AND -690C_a:PRIVATLAS
        """

    __ignore_keys__ = {
        # --- identifiers / harvesting -----------------------------------
        "0247_9",  # source of pid, only value: OSTI, 2948638, 2853279
        "0248_a",
        "0248_p",
        "0248_q",
        "035__d",  # oai harvest tag
        "035__h",  # oai harvest tag
        "035__m",  # oai harvest tag
        # --- people / contact details -----------------------------------
        "100__m",  # email of contributor
        "245__9",  # source of title, only value: submitter
        "270__m",  # email of contact person - TODO: is it okay to ignore? example: 2908973
        "270__p",  # contact person name - TODO: is it okay to ignore?
        # --- physical description / provenance --------------------------
        "300__a",  # number of pages
        "340__a",  # Physical medium
        "520__9",  # abstract provenance
        "541__e",  # Original source poster https://cds.cern.ch/record/2695195/export/hm
        "594__a",  # PUB: 2749806, 2749822
        "6531_9",  # scheme of keywords
        "700__m",  # email of contributor
        # --- related links ----------------------------------------------
        "773__p",  # display name of the related link TODO: is it okay to ignore?
        "773__y",  # year, TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
        "773__v",  # TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
        "852__c",
        "852__h",
        # --- files ------------------------------------------------------
        "8560_f",  # contact email
        "8564_8",  # file id
        "8564_s",  # bibdoc id
        "8564_x",  # icon thumbnails sizes
        "8564_y",  # file description - handled by files dump
        "8564_z",  # DM metadata
        # --- CDS bookkeeping --------------------------------------------
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__a",  # CDS modification tag
        "961__b",  # CDS modification tag
        "961__c",  # CDS modification tag
        "961__h",  # CDS modification tag
        "961__l",  # CDS modification tag
        "961__x",  # CDS modification tag
        "981__a",  # duplicate record id
    }

    _default_fields = {
        "custom_fields": {},
        "languages": [],
        "related_identifiers": [],
        # CERN, as an organisation, is the creator listed in the defaults.
        "creators": [
            {"person_or_org": {"type": "organizational", "name": "CERN"}},
        ],
    }
|
|
||
|
|
||
# Model instance used by the ECO rules; it builds on the base RDM record
# model and is bound to the ECO rules entry point group.
eco_model = ECOModel(
    entry_point_group="cds_migrator_kit.migrator.rules.eco",
    bases=(rdm_base_record_model,),
)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,244 @@ | ||
| # -*- coding: utf-8 -*- | ||
| # | ||
| # Copyright (C) 2026 CERN. | ||
| # | ||
| # CDS-RDM is free software; you can redistribute it and/or modify it under | ||
| # the terms of the MIT License; see LICENSE file for more details. | ||
|
|
||
| """CDS-RDM ECO rules.""" | ||
|
|
||
| import re | ||
|
|
||
| from dojson.utils import IgnoreKey, for_each_value, force_list | ||
|
|
||
| from cds_migrator_kit.errors import UnexpectedValue | ||
| from cds_migrator_kit.rdm.records.transform.xml_processing.rules.it import ( | ||
| corporate_author, | ||
| ) | ||
| from cds_migrator_kit.transform.xml_processing.quality.decorators import ( | ||
| require, | ||
| ) | ||
| from cds_migrator_kit.transform.xml_processing.quality.parsers import ( | ||
| StringValue, | ||
| ) | ||
| from cds_migrator_kit.transform.xml_processing.rules.base import ( | ||
| languages as base_languages, | ||
| ) | ||
|
|
||
| from ...models.eco import eco_model as model | ||
| from .base import identifiers | ||
| from .base import note as base_note | ||
| from .base import report_number, urls | ||
| from .bulletin_issue import ( | ||
| additional_descriptions, | ||
| additional_titles_bulletin, | ||
| rel_identifiers, | ||
| translated_description, | ||
| urls_bulletin, | ||
| ) | ||
| from .it import corporate_author | ||
| from .publications import internal_notes, journal, organisation, related_identifiers | ||
|
|
||
# Rules borrowed from other rule modules and registered on the ECO model.
# 246 titles reuse the bulletin rule (overriding any rule already bound to it).
model.over("additional_titles", "(^246_[1_])", override=True)(
    additional_titles_bulletin
)
# 500 and 590 both feed ``additional_descriptions``.
model.over("additional_descriptions", "(^500__)")(additional_descriptions)
model.over("additional_descriptions", "(^590__)")(translated_description)
# At this point ``internal_notes`` is still the rule imported from
# ``.publications``; the ECO-specific function of the same name is defined
# further down in this module.
model.over("internal_notes", "^562__")(internal_notes)
model.over("contributors", "^901__")(organisation)
|
Comment on lines
+42
to
+48
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why aren't we using the ones from base here? Why do they need to be imported here? It shouldn't be necessary.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. They're not in the base model. Also, some records are missing 245 but have 246, so we can import the rule from bulletin to use 246 as the title if 245 is missing. Or, if you prefer, I can add the records with missing titles to the curation list. |
||
# Corporate authors (110) are registered as creators using the rule from ``.it``.
model.over("creators", "(^110__)")(corporate_author)
# 8564 URLs reuse the bulletin rule, overriding any rule already bound to them.
model.over("eco_urls", "^8564[1_]", override=True)(urls_bulletin)
|
|
||
|
|
||
@model.over("internal_notes", "^595__")
@for_each_value
def internal_notes(self, key, value):
    """Translate a 595 internal note and promote its ``s`` subfields to subjects.

    Every non-empty ``s`` subfield is appended to ``self["subjects"]``; the
    field is then handed to the base ``note`` rule.

    :param key: MARC key that matched the rule.
    :param value: subfields of one 595 field.
    :raises IgnoreKey: always, so nothing is stored under ``internal_notes``
        by this rule itself.
    """
    # No "" default here: ``force_list("")`` wraps the empty string into a
    # one-element (truthy) container, which used to add an empty subject for
    # every 595 field that has no ``s`` subfield.
    subject_notes = force_list(value.get("s")) or ()
    new_subjects = [{"subject": note} for note in subject_notes if note]
    if new_subjects:
        subjects = self.get("subjects", [])
        subjects.extend(new_subjects)
        self["subjects"] = subjects

    # NOTE(review): the return value of the base rule is discarded here;
    # confirm that it records the note on ``self`` itself.
    base_note(self, key, value)
    raise IgnoreKey("internal_notes")
|
|
||
|
|
||
@model.over("eco_report_number", "(^037__)|(^088__)", override=True)
@for_each_value
def eco_report_number(self, key, value):
    """Translate 037/088 report numbers into ``self["identifiers"]``.

    Review context: the base rule handles 037 and 088 together, so overriding
    088 alone does not work; both tags are re-registered here in order to
    special-case 088 fields whose ``a`` subfield holds an e-mail address.
    Everything else is delegated to the base ``report_number`` rule.

    :param key: MARC key that matched (``037__`` or ``088__``).
    :param value: subfields of one field.
    :raises IgnoreKey: always; results are written directly to ``self``.
    """
    identifier = value.get("a", "")
    # TODO: decide how e-mail addresses found in 088 should be migrated;
    # for now such fields are skipped.
    if key == "088__" and "@" in identifier:
        raise IgnoreKey("eco_report_number")

    # The base rule's result used to be merged with ``+=``, i.e. it is a
    # sequence of identifiers. Testing that whole sequence for membership
    # cannot match individual entries, so duplicates are now filtered entry
    # by entry, and falsy entries are dropped.
    current_ids = self.get("identifiers", [])
    added = False
    for new_id in force_list(report_number(self, key, value)) or ():
        if new_id and new_id not in current_ids:
            current_ids.append(new_id)
            added = True
    if added:
        self["identifiers"] = current_ids
    raise IgnoreKey("eco_report_number")
|
|
||
|
|
||
@model.over("eco_related_identifiers", "(^962__)", override=True)
@for_each_value
def eco_related_identifiers(self, key, value):
    """Translate a 962 field into related identifiers.

    Fields carrying an ``l`` subfield are handed to the bulletin
    ``rel_identifiers`` rule. All others go through the publications
    ``related_identifiers`` rule, whose first result is appended to
    ``self["related_identifiers"]`` unless it is already present.

    :raises IgnoreKey: always; results are written directly to ``self``.
    """
    if value.get("l", ""):
        # NOTE(review): the return value is discarded; presumably the
        # bulletin rule updates ``self`` on its own — confirm.
        rel_identifiers(self, key, value)
    else:
        translated = related_identifiers(self, key, value)
        if translated:
            first = translated[0]
            known = self.get("related_identifiers", [])
            if first not in known:
                known.append(first)
                self["related_identifiers"] = known
    raise IgnoreKey("eco_related_identifiers")
|
|
||
|
|
||
@model.over("eco_identifiers", "^035__", override=True)
@for_each_value
def eco_identifiers(self, key, value):
    """Translate a 035 identifier, setting photo identifiers aside.

    Fields whose ``9`` subfield is ``PHOPHO`` (case-insensitive) are not
    migrated yet; every other field is delegated to the base ``identifiers``
    rule.

    :raises IgnoreKey: always.
    """
    source_scheme = StringValue(value.get("9", "")).parse()
    if source_scheme.lower() != "phopho":
        identifiers(self, key, value)
        raise IgnoreKey("eco_identifiers")

    # TODO: handle photo identifier. Review decision: it should be linked in
    # related identifiers with the Photo resource type.
    # Example record: https://cds.cern.ch/record/43679
    photo_value = StringValue(value.get("a", "")).parse()
    photo_id = {"scheme": "photo", "identifier": photo_value}  # not stored yet
    raise IgnoreKey("eco_identifiers")
|
|
||
|
|
||
@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Map the 980 collection tag to a resource type.

    Subfield ``a`` is used when present, otherwise ``b``; the tag is matched
    after stripping whitespace and lower-casing.

    :returns: a ``{"id": ...}`` resource type mapping.
    :raises UnexpectedValue: when the tag is missing or not in the mapping.
    """
    known_types = {
        "poster": {"id": "poster"},
        "brochure": {"id": "publication-brochure"},
        "note": {"id": "publication-technicalnote"},
        "conferencepaper": {"id": "publication-conferencepaper"},
    }

    tag = value.get("a") if "a" in value else value.get("b")
    # Empty/missing tags are left untouched and rejected by the lookup below.
    normalized = tag.strip().lower() if tag else tag

    try:
        return known_types[normalized]
    except KeyError:
        raise UnexpectedValue(
            "Unknown resource type (ECO)", field=key, value=normalized
        )
|
|
||
|
|
||
@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Translate a 690C collection tag into a ``collection:`` subject.

    ``CERN`` is dropped silently; ``POSTER`` and ``PREPRINT`` are stored as
    subjects; any other tag is reported as unexpected. Matching ignores case
    and surrounding whitespace.

    :raises UnexpectedValue: for tags outside the accepted list.
    :raises IgnoreKey: otherwise; the subject is written directly to ``self``.
    """
    collection = value.get("a", "")
    tag = collection.strip().upper()
    if tag == "CERN":
        raise IgnoreKey("collection")

    # PREPRINT: only one record carries it; curators should check whether it
    # really is a preprint. If not, the tag should be removed both from the
    # record and from this list.
    # NOTE(review): the ECO model query selects 690C_ values such as
    # CERNOFFICIALPRESSBROCHURE / CERNEXPERIMENTBROCHURE; if those live in
    # subfield ``a`` they are rejected below — confirm this is intended.
    accepted = ("POSTER", "PREPRINT")
    if tag not in accepted:
        raise UnexpectedValue(subfield="a", field=key, value=value)

    subjects = self.get("subjects", [])
    subjects.append({"subject": f"collection:{collection}"})
    self["subjects"] = subjects
    raise IgnoreKey("collection")
|
|
||
|
|
||
@model.over("related_ids", "^773__")
@for_each_value
def related_ids(self, key, value):
    """Translate a 773 field into journal data or a related link.

    Without a ``u`` subfield the field is handed to the publications
    ``journal`` rule and its result is stored as ``self["custom_fields"]``.
    With a ``u`` subfield the field is transformed like the base ``urls``
    rule and the first result is appended to ``self["related_identifiers"]``
    unless already present.

    :raises IgnoreKey: always; results are written directly to ``self``.
    """
    if value.get("u", ""):
        # Transform like the base `urls` rule
        translated = urls(self, key, value)
        if translated:
            first = translated[0]
            known = self.get("related_identifiers", [])
            if first not in known:
                known.append(first)
                self["related_identifiers"] = known
    else:
        self["custom_fields"] = journal(self, key, value)
    raise IgnoreKey("related_ids")
|
|
||
|
|
||
@model.over("submitter_info", "^923__")
@for_each_value
def submitter_info(self, key, value):
    """Translate the 923 submitter into a ``contactperson`` contributor.

    Subfield ``r`` is read as ``"Family name, Given name"``. The text is split
    on the first comma only, so any further commas stay in the given name
    instead of causing the given name to be dropped. A blank ``r`` subfield
    adds no contributor at all, and an empty given name is omitted.

    :param key: MARC key that matched the rule.
    :param value: subfields of one 923 field.
    :raises IgnoreKey: always; the contributor is appended to
        ``self["contributors"]``.
    """
    raw_name = value.get("r", "").strip()
    if not raw_name:
        # Nothing to migrate: avoid a contributor with an empty name.
        raise IgnoreKey("submitter_info")

    family_name, _, given_name = raw_name.partition(",")
    names = {"family_name": family_name.strip()}
    given_name = given_name.strip()
    if given_name:
        names["given_name"] = given_name

    contributor = {
        "person_or_org": {
            "type": "personal",
            **names,
        },
        "role": {"id": "contactperson"},
    }
    contributors = self.get("contributors", [])
    contributors.append(contributor)
    self["contributors"] = contributors
    raise IgnoreKey("submitter_info")
|
|
||
|
|
||
@model.over("languages", "^041__", override=True)
@for_each_value
@require(["a"])
def language(self, key, value):
    """Translate 041 language codes, splitting combined values.

    A value such as ``eng-fre`` or ``eng/fre`` is split on ``-`` and ``/``
    and each non-empty part is passed separately to the base ``languages``
    rule; any other value is passed through unchanged. Results are
    accumulated in ``self["languages"]``.

    :raises IgnoreKey: always; results are written directly to ``self``.
    """
    codes = value.get("a")
    collected = self.get("languages", [])

    if any(separator in codes for separator in ("-", "/")):
        # https://cds.cern.ch/record/921930/export/xm
        for code in filter(None, re.split(r"[-/]+", codes)):
            collected.extend(base_languages(self, key, {"a": code}))
    else:
        collected.extend(base_languages(self, key, value))

    self["languages"] = collected
    raise IgnoreKey("language")
|
|
||
|
|
||
@model.over("field_993", "^993__", override=True)
@for_each_value
def field_993(self, key, value):
    """Translate subfield ``q`` of a 993 field into a subject keyword.

    Only the keyword ``Project Management`` is accepted. A field without a
    ``q`` value is skipped, so no empty subject is created.

    :param key: MARC key that matched the rule.
    :param value: subfields of one 993 field.
    :raises UnexpectedValue: for any other non-empty ``q`` value.
    :raises IgnoreKey: otherwise; the keyword is appended to
        ``self["subjects"]``.
    """
    keyword = value.get("q", "")
    if not keyword:
        raise IgnoreKey("field_993")
    if keyword not in ("Project Management",):
        # Report the subfield that was actually read (``q``, not ``a``).
        raise UnexpectedValue(field=key, subfield="q", value=keyword)

    subjects = self.get("subjects", [])
    subjects.append({"subject": keyword})
    self["subjects"] = subjects
    raise IgnoreKey("field_993")
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
what does d mean?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
311 records have `d` in the source field. I checked but couldn't find the meaning of `d`. Maybe digitized? I'll add a question to the curation sheet.
Some example recids: 43247, 43430, 824753, 1221556