Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 95 additions & 0 deletions cds_migrator_kit/rdm/records/transform/models/eco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM ECO model."""

from cds_migrator_kit.rdm.records.transform.models.base_record import (
rdm_base_record_model,
)
from cds_migrator_kit.transform.overdo import CdsOverdo


class ECOModel(CdsOverdo):
    """Translation model for ECO records.

    Declares the selection query for the ECO record set (``__query__``), the
    MARC keys listed in ``__ignore_keys__`` and the default field values in
    ``_default_fields``.
    """

    # Selection query. Matches posters, official press brochures, experiment
    # brochures (tagged CERNEXPERIMENTBROCHURE, or CMSOUTREACH records carrying
    # a "Brochure"/"brochure" keyword in 6531_) and 980__:NOTE records with
    # 710__.5:IR. Records matching 595__a:Press, 980__:LHCb_Misc or
    # 690C_a:PRIVATLAS are excluded.
    __query__ = """
(
980__:POSTER
OR (980__:BROCHURE AND 690C_:CERNOFFICIALPRESSBROCHURE)
OR (
(980__:BROCHURE AND 690C_:CERNEXPERIMENTBROCHURE)
OR (
980__:CMSOUTREACH
AND (
6531_.a:Brochure
OR 6531_.a:brochure
OR 6531_a:Brochure
OR 6531_a:brochure
)
)
)
OR (980__:NOTE AND 710__.5:IR)
)
AND -595__a:Press
AND -980__:LHCb_Misc
AND -690C_a:PRIVATLAS
"""

    # MARC field+subfield keys that need no translation rule.
    # NOTE(review): presumably these are skipped by the migration instead of
    # being reported as unhandled -- confirm against CdsOverdo.
    # Entries marked TODO still await a curation decision.
    __ignore_keys__ = {
        "0247_9",  # source of pid, only value: OSTI, 2948638, 2853279
        "0248_a",
        "0248_p",
        "0248_q",
        "035__d",  # oai harvest tag
        "035__h",  # oai harvest tag
        "035__m",  # oai harvest tag
        "100__m",  # email of contributor
        "245__9",  # source of title, only value: submitter
        "270__m",  # email of contact person - TODO: is it okay to ignore? example: 2908973
        "270__p",  # contact person name - TODO: is it okay to ignore?
        "300__a",  # number of pages
        "340__a",  # Physical medium
        "520__9",  # abstract provenance
        "541__e",  # Original source poster https://cds.cern.ch/record/2695195/export/hm
        "594__a",  # PUB: 2749806, 2749822
        "6531_9",  # scheme of keywords
        "700__m",  # email of contributor
        "773__p",  # display name of the related link TODO: is it okay to ignore?
        "773__y",  # year, TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
        "773__v",  # TODO: is it okay to ignore? https://cds.cern.ch/record/1452204/export/xm
        "852__c",
        "852__h",
        "8560_f",  # contact email
        "8564_8",  # file id
        "8564_s",  # bibdoc id
        "8564_x",  # icon thumbnails sizes
        "8564_y",  # file description - handled by files dump
        "8564_z",  # DM metadata
        "937__c",  # last modified by
        "937__s",  # last modification date
        "960__a",  # base number
        "961__a",  # CDS modification tag
        "961__b",  # CDS modification tag
        "961__c",  # CDS modification tag
        "961__h",  # CDS modification tag
        "961__l",  # CDS modification tag
        "961__x",  # CDS modification tag
        "981__a",  # duplicate record id
    }

    # Default values for the listed record fields; the default creator is the
    # organisation "CERN".
    # NOTE(review): presumably applied when no rule fills the field -- confirm.
    _default_fields = {
        "custom_fields": {},
        "languages": [],
        "related_identifiers": [],
        "creators": [{"person_or_org": {"type": "organizational", "name": "CERN"}}],
    }


# Model instance built on top of the RDM base record model; rules are looked up
# in the "cds_migrator_kit.migrator.rules.eco" entry point group.
eco_model = ECOModel(
    bases=(rdm_base_record_model,),
    entry_point_group="cds_migrator_kit.migrator.rules.eco",
)
2 changes: 2 additions & 0 deletions cds_migrator_kit/rdm/records/transform/models/it.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class ITModel(CdsOverdo):
-980__:BOOK
-690C_:YELLOWREPORT
-690C_:"YELLOW REPORT"
-690C_:CERNOFFICIALPRESSBROCHURE
-690C_:CERNEXPERIMENTBROCHURE
-980__:THESIS
-980__:INTNOTECMSPUBL
"""
Expand Down
2 changes: 1 addition & 1 deletion cds_migrator_kit/rdm/records/transform/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ def field_experiments(record_json, custom_fields_dict):
"cern:experiments", []
)
for experiment in experiments:
if experiment.lower().strip() == "not applicable":
if experiment.lower().strip() in ["not applicable", "select:"]:
continue
result = search_vocabulary(experiment, "experiments")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ def created(self, key, value):
source = clean_val("s", value, str)
# h = human catalogued
# n = script catalogued or via submission
if source not in ["n", "h", "m", "r"]:
raise UnexpectedValue(subfield="s", field=key, value=value)
if source not in ["n", "h", "m", "r", "d"]:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what does d mean?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

311 records have `d` in the source field. I checked but couldn't find the meaning of `d`. Maybe "digitized"? I'll add a question to the curation sheet.
Some example recids: 43247, 43430, 824753, 1221556

raise UnexpectedValue(subfield="s", field=key, value=source)
date_values = value.get("w")
if not date_values or not date_values[0]:
return datetime.date.today().isoformat()
Expand Down Expand Up @@ -801,6 +801,10 @@ def related_identifiers_787(self, key, value):
"relation_type": {"id": "references"},
"resource_type": {"id": "publication-conferencepaper"},
},
"paper": {
"relation_type": {"id": "references"},
"resource_type": {"id": "publication-article"},
},
}

if recid:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ def subjects_bulletin(self, key, value):
@for_each_value
def urls_bulletin(self, key, value):
content_type = value.get("x", "")
if content_type == "icon":
if "icon" in content_type:
# ignore icon urls (conditionally ignoring by accessing the value
url_q = value.get("q", "")
url_u = value.get("u", "")
Expand Down
244 changes: 244 additions & 0 deletions cds_migrator_kit/rdm/records/transform/xml_processing/rules/eco.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,244 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the MIT License; see LICENSE file for more details.

"""CDS-RDM ECO rules."""

import re

from dojson.utils import IgnoreKey, for_each_value, force_list

from cds_migrator_kit.errors import UnexpectedValue
from cds_migrator_kit.rdm.records.transform.xml_processing.rules.it import (
corporate_author,
)
from cds_migrator_kit.transform.xml_processing.quality.decorators import (
require,
)
from cds_migrator_kit.transform.xml_processing.quality.parsers import (
StringValue,
)
from cds_migrator_kit.transform.xml_processing.rules.base import (
languages as base_languages,
)

from ...models.eco import eco_model as model
from .base import identifiers
from .base import note as base_note
from .base import report_number, urls
from .bulletin_issue import (
additional_descriptions,
additional_titles_bulletin,
rel_identifiers,
translated_description,
urls_bulletin,
)
from .it import corporate_author
from .publications import internal_notes, journal, organisation, related_identifiers

# Rules reused as-is from sibling rule modules. According to the author they
# are not part of the base model, so they are registered here explicitly; the
# 246 title rule is borrowed from the bulletin rules because some records lack
# a 245 title but do carry 246.
# Each entry is (target field, MARC pattern, rule function, extra options).
# ``internal_notes`` below is the rule imported from ``.publications``; this
# table is built before any later definition in this module rebinds the name.
_REUSED_RULES = (
    (
        "additional_titles",
        "(^246_[1_])",
        additional_titles_bulletin,
        {"override": True},
    ),
    ("additional_descriptions", "(^500__)", additional_descriptions, {}),
    ("additional_descriptions", "(^590__)", translated_description, {}),
    ("internal_notes", "^562__", internal_notes, {}),
    ("contributors", "^901__", organisation, {}),
    ("creators", "(^110__)", corporate_author, {}),
    ("eco_urls", "^8564[1_]", urls_bulletin, {"override": True}),
)

for _target, _marc_pattern, _rule, _options in _REUSED_RULES:
    model.over(_target, _marc_pattern, **_options)(_rule)


@model.over("internal_notes", "^595__")
@for_each_value
def internal_notes(self, key, value):
    """Translate 595 internal notes and promote ``$s`` notes to subjects.

    Every non-empty ``s`` subfield is appended to ``self["subjects"]`` as
    ``{"subject": <note>}``. The field is then passed to the base ``note``
    rule, whose return value is not used here.

    :param key: MARC key that matched this rule.
    :param value: subfield mapping of the matched field.
    :raises IgnoreKey: always, so nothing is stored under this rule's own key.
    """
    # Read ``$s`` without a "" default: ``force_list("")`` is ``[""]``, which
    # used to add a blank subject for every 595 field lacking ``$s``.
    subject_notes = [note for note in force_list(value.get("s")) or [] if note]
    if subject_notes:
        # add them as subjects
        subjects = self.get("subjects", [])
        subjects.extend({"subject": note} for note in subject_notes)
        self["subjects"] = subjects
    # NOTE(review): presumably the base rule records the note on ``self``
    # itself, since its result is discarded -- confirm against ``base.note``.
    base_note(self, key, value)
    raise IgnoreKey("internal_notes")


@model.over("eco_report_number", "(^037__)|(^088__)", override=True)
@for_each_value
def eco_report_number(self, key, value):
    """Translate report numbers (037/088) into ``self["identifiers"]``.

    The base rule handles 037 and 088 together, so both tags are re-registered
    here in order to special-case 088 values that contain an e-mail address.

    :param key: MARC key that matched this rule (``037__`` or ``088__``).
    :param value: subfield mapping of the matched field.
    :raises IgnoreKey: always; results are written to ``self["identifiers"]``.
    """
    identifier = value.get("a", "")
    # TODO: decide how e-mail addresses found in 088 should be handled; they
    # are skipped for now.
    if key == "088__" and "@" in identifier:
        raise IgnoreKey("eco_report_number")

    # NOTE(review): assumes the base ``report_number`` rule returns a list of
    # identifier entries (the original code extended the list with it).
    translated = report_number(self, key, value)
    known_identifiers = self.get("identifiers", [])
    # De-duplicate entry by entry. Testing the whole returned list for
    # membership never matched, so duplicates were always appended.
    for entry in translated or []:
        if entry not in known_identifiers:
            known_identifiers.append(entry)
    self["identifiers"] = known_identifiers
    raise IgnoreKey("eco_report_number")


@model.over("eco_related_identifiers", "(^962__)", override=True)
@for_each_value
def eco_related_identifiers(self, key, value):
    """Translate 962 related identifiers.

    Fields carrying an ``l`` subfield are delegated to ``rel_identifiers``.
    All others go through ``related_identifiers``, whose first result is added
    to ``self["related_identifiers"]`` unless it is already present.

    :raises IgnoreKey: always; results are written to ``self`` directly.
    """
    if value.get("l", ""):
        rel_identifiers(self, key, value)
    else:
        translated = related_identifiers(self, key, value)
        if translated:
            first_entry = translated[0]
            known_entries = self.get("related_identifiers", [])
            if first_entry not in known_entries:
                known_entries.append(first_entry)
            self["related_identifiers"] = known_entries
    raise IgnoreKey("eco_related_identifiers")


@model.over("eco_identifiers", "^035__", override=True)
@for_each_value
def eco_identifiers(self, key, value):
    """Translate 035 identifiers, setting photo identifiers aside.

    Fields whose ``9`` subfield is not ``phopho`` (case-insensitive) are passed
    to the base ``identifiers`` rule. Photo identifiers are parsed but not
    stored yet.

    :raises IgnoreKey: always.
    """
    source_tag = StringValue(value.get("9", "")).parse()
    if source_tag.lower() != "phopho":
        identifiers(self, key, value)
        raise IgnoreKey("eco_identifiers")

    # TODO: handle photo identifier. Review guidance: it should be linked in
    # related identifiers with the Photo resource type
    # (example record: https://cds.cern.ch/record/43679).
    # Until that is implemented the entry is built but deliberately not stored.
    photo_id = {  # noqa: F841
        "scheme": "photo",
        "identifier": StringValue(value.get("a", "")).parse(),
    }
    raise IgnoreKey("eco_identifiers")


@model.over("resource_type", "^980__", override=True)
def resource_type(self, key, value):
    """Map the 980 tag (``$a``, else ``$b``) to a resource type entry.

    The tag is stripped and lower-cased before the lookup.

    :returns: ``{"id": <resource type id>}`` for a known tag.
    :raises UnexpectedValue: when the tag is missing or not in the mapping.
    """
    known_types = {
        "poster": {"id": "poster"},
        "brochure": {"id": "publication-brochure"},
        "note": {"id": "publication-technicalnote"},
        "conferencepaper": {"id": "publication-conferencepaper"},
    }

    tag = value.get("a") if "a" in value else value.get("b")
    normalized = tag.strip().lower() if tag else tag

    try:
        return known_types[normalized]
    except KeyError:
        raise UnexpectedValue(
            "Unknown resource type (ECO)", field=key, value=normalized
        )


@model.over("collection", "^690C_", override=True)
@for_each_value
def collection(self, key, value):
    """Translate the 690C collection tag into a ``collection:<tag>`` subject.

    ``CERN`` is ignored; ``POSTER`` and ``PREPRINT`` (case-insensitive) are
    stored as subjects; anything else is reported.

    :param key: MARC key that matched this rule.
    :param value: subfield mapping of the matched field.
    :raises UnexpectedValue: for any other collection tag.
    :raises IgnoreKey: otherwise, after updating ``self["subjects"]``.
    """
    # Normalise once; the stripped form is also what gets stored, so stray
    # whitespace never ends up in the subject.
    collection_name = value.get("a", "").strip()
    normalized = collection_name.upper()
    if normalized == "CERN":
        raise IgnoreKey("collection")

    # TODO: PREPRINT is known from a single record
    # (https://cds.cern.ch/record/2675049/export/xm). Curators should confirm
    # it really is a preprint; if not, drop the tag from the record and here.
    # NOTE(review): the ECO selection query also matches on
    # 690C_:CERNOFFICIALPRESSBROCHURE / CERNEXPERIMENTBROCHURE, which are not
    # accepted here -- confirm that reporting them is intended.
    if normalized not in ("POSTER", "PREPRINT"):
        # Report the offending subfield value, not the whole field mapping.
        raise UnexpectedValue(subfield="a", field=key, value=collection_name)

    subjects = self.get("subjects", [])
    subjects.append({"subject": f"collection:{collection_name}"})
    self["subjects"] = subjects
    raise IgnoreKey("collection")


@model.over("related_ids", "^773__")
@for_each_value
def related_ids(self, key, value):
    """Translate 773 fields into related links or journal custom fields.

    Without a ``u`` subfield the field goes through the ``journal`` rule and
    its result replaces ``self["custom_fields"]``. With a ``u`` subfield the
    ``urls`` rule is used, and its first result is added to
    ``self["related_identifiers"]`` unless it is already there.

    :raises IgnoreKey: always; results are written to ``self`` directly.
    """
    if value.get("u", ""):
        # Transform like the base `urls` rule
        translated = urls(self, key, value)
        if translated:
            first_link = translated[0]
            known_links = self.get("related_identifiers", [])
            if first_link not in known_links:
                known_links.append(first_link)
            self["related_identifiers"] = known_links
    else:
        self["custom_fields"] = journal(self, key, value)

    raise IgnoreKey("related_ids")


@model.over("submitter_info", "^923__")
@for_each_value
def submitter_info(self, key, value):
    """Translate the 923 ``$r`` submitter into a contact-person contributor.

    A value of the form ``"Family, Given"`` is split into family and given
    name; any other shape is stored as the family name only (the part before
    the first comma). Blank values add no contributor.

    :param key: MARC key that matched this rule.
    :param value: subfield mapping of the matched field.
    :raises IgnoreKey: always; the contributor is appended to
        ``self["contributors"]``.
    """
    raw_name = value.get("r", "").strip()
    parts = [part.strip() for part in raw_name.split(",")]

    family_name = parts[0]
    if not family_name:
        # A missing/blank ``$r`` used to create a contributor with an empty
        # family name; there is nothing meaningful to record in that case.
        raise IgnoreKey("submitter_info")

    names = {"family_name": family_name}
    # Only exactly two parts are treated as "Family, Given"; an empty given
    # name (e.g. trailing comma) is left out instead of being stored as "".
    if len(parts) == 2 and parts[1]:
        names["given_name"] = parts[1]

    contributor = {
        "person_or_org": {
            "type": "personal",
            **names,
        },
        "role": {"id": "contactperson"},
    }
    contributors = self.get("contributors", [])
    contributors.append(contributor)
    self["contributors"] = contributors
    raise IgnoreKey("submitter_info")


@model.over("languages", "^041__", override=True)
@for_each_value
@require(["a"])
def language(self, key, value):
    """Translate 041 language codes into ``self["languages"]``.

    A ``$a`` value containing ``-`` or ``/`` is split on runs of those
    characters and every non-empty piece is translated separately by the base
    languages rule; any other value is handed to the base rule unchanged.

    :raises IgnoreKey: always; results are written to ``self["languages"]``.
    """
    raw_codes = value.get("a")
    collected = self.get("languages", [])
    if "-" not in raw_codes and "/" not in raw_codes:
        collected.extend(base_languages(self, key, value))
    else:
        # Combined codes, e.g. https://cds.cern.ch/record/921930/export/xm
        for code in filter(None, re.split(r"[-/]+", raw_codes)):
            collected.extend(base_languages(self, key, {"a": code}))
    self["languages"] = collected
    raise IgnoreKey("language")


@model.over("field_993", "^993__", override=True)
@for_each_value
def field_993(self, key, value):
    """Translate the 993 ``$q`` value into a subject (keyword).

    Only ``"Project Management"`` is accepted; a missing or empty ``$q`` adds
    nothing.

    :param key: MARC key that matched this rule.
    :param value: subfield mapping of the matched field.
    :raises UnexpectedValue: for any other non-empty ``$q`` value.
    :raises IgnoreKey: otherwise; the subject is appended to
        ``self["subjects"]``.
    """
    keyword = value.get("q", "")
    if not keyword:
        # Previously an empty ``$q`` was stored as a blank subject.
        raise IgnoreKey("field_993")
    if keyword not in ("Project Management",):
        # The value comes from subfield ``q``, so report that subfield.
        raise UnexpectedValue(field=key, subfield="q", value=keyword)

    subjects = self.get("subjects", [])
    subjects.append({"subject": keyword})
    self["subjects"] = subjects
    raise IgnoreKey("field_993")
Loading
Loading