Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 14 additions & 26 deletions site/cds_rdm/inspire_harvester/transform/mappers/custom_fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from idutils.normalizers import normalize_isbn

from cds_rdm.inspire_harvester.transform.mappers.mapper import MapperBase
from cds_rdm.inspire_harvester.utils import search_vocabulary
from cds_rdm.inspire_harvester.utils import get_vocabulary_exact


@dataclass(frozen=True)
Expand Down Expand Up @@ -65,41 +65,29 @@ def map_value(self, src_record, ctx, logger):
acc_exp_list = src_metadata.get("accelerator_experiments", [])
_accelerators = []
_experiments = []

for item in acc_exp_list:
accelerator = item.get("accelerator")
experiment = item.get("experiment")
institution = item.get("institution")

if accelerator:
logger.debug(
f"Searching vocabulary 'accelerator' for term: '{accelerator}'"
)
if institution:
accelerator = f"{institution} {accelerator}"
else:
accelerator = f"{accelerator}"
result = search_vocabulary(accelerator, "accelerators", ctx, logger)
if result.total == 1:
logger.info(f"Found accelerator '{accelerator}'")
hit = list(result.hits)[0]
_accelerators.append({"id": hit["id"]})
accelerator_term = f"{institution} {accelerator}"
else:
logger.warning(
f"Accelerator '{accelerator}' not found."
)
accelerator_term = accelerator

vocab_id = get_vocabulary_exact(
accelerator_term, "accelerators", ctx, logger
)
if vocab_id:
_accelerators.append({"id": vocab_id})

if experiment:
logger.debug(
f"Searching vocabulary 'experiments' for term: '{experiment}'"
vocab_id = get_vocabulary_exact(
experiment, "experiments", ctx, logger
)
result = search_vocabulary(experiment, "experiments", ctx, logger)
if result.total == 1:
logger.info(f"Found experiment '{experiment}'")
hit = list(result.hits)[0]
_experiments.append({"id": hit["id"]})
else:
logger.warning(
f"Experiment '{accelerator}' not found."
)
if vocab_id:
_experiments.append({"id": vocab_id})

return {"cern:accelerators": _accelerators, "cern:experiments": _experiments}
40 changes: 40 additions & 0 deletions site/cds_rdm/inspire_harvester/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,43 @@ def search_vocabulary(term, vocab_type, ctx, logger):
f"Vocabulary term ['{term}'] not found in '{vocab_type}'. INSPIRE#: {ctx.inspire_id}"
)
raise e


def _search_vocabulary_id(service, term, vocab_type):
"""Search vocabulary by exact ID match, returning the ID or None."""
search_term = f'"{term}"' if "/" in term else term
result = service.search(system_identity, type=vocab_type, q=f'id:"{search_term}"')
if result.total == 1:
return list(result.hits)[0]["id"]
return None


def get_vocabulary_exact(term, vocab_type, ctx, logger):
"""Get vocabulary ID by exact match, with fallback to normalized term."""
if not term:
return None

service = current_service_registry.get("vocabularies")

try:
vocab_id = _search_vocabulary_id(service, term, vocab_type)
if vocab_id:
return vocab_id

# Fallback: normalize (uppercase + strip hyphens) and search again
normalized = term.upper().replace("-", "")
if normalized != term:
vocab_id = _search_vocabulary_id(service, normalized, vocab_type)
if vocab_id:
return vocab_id

logger.warning(
f"Vocabulary term '{term}' not found in '{vocab_type}'."
)
return None

except Exception as e:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

minor nit: do we really need to catch separately? 😅

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

RIght, collapesed into a single one

logger.error(
f"Failed vocabulary search for '{term}' in '{vocab_type}'. Error: {e}."
)
return None
11 changes: 10 additions & 1 deletion site/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -644,7 +644,7 @@ def experiments_type_v(app, experiments_type):
},
)

vocab = vocabulary_service.create(
vocabulary_service.create(
system_identity,
{
"id": "ATLAS",
Expand All @@ -654,6 +654,15 @@ def experiments_type_v(app, experiments_type):
},
)

vocab = vocabulary_service.create(
system_identity,
{
"id": "NA62",
"title": {"en": "NA62"},
"type": "experiments",
},
)

Vocabulary.index.refresh()

return vocab
Expand Down
8 changes: 8 additions & 0 deletions site/tests/inspire_harvester/test_harvester_job.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,14 @@
"description": "In the present study the possibility of measuring the lifetime of the positively charged Kaon , K+, is investigated , by using data and framework produced by the experiment NA62 of the European Organization for Nuclear Research (CERN).",
},
"custom_fields": {
'cern:experiments': [
{
'id': 'NA62',
'title': {
'en': 'NA62',
},
},
],
'cern:programmes': {
'id': 'None',
'title': {
Expand Down
200 changes: 200 additions & 0 deletions site/tests/inspire_harvester/test_vocabulary_matching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
# -*- coding: utf-8 -*-
#
# Copyright (C) 2026 CERN.
#
# CDS-RDM is free software; you can redistribute it and/or modify it under
# the terms of the GPL-2.0 License; see LICENSE file for more details.

"""Tests for vocabulary exact matching functionality."""
from cds_rdm.inspire_harvester.logger import Logger
from cds_rdm.inspire_harvester.transform.context import MetadataSerializationContext
from cds_rdm.inspire_harvester.transform.mappers.custom_fields import CERNFieldsMapper
from cds_rdm.inspire_harvester.transform.resource_types import ResourceType
from cds_rdm.inspire_harvester.utils import get_vocabulary_exact


def test_get_vocabulary_exact_found(running_app):
"""Test get_vocabulary_exact with a term that exists in vocabulary."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("CERN LHC", "accelerators", ctx, logger)

assert result == "CERN LHC"
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_not_found(running_app):
"""Test get_vocabulary_exact with a term not in vocabulary."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("UNKNOWN", "accelerators", ctx, logger)

assert result is None
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_normalizes_case(running_app):
"""Test get_vocabulary_exact normalizes case before lookup."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("alice", "experiments", ctx, logger)

assert result == "ALICE"
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_normalizes_hyphens(running_app):
"""Test get_vocabulary_exact strips hyphens before lookup."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

# "NA-62" normalizes to "NA62" which exists in vocabulary
result = get_vocabulary_exact("NA-62", "experiments", ctx, logger)

assert result == "NA62"
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_empty_term(running_app):
"""Test get_vocabulary_exact with empty term."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact("", "accelerators", ctx, logger)

assert result is None
assert len(ctx.errors) == 0


def test_get_vocabulary_exact_none_term(running_app):
"""Test get_vocabulary_exact with None term."""
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")

result = get_vocabulary_exact(None, "accelerators", ctx, logger)

assert result is None
assert len(ctx.errors) == 0


def test_cern_fields_mapper_accelerator_found(running_app):
"""Test CERNFieldsMapper with accelerator that exists in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"accelerator": "LHC", "institution": "CERN"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:accelerators"]) == 1
assert result["cern:accelerators"][0]["id"] == "CERN LHC"
assert len(ctx.errors) == 0


def test_cern_fields_mapper_accelerator_not_found(running_app):
"""Test CERNFieldsMapper with accelerator not in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"accelerator": "UNKNOWN"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:accelerators"]) == 0
assert len(ctx.errors) == 0


def test_cern_fields_mapper_experiment_found(running_app):
"""Test CERNFieldsMapper with experiment that exists in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"experiment": "ALICE"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:experiments"]) == 1
assert result["cern:experiments"][0]["id"] == "ALICE"
assert len(ctx.errors) == 0


def test_cern_fields_mapper_experiment_not_found(running_app):
"""Test CERNFieldsMapper with experiment not in vocabulary."""
src_metadata = {
"accelerator_experiments": [
{"experiment": "UNKNOWN_EXP"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:experiments"]) == 0
assert len(ctx.errors) == 0


def test_cern_fields_mapper_mixed_results(running_app):
"""Test CERNFieldsMapper with some found and some not found."""
src_metadata = {
"accelerator_experiments": [
{"accelerator": "LHC", "institution": "CERN", "experiment": "ALICE"},
{"accelerator": "UNKNOWN", "experiment": "UNKNOWN_EXP"},
]
}
ctx = MetadataSerializationContext(
resource_type=ResourceType.OTHER, inspire_id="12345"
)
logger = Logger(inspire_id="12345")
mapper = CERNFieldsMapper()
src_record = {"metadata": src_metadata, "created": "2023-01-01"}

result = mapper.map_value(src_record, ctx, logger)

assert len(result["cern:accelerators"]) == 1
assert result["cern:accelerators"][0]["id"] == "CERN LHC"

assert len(result["cern:experiments"]) == 1
assert result["cern:experiments"][0]["id"] == "ALICE"

assert len(ctx.errors) == 0
Loading