From 5b40388d3c8d5065ad7cadcebdcc9792dc439f66 Mon Sep 17 00:00:00 2001 From: Bhawana Karakheti Date: Tue, 19 Aug 2025 15:21:12 -0500 Subject: [PATCH 1/2] Migration to new branch --- pyQuARC/code/custom_validator.py | 34 +++++++++++++++- pyQuARC/code/schema_validator.py | 18 +++------ pyQuARC/schemas/check_messages.json | 12 +++++- pyQuARC/schemas/checks.json | 7 +++- pyQuARC/schemas/rule_mapping.json | 60 ++++------------------------- tests/test_downloader.py | 6 +-- 6 files changed, 66 insertions(+), 71 deletions(-) diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py index bf3620d1..46b8aa83 100644 --- a/pyQuARC/code/custom_validator.py +++ b/pyQuARC/code/custom_validator.py @@ -56,11 +56,43 @@ def mime_type_check(mime_type, url_type, controlled_list): @staticmethod def availability_check(field_value, parent_value): # If the parent is available, the child should be available too, else it is invalid + return { "valid": bool(field_value) if parent_value else True, "value": parent_value, } + @staticmethod + def url_description_presence_check(field_value): + """ + Ensures that URL descriptions are present and not empty if a URL is provided. + """ + if isinstance(field_value, list): + # List of dictionaries (URL objects) + errors = [] + for url_obj in field_value: + description = url_obj.get("Description", "") + if not description or not str(description).strip(): + errors.append({ + "valid": False, + "value": url_obj, + }) + + if errors: + return errors[0] + else: + return {"valid": True, "value": field_value} + + + if not field_value or not str(field_value).strip(): + return { + "valid": False, + "value": field_value, + } + + return {"valid": True, "value": field_value} + + @staticmethod @if_arg def bounding_coordinate_logic_check(west, north, east, south): @@ -276,4 +308,4 @@ def count_check(count, values, key): if not isinstance(items, list): items = [items] num_items = len(items) - return {"valid": int(count) == num_items, "value": (count, num_items)} + return {"valid": int(count) == num_items, "value": (count, num_items)} \ No newline at end of file diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index 11b3f087..b32d87f5 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -3,7 +3,7 @@ import re from io import BytesIO -from jsonschema import Draft7Validator, draft7_format_checker, RefResolver +from jsonschema import Draft7Validator, RefResolver from lxml import etree from urllib.request import pathname2url @@ -75,25 +75,19 @@ def run_json_validator(self, content_to_validate): """ schema = self.read_json_schema() schema_store = {} - if self.metadata_format == UMM_C: with open(SCHEMA_PATHS["umm-cmn-json-schema"]) as schema_file: schema_base = json.load(schema_file) - # workaround to read local referenced schema file (only supports uri) schema_store = { schema_base.get("$id", "/umm-cmn-json-schema.json"): schema_base, schema_base.get("$id", "umm-cmn-json-schema.json"): schema_base, } - errors = {} - resolver = RefResolver.from_schema(schema, store=schema_store) - validator = Draft7Validator( - schema, format_checker=draft7_format_checker, resolver=resolver + schema, format_checker=Draft7Validator.FORMAT_CHECKER, resolver=resolver ) - for error in sorted( validator.iter_errors(json.loads(content_to_validate)), key=str ): @@ -136,13 +130,13 @@ def _build_errors(error_log, paths): # For DIF, because the namespace is specified in the metadata file, lxml library # provides field name concatenated with the namespace, # the following 3 lines of code removes the namespace - namespaces = re.findall("(\{http[^}]*\})", line) + namespaces = re.findall(r"(\{http[^}]*\})", line) for namespace in namespaces: line = line.replace(namespace, "") - field_name = re.search("Element\s'(.*)':", line)[1] + field_name = re.search(r"Element\s'(.*)':", line)[1] field_paths = [abs_path for abs_path in paths if field_name in abs_path] field_name = field_paths[0] if len(field_paths) == 1 else field_name - message = re.search("Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() + message = re.search(r"Element\s'.+':\s(\[.*\])?(.*)", line)[2].strip() errors.setdefault(field_name, {})["schema"] = { "message": [f"Error: {message}"], "valid": False, @@ -191,4 +185,4 @@ def run(self, metadata): Returns: (dict): Result of the validation from xml and json schema validators """ - return self.validator_func(metadata) + return self.validator_func(metadata) \ No newline at end of file diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 0b8b38c8..989f6a85 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -973,8 +973,18 @@ "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" }, - "remediation": "Descriptions should be unique to each URL. At least one of the descriptions are repeated in this record. Recommend changing the descriptions to more accurately and uniquely describe each link." + "remediation": "Descriptions should be unique to each URL. Several of the descriptions are repeated in this record. Recommend changing the descriptions to more accurately and uniquely describe each link" + }, + + "url_description_presence_check": { + "failure": "A URL description is missing.", + "help": { + "message": "", + "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" + }, + "remediation": "Recommend providing a description for each URL" }, + "online_resource_description_uniqueness_check": { "failure": "A URL description is duplicated: `{}`.", "help": { diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 778f4da3..94cb39b7 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -64,6 +64,11 @@ "check_function": "availability_check", "available": true }, + "url_description_presence_check": { + "data_type": "custom", + "check_function": "url_description_presence_check", + "available": true + }, "mime_type_check": { "data_type": "custom", "check_function": "mime_type_check", @@ -299,4 +304,4 @@ "check_function": "count_check", "available": true } -} +} \ No newline at end of file diff --git a/pyQuARC/schemas/rule_mapping.json b/pyQuARC/schemas/rule_mapping.json index 2e3acc41..3ffbfb92 100644 --- a/pyQuARC/schemas/rule_mapping.json +++ b/pyQuARC/schemas/rule_mapping.json @@ -4733,70 +4733,24 @@ "severity": "info", "check_id": "datetime_compare" }, - "url_desc_presence_check": { - "rule_name": "Online Description Presence Check", + "url_description_presence_check": { + "rule_name": "url description presence check", "fields_to_apply": { - "echo-c": [ - { - "fields": [ - "Collection/OnlineAccessURLs/OnlineAccessURL/URLDescription", - "Collection/OnlineAccessURLs/OnlineAccessURL/URL" - ] - }, - { - "fields": [ - "Collection/OnlineResources/OnlineResource/Description", - "Collection/OnlineResources/OnlineResource/URL" - ] - } - ], - "echo-g": [ - { - "fields": [ - "Granule/OnlineAccessURLs/OnlineAccessURL/URLDescription", - "Granule/OnlineAccessURLs/OnlineAccessURL/URL" - ] - }, - { - "fields": [ - "Granule/OnlineResources/OnlineResource/Description", - "Granule/OnlineResources/OnlineResource/URL" - ] - } - ], - "dif10": [ - { - "fields": [ - "DIF/Related_URL/Description", - "DIF/Related_URL/URL" - ] - }, - { - "fields": [ - "DIF/Multimedia_Sample/Description", - "DIF/Multimedia_Sample/URL" - ] - } - ], "umm-c": [ - { + { "fields": [ - "RelatedUrls/Description", - "RelatedUrls/URL" + "RelatedUrls" ] - } - ], - "umm-g": [ + }, { "fields": [ - "RelatedUrls/Description", - "RelatedUrls/URL" + "DataCenters/ContactInformation/RelatedUrls" ] } ] }, "severity": "warning", - "check_id": "availability_check" + "check_id": "url_description_presence_check" }, "get_data_url_check": { "rule_name": "GET DATA URL check", diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ddd7d5db..ec9b86a7 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -9,11 +9,11 @@ class TestDownloader: def setup_method(self): self.concept_ids = { "collection": { - "real": "C1339230297-GES_DISC", + "real": "C1000000010-CDDIS", "dummy": "C123456-LPDAAC_ECS", }, "granule": { - "real": "G1370895082-GES_DISC", + "real": "G1001434969-CDDIS", "dummy": "G1000000002-CMR_PROV", }, "invalid": "asdfasdf", @@ -165,4 +165,4 @@ def test_download_real_granule_no_errors(self): downloader.download() # is the concept id valid and is the request going through? - assert downloader.errors == [] + assert downloader.errors == [] \ No newline at end of file From 41ee7de9f9f6a2ee12dfe0f3cfcc262ed7fb9225 Mon Sep 17 00:00:00 2001 From: Lavanya Ashokkumar Date: Mon, 13 Oct 2025 14:18:17 -0500 Subject: [PATCH 2/2] Fix before the merge. Code suggestions incorporated --LA --- pyQuARC/code/custom_validator.py | 4 +++- pyQuARC/code/schema_validator.py | 5 ++++- pyQuARC/schemas/check_messages.json | 4 +--- pyQuARC/schemas/checks.json | 2 +- tests/test_downloader.py | 3 ++- 5 files changed, 11 insertions(+), 7 deletions(-) diff --git a/pyQuARC/code/custom_validator.py b/pyQuARC/code/custom_validator.py index 46b8aa83..6a2beb08 100644 --- a/pyQuARC/code/custom_validator.py +++ b/pyQuARC/code/custom_validator.py @@ -308,4 +308,6 @@ def count_check(count, values, key): if not isinstance(items, list): items = [items] num_items = len(items) - return {"valid": int(count) == num_items, "value": (count, num_items)} \ No newline at end of file + return {"valid": int(count) == num_items, "value": (count, num_items)} + + \ No newline at end of file diff --git a/pyQuARC/code/schema_validator.py b/pyQuARC/code/schema_validator.py index b32d87f5..171c7e2a 100644 --- a/pyQuARC/code/schema_validator.py +++ b/pyQuARC/code/schema_validator.py @@ -185,4 +185,7 @@ def run(self, metadata): Returns: (dict): Result of the validation from xml and json schema validators """ - return self.validator_func(metadata) \ No newline at end of file + + return self.validator_func(metadata) + + \ No newline at end of file diff --git a/pyQuARC/schemas/check_messages.json b/pyQuARC/schemas/check_messages.json index 7ae98fb1..8739b373 100644 --- a/pyQuARC/schemas/check_messages.json +++ b/pyQuARC/schemas/check_messages.json @@ -991,16 +991,14 @@ }, "remediation": "Descriptions should be unique to each URL. Several of the descriptions are repeated in this record. Recommend changing the descriptions to more accurately and uniquely describe each link" }, - "url_description_presence_check": { "failure": "A URL description is missing.", "help": { "message": "", "url": "https://wiki.earthdata.nasa.gov/display/CMR/Related+URLs" }, - "remediation": "Recommend providing a description for each URL" + "remediation": "Recommend providing a description for each URL." }, - "online_resource_description_uniqueness_check": { "failure": "A URL description is duplicated: `{}`.", "help": { diff --git a/pyQuARC/schemas/checks.json b/pyQuARC/schemas/checks.json index 08dc3548..acff6c7e 100644 --- a/pyQuARC/schemas/checks.json +++ b/pyQuARC/schemas/checks.json @@ -309,4 +309,4 @@ "check_function": "count_check", "available": true } -} \ No newline at end of file +} diff --git a/tests/test_downloader.py b/tests/test_downloader.py index ec9b86a7..92ab0d0d 100644 --- a/tests/test_downloader.py +++ b/tests/test_downloader.py @@ -165,4 +165,5 @@ def test_download_real_granule_no_errors(self): downloader.download() # is the concept id valid and is the request going through? - assert downloader.errors == [] \ No newline at end of file + assert downloader.errors == [] + \ No newline at end of file