From c0bd7da63189fec2ce0440e43e7d98ed7a0cb2e0 Mon Sep 17 00:00:00 2001 From: Cody Seibert Date: Fri, 5 Jun 2026 15:12:32 -0400 Subject: [PATCH 1/2] 5434 - fallback the mime type of octet-stream when the url provided had no format extension and the format field was blank --- ckanext/datajson/package2pod.py | 10 +- .../tests/test_distribution_media_type.py | 138 ++++++++++++++++++ 2 files changed, 147 insertions(+), 1 deletion(-) create mode 100644 ckanext/datajson/tests/test_distribution_media_type.py diff --git a/ckanext/datajson/package2pod.py b/ckanext/datajson/package2pod.py index 17588e5f..ca6ddd7d 100644 --- a/ckanext/datajson/package2pod.py +++ b/ckanext/datajson/package2pod.py @@ -419,7 +419,15 @@ def generate_distribution(someValue): resource.pop('accessURL') resource['downloadURL'] = res_url if 'mediaType' not in resource: - log.warn("Missing mediaType for resource in package ['%s']", package.get('id')) + # POD v1.1 requires a mediaType whenever a downloadURL is + # present (dependencies.downloadURL.required). When the + # resource format is blank we can't derive a real MIME + # type, so fall back to the generic binary type. Without + # this the whole dataset fails schema validation and is + # silently dropped from the exported catalog. + log.warn("Missing mediaType for resource in package ['%s'], " + "defaulting to application/octet-stream", package.get('id')) + resource['mediaType'] = 'application/octet-stream' else: log.warn("Missing downloadURL for resource in package ['%s']", package.get('id')) diff --git a/ckanext/datajson/tests/test_distribution_media_type.py b/ckanext/datajson/tests/test_distribution_media_type.py new file mode 100644 index 00000000..893d3533 --- /dev/null +++ b/ckanext/datajson/tests/test_distribution_media_type.py @@ -0,0 +1,138 @@ +"""Unit tests for distribution mediaType handling during export. + +Regression coverage for the data.json export error +``'mediaType' is a required property`` (``'required': ['mediaType']``). + +The DCAT-US (POD v1.1) distribution schema requires ``mediaType`` whenever a +distribution has a ``downloadURL`` (pod_schema/federal-v1.1/dataset.json -> +``dependencies.downloadURL.required``). A "Link to a file" resource exports a +``downloadURL``; its ``mediaType`` is derived from the resource ``format`` +field. When ``format`` is blank (e.g. the URL has no file extension for CKAN to +auto-detect), no ``mediaType`` was produced and the whole dataset was silently +dropped from the exported catalog. + +These tests exercise ``Wrappers.generate_distribution`` directly so they need +no database, Solr, or running app. +""" + +from jsonschema.validators import Draft4Validator + +from ckanext.datajson.package2pod import Wrappers +from ckanext.datajson.helpers import get_validator + + +# The distribution portion of export.inventory.map.sample.json. This is the +# field map the inventory export uses; keeping a copy here makes the test +# independent of which export map happens to be configured. +INVENTORY_DISTRIBUTION_MAP = { + "distribution": { + "wrapper": "generate_distribution", + "map": { + "accessURL": {"field": "url"}, + "mediaType": {"field": "format", "wrapper": "mime_type_it"}, + "format": {"field": "formatReadable"}, + "title": {"field": "name"}, + "description": {"field": "description"}, + }, + } +} + + +def _generate_distribution(resources): + """Run the real export wrapper against a package with the given resources.""" + Wrappers.redaction_enabled = False + Wrappers.full_field_map = INVENTORY_DISTRIBUTION_MAP + Wrappers.pkg = {"id": "pkg-test", "resources": resources} + return Wrappers.generate_distribution(None) + + +# Subset of the POD v1.1 distribution definition: a downloadURL makes mediaType +# required. Used to assert the schema-level outcome without a full dataset. +DISTRIBUTION_SCHEMA = { + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "dependencies": { + "downloadURL": {"required": ["mediaType"]}, + }, +} + + +def _distribution_is_schema_valid(distribution): + validator = Draft4Validator(DISTRIBUTION_SCHEMA) + return not list(validator.iter_errors(distribution)) + + +class TestDistributionMediaType(object): + + def test_file_resource_with_blank_format_still_has_media_type(self): + """A downloadURL resource with no format must still get a mediaType.""" + dist = _generate_distribution([ + {"url": "https://example.com/download", "name": "r", + "format": "", "resource_type": "file"}, + ]) + + assert len(dist) == 1 + distribution = dist[0] + assert "downloadURL" in distribution + # The bug: mediaType was missing here, failing schema validation. + assert "mediaType" in distribution + assert _distribution_is_schema_valid(distribution) + + def test_blank_format_default_media_type_is_octet_stream(self): + """When format can't yield a real MIME type, fall back to octet-stream.""" + dist = _generate_distribution([ + {"url": "https://example.com/download", "name": "r", + "format": "", "resource_type": "file"}, + ]) + + assert dist[0]["mediaType"] == "application/octet-stream" + + def test_known_format_media_type_is_preserved(self): + """A real format must keep its proper MIME type, not the default.""" + dist = _generate_distribution([ + {"url": "https://example.com/data.csv", "name": "r", + "format": "CSV", "resource_type": "file"}, + ]) + + assert dist[0]["mediaType"] == "text/csv" + assert _distribution_is_schema_valid(dist[0]) + + def test_access_url_resource_has_no_media_type(self): + """accessURL distributions must NOT carry a mediaType (and stay valid).""" + dist = _generate_distribution([ + {"url": "https://example.com/api", "name": "r", + "format": "", "resource_type": "accessurl"}, + ]) + + assert "accessURL" in dist[0] + assert "downloadURL" not in dist[0] + assert "mediaType" not in dist[0] + assert _distribution_is_schema_valid(dist[0]) + + def test_exported_dataset_passes_full_pod_schema(self): + """End-to-end: the blank-format file resource passes the real schema.""" + dist = _generate_distribution([ + {"url": "https://example.com/download", "name": "r", + "format": "", "resource_type": "file"}, + ]) + + dataset = { + "@type": "dcat:Dataset", + "title": "Test dataset", + "description": "A description.", + "identifier": "TEST-0001", + "accessLevel": "public", + "modified": "2020-01-01", + "keyword": ["test"], + "bureauCode": ["015:11"], + "programCode": ["015:001"], + "publisher": {"@type": "org:Organization", "name": "Test Org"}, + "contactPoint": {"@type": "vcard:Contact", "fn": "Jane", + "hasEmail": "mailto:jane@example.com"}, + "distribution": dist, + } + + validator = get_validator() # federal-v1.1 dataset.json + errors = list(validator.iter_errors(dataset)) + assert errors == [], "Unexpected validation errors: %s" % ( + [e.message for e in errors]) From 755a1accbfa15d92f6213d384eca7fced30dca94 Mon Sep 17 00:00:00 2001 From: Cody Seibert Date: Mon, 8 Jun 2026 10:52:23 -0400 Subject: [PATCH 2/2] bumping version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ebedf269..96abe36a 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ setup( name='ckanext-datajson', - version='0.1.28', + version='0.1.29', description='CKAN extension to generate /data.json', long_description=long_description, long_description_content_type='text/markdown',