diff --git a/jsonschema/convert_dcat_1_1_to_3_0.py b/jsonschema/convert_dcat_1_1_to_3_0.py index a0c00ee..7c2a9ff 100755 --- a/jsonschema/convert_dcat_1_1_to_3_0.py +++ b/jsonschema/convert_dcat_1_1_to_3_0.py @@ -205,10 +205,11 @@ def fetch_dcat_catalog(url: str) -> dict: response = requests.get(url, timeout=60, impersonate="safari17_0") response.raise_for_status() except RequestException as e: - raise CatalogFetchException(f"Request failed: {e}") from e + raise CatalogFetchException(f"Request failed: {type(e).__name__}: {e!r}") from e try: - text = response.content.decode("utf-8") + text = response.content.decode("utf-8-sig") + text = text.lstrip("\ufeff") except UnicodeDecodeError: text = response.content.decode("cp1252") diff --git a/jsonschema/tests/harvest_source_urls.csv b/jsonschema/tests/harvest_source_urls.csv new file mode 100644 index 0000000..e0b8092 --- /dev/null +++ b/jsonschema/tests/harvest_source_urls.csv @@ -0,0 +1,129 @@ +url,status,has_valid_subset,notes +https://www.eac.gov/data.json,converts,n/a +https://data.ca.gov/data.json,invalid v1.1,yes +https://open.gsa.gov/data.json,converts,n/a +https://healthdata.gov/data.json,invalid v1.1,yes +https://data.kingcounty.gov/data.json?version=2,invalid v1.1,yes +https://data.hartford.gov/data.json,invalid v1.1,yes +https://data-soa-dnr.opendata.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,no +https://ddi.doi.gov/onrr-data.json,converts,n/a +https://datainventory.doi.gov/data.json,fetch error,n/a +https://www.imls.gov/sites/default/files/data.json,invalid v1.1,yes +https://ddi.doi.gov/bia-data.json,converts,n/a +https://ddi.doi.gov/bsee-data.json,invalid v1.1,yes +https://portal.opentopography.org/geoportal/csw/discovery?Request=GetCapabilities&Service=CSW&Version=2.0.2,fetch error,n/a +https://www.sba.gov/data.json,converts,n/a +https://ddi.doi.gov/fws-data.json,converts,n/a +https://ddi.doi.gov/osmre-data.json,invalid v1.1,no +https://ddi.doi.gov/usgs-data.json,invalid v1.1,yes 
+https://data.honolulu.gov/data.json?version=2,invalid v1.1,yes +https://data-fairfaxcountygis.opendata.arcgis.com/data.json,invalid v1.1,yes +https://ddi.doi.gov/doios-data.json,invalid v1.1,yes +https://edg.epa.gov/data/nongeo_data.json,invalid v1.1,yes +https://opendata.fcc.gov/data.json,invalid v1.1,yes +https://nycopendata.socrata.com/data.json?version=2,invalid v1.1,yes +https://www.ncua.gov/data.json,converts,n/a +https://www.fec.gov/data.json,converts,n/a +https://www.federalreserve.gov/PDC/data.json,converts,n/a +https://opendata.hawaii.gov/data.json,invalid v1.1,yes +https://img.exim.gov/s3fs-public/dataset/vbhv-d8am/data.json,converts,n/a +https://ddi.doi.gov/blm-data.json,converts,n/a +https://ddi.doi.gov/nps-data.json,converts,n/a +https://data.brla.gov/data.json,invalid v1.1,yes +https://www.commerce.gov/sites/default/files/data.json,fetch error,n/a +https://data.ed.gov/data.json,invalid v1.1,yes +https://data.iowa.gov/data.json,fetch error,n/a +https://www.usda.gov/sites/default/files/documents/data.json,fetch error,n/a +https://ddi.doi.gov/boem-data.json,converts,n/a +https://ddi.doi.gov/usbr-data.json,converts,n/a +https://datacatalog.cookcountyil.gov/data.json?version=2,invalid v1.1,yes +https://data.nola.gov/data.json,invalid v1.1,yes +https://data.sfgov.org/data.json?version=2,invalid v1.1,yes +https://www.state.gov/data.json,converts,n/a +https://data.ok.gov/data.json?version=2,invalid v1.1,yes +https://data.ny.gov/data.json?version=2,invalid v1.1,yes +https://www.cftc.gov/sites/default/files/CFTC-ODI-metadata-v2.json,converts,n/a +https://data.cityofchicago.org/data.json?version=2,invalid v1.1,yes +https://www.huduser.gov/data/data.json,converts,n/a +https://fhfa.gov/data/data.json,converts,n/a +https://its.ntia.gov/data.json,converts,n/a +https://www.dol.gov/data.json,converts,n/a +https://data.mo.gov/data.json?version=2,invalid v1.1,yes +https://data.baltimorecity.gov/data.json,invalid v1.1,yes +https://data.austintexas.gov/data.json,invalid 
v1.1,yes +https://data.americorps.gov/data.json,converts,n/a +https://www.consumerfinance.gov/data.json,converts,n/a +https://data.somervillema.gov/data.json?version=2,invalid v1.1,yes +https://cos-data.seattle.gov/data.json,invalid v1.1,yes +https://data.townofcary.org/data.json,invalid v1.1,no +https://www.archives.gov/files/data.json,converts,n/a +https://www.usitc.gov/data.json,converts,n/a +https://wa-node.gis.washington.edu/geoportal/csw/discovery,fetch error,n/a +https://www.bls.gov/data.json,converts,n/a +https://www.opm.gov/data.json,converts,n/a +https://data.charlottenc.gov/data.json,invalid v1.1,no +https://opendata-townofchapelhill.hub.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,yes +https://data.mcc.gov/data.json,converts,n/a +https://data.va.gov/data.json,invalid v1.1,yes +https://www.frtib.gov/data.json,converts,n/a +https://geohub-loudoungis.opendata.arcgis.com/data.json,invalid v1.1,yes +https://data.ct.gov/data.json,invalid v1.1,yes +https://data.wa.gov/data.json?version=2,invalid v1.1,yes +https://data.oregon.gov/data.json?version=2,invalid v1.1,yes +https://www.mspb.gov/data.json,converts,n/a +https://data.providenceri.gov/data.json,invalid v1.1,yes +https://public-chesva.opendata.arcgis.com/data.json,invalid v1.1,yes +https://www.eeoc.gov/data.json,converts,n/a +https://www.ssa.gov/data.json,invalid v1.1,yes +https://www.ftc.gov/data.json,converts,n/a +https://www.nsf.gov/data.json,converts,n/a +https://cwbi-app.sec.usace.army.mil/pub/data.json,invalid v1.1,no +https://www.nrc.gov/data.json,invalid v1.1,yes +https://data.transportation.gov/data.json,invalid v1.1,yes +https://geospatial-usace.opendata.arcgis.com/data.json,invalid v1.1,no +https://opendata.maryland.gov/data.json,invalid v1.1,yes +https://ngda-transportation-geoplatform.hub.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,yes +https://gisportal.ibwc.gov/agsportal/,fetch error,n/a +https://data.ferndalemi.gov/data.json,invalid v1.1,yes 
+https://data.tempe.gov/api/feed/dcat-us/1.1.json,invalid v1.1,yes +https://data.arlingtonva.us/data.json,invalid v1.1,no +https://bloomington.data.socrata.com/data.json,invalid v1.1,yes +https://data-wake.opendata.arcgis.com/data.json,invalid v1.1,yes +https://data.ntsb.gov/data.json,converts,n/a +https://secure.rrb.gov/data.json,converts,n/a +https://www.dhs.gov/data.json,converts,n/a +https://www.cpsc.gov/data.json,invalid v1.1,yes +https://data.montgomerycountymd.gov/data.json,invalid v1.1,yes +https://www.nitrd.gov/data.json,converts,n/a +https://www.archive.arm.gov/metadata/data.json,converts,n/a +https://data.usaid.gov/data.json,fetch error,n/a +https://pbgc.gov/data.json,converts,n/a +https://www.opendataphilly.org/data.json,invalid v1.1,yes +https://www.nist.gov/sites/default/files/data.json,invalid v3.0,n/a,v1.1 is valid but uses reserved `replaces` field name +https://www.treasury.gov/data.json,converts,n/a +https://pasteur.epa.gov/metadata.json,converts,n/a +https://max.gov/data.json,converts,n/a +https://www.justice.gov/data.json,converts,n/a +https://data.nasa.gov/data.json,fetch error,n/a +https://federallabs.org/data.json,converts,n/a +https://www.sec.gov/data.json,converts,n/a +https://data.lacity.org/data.json,invalid v1.1,yes +https://nehopendatastorage.blob.core.windows.net/nehopendata/data.json,converts,n/a +https://openei.org/data.json,converts,n/a +https://www.energy.gov/data.json,converts,n/a +https://www.defense.gov/data.json,fetch error,n/a +https://opendata.cityofboise.org/data.json,invalid v1.1,no +https://opendata.dc.gov/data.json,invalid v1.1,yes +https://data-lakecountyil.opendata.arcgis.com/data.json,invalid v1.1,yes +https://geodata.vermont.gov/api/feed/dcat-us/1.1.json,invalid v1.1,yes +https://dataworks.siouxfalls.gov/data.json,invalid v1.1,yes +https://data.srcity.org/data.json,fetch error,n/a +https://mapcontext.com/npswaf/sample2.json,fetch error,n/a +https://data.wprdc.org/data.json,invalid v1.1,no 
+https://apps.usgs.gov/fgdc/WAF_JSON/combined.json,converts,n/a +https://geocatalog-uidaho.hub.arcgis.com/data.json,invalid v1.1,yes +https://opendurham.nc.gov/data.json,fetch error,n/a +https://rossi.urs-tally.com/Content/data.json,converts,n/a +https://louisville-metro-opendata-lojic.hub.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,yes +https://raw.githubusercontent.com/gbinal/data/master/datasets/hhs_cas.json,fetch error,n/a +https://www.loc.gov/data.json,converts,n/a diff --git a/jsonschema/tests/validate_1_1_failure_details.log b/jsonschema/tests/validate_1_1_failure_details.log new file mode 100644 index 0000000..2c94d7e --- /dev/null +++ b/jsonschema/tests/validate_1_1_failure_details.log @@ -0,0 +1,757 @@ +Processing: https://data.ca.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 9160 error(s) across 4457 datasets: + missing required field 'bureauCode': 4457 datasets + missing required field 'programCode': 4457 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 157 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 81 datasets + dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 3 datasets + dataset[N].distribution: field is not null and invalid format, expected 'uri': 3 datasets + dataset[N].theme: field is not null and ['COVID-19', 'COVID-19'] has non-unique elements: 2 datasets +Removed 4457 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://healthdata.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 902 error(s) across 20461 datasets: + dataset[N].accrualPeriodicity: field is not null and does not match alternatives: + - value not in allowed values: ['irregular'] + - invalid ISO 8601 duration: 217 datasets + missing required field 'keyword': 201 datasets + dataset[N].temporal: field is not null and does not match alternatives: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 200 datasets + dataset[N].rights: field is not null and value is too long (max 255 characters): 66 datasets + dataset[N].describedBy: field is not null and invalid format, expected 'uri': 54 datasets + dataset[N].references: field is not null and invalid format, expected 'uri': 31 datasets + dataset[N].description: '' should be non-empty: 26 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 23 datasets + missing required field 'bureauCode': 22 datasets + missing required field 'programCode': 22 datasets + dataset[N].programCode[N]: invalid program code format (expected '###:###'): 16 datasets + dataset[N].landingPage: field is not null and invalid format, expected 'uri': 12 datasets + dataset[N].conformsTo: field is not null and invalid format, expected 'uri': 9 datasets + dataset[N].primaryITInvestmentUII: field is not null and invalid IT investment UII format (expected '###-#########'): 2 datasets + dataset[N].references: field is not null and ['https://www.cdc.gov/nchs/data/nvsr/nvsr51/nvsr51_12.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr62/nvsr62_09.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr63/nvsr63_04.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr62/nvsr62_09.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr64/nvsr64_01.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr64/nvsr64_12.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr66/nvsr66_01.pdf', 
'https://www.cdc.gov/nchs/data/nvsr/nvsr67/nvsr67_01.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr67/nvsr67_08-508.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr68/nvsr68_13-508.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr70/nvsr70-02-508.pdf'] has non-unique elements: 1 dataset +Removed 548 invalid dataset(s). 19913 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://data.kingcounty.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 748 error(s) across 341 datasets: + missing required field 'bureauCode': 341 datasets + missing required field 'programCode': 341 datasets + missing required field 'keyword': 45 datasets + dataset[N].description: '' should be non-empty: 21 datasets +Removed 341 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.hartford.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 322 error(s) across 128 datasets: + missing required field 'bureauCode': 128 datasets + missing required field 'programCode': 128 datasets + dataset[N].keyword: [] should be non-empty: 23 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 21 datasets + dataset[N].theme: field is not null and expected type 'array': 21 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 1 dataset +Removed 128 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data-soa-dnr.opendata.arcgis.com/api/feed/dcat-us/1.1.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1455 error(s) across 460 datasets: + dataset[N].license: field is not null and invalid format, expected 'uri': 460 datasets + missing required field 'bureauCode': 460 datasets + missing required field 'programCode': 460 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 27 datasets + dataset[N].theme: field is not null and expected type 'array': 27 datasets + dataset[N].description: '' should be non-empty: 10 datasets + dataset[N].keyword[N]: '' should be non-empty: 8 datasets + dataset[N].keyword: [] should be non-empty: 2 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset +Removed 460 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://ddi.doi.gov/onrr-data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 14 error(s) across 14 datasets: + missing required field 'programCode': 14 datasets +Removed 14 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://www.imls.gov/sites/default/files/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1 error(s) across 59 datasets: + dataset[N].distribution: field is not null and invalid format, expected 'uri': 1 dataset +Removed 1 invalid dataset(s). 58 valid dataset(s) remaining. 
+------------------------------------------------------ +Processing: https://ddi.doi.gov/bsee-data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 188 error(s) across 95 datasets: + missing required field 'programCode': 95 datasets + missing required field 'modified': 80 datasets + dataset[N].distribution: field is not null and missing required field 'mediaType': 13 datasets +Removed 95 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://ddi.doi.gov/osmre-data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 30 error(s) across 10 datasets: + missing required field 'bureauCode': 10 datasets + missing required field 'modified': 10 datasets + missing required field 'programCode': 10 datasets +Removed 10 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://ddi.doi.gov/usgs-data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 37392 error(s) across 37341 datasets: + missing required field 'programCode': 37341 datasets + dataset[N].distribution: field is not null and invalid format, expected 'uri': 51 datasets +Removed 37341 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.honolulu.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1197 error(s) across 402 datasets: + missing required field 'bureauCode': 402 datasets + missing required field 'programCode': 402 datasets + dataset[N].description: '' should be non-empty: 328 datasets + missing required field 'keyword': 65 datasets +Removed 402 invalid dataset(s). 0 valid dataset(s) remaining. 
+No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data-fairfaxcountygis.opendata.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 443 error(s) across 148 datasets: + missing required field 'bureauCode': 148 datasets + missing required field 'programCode': 148 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 147 datasets +Removed 148 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://ddi.doi.gov/doios-data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3 error(s) across 2 datasets: + missing required field 'programCode': 2 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset +Removed 2 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://edg.epa.gov/data/nongeo_data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 6 error(s) across 449 datasets: + missing required field 'keyword': 4 datasets + missing required field 'bureauCode': 1 dataset + missing required field 'programCode': 1 dataset +Removed 4 invalid dataset(s). 445 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://opendata.fcc.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 10 error(s) across 60 datasets: + missing required field 'keyword': 6 datasets + dataset[N].description: '' should be non-empty: 2 datasets + missing required field 'bureauCode': 1 dataset + missing required field 'programCode': 1 dataset +Removed 6 invalid dataset(s). 54 valid dataset(s) remaining. 
+------------------------------------------------------ +Processing: https://nycopendata.socrata.com/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 5991 error(s) across 2704 datasets: + missing required field 'bureauCode': 2704 datasets + missing required field 'programCode': 2704 datasets + missing required field 'keyword': 533 datasets + dataset[N].description: '' should be non-empty: 50 datasets +Removed 2704 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://opendata.hawaii.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3491 error(s) across 976 datasets: + missing required field 'bureauCode': 976 datasets + missing required field 'programCode': 976 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 822 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 453 datasets + missing required field 'keyword': 224 datasets + missing required field 'description': 39 datasets + dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 1 dataset +Removed 976 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.brla.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 519 error(s) across 254 datasets: + missing required field 'bureauCode': 254 datasets + missing required field 'programCode': 254 datasets + missing required field 'keyword': 6 datasets + dataset[N].description: '' should be non-empty: 5 datasets +Removed 254 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.ed.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1 error(s) across 906 datasets: + missing required field 'contactPoint': 1 dataset +Removed 1 invalid dataset(s). 905 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://ddi.doi.gov/usbr-data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 8745 error(s) across 8745 datasets: + missing required field 'programCode': 8745 datasets +Removed 8745 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://datacatalog.cookcountyil.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1397 error(s) across 625 datasets: + missing required field 'bureauCode': 625 datasets + missing required field 'programCode': 625 datasets + dataset[N].description: '' should be non-empty: 105 datasets + missing required field 'keyword': 42 datasets +Removed 625 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.nola.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 544 error(s) across 230 datasets: + missing required field 'bureauCode': 230 datasets + missing required field 'programCode': 230 datasets + missing required field 'keyword': 63 datasets + dataset[N].description: '' should be non-empty: 21 datasets +Removed 230 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.sfgov.org/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1771 error(s) across 703 datasets: + missing required field 'bureauCode': 703 datasets + missing required field 'programCode': 703 datasets + missing required field 'keyword': 313 datasets + dataset[N].theme: field is not null and '' should be non-empty: 35 datasets + dataset[N].description: '' should be non-empty: 17 datasets +Removed 703 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.ok.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 902 error(s) across 389 datasets: + missing required field 'bureauCode': 389 datasets + missing required field 'programCode': 389 datasets + missing required field 'keyword': 60 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 51 datasets + missing required field 'description': 13 datasets +Removed 389 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.ny.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 2344 error(s) across 1143 datasets: + missing required field 'bureauCode': 1143 datasets + missing required field 'programCode': 1143 datasets + missing required field 'keyword': 50 datasets + dataset[N].description: '' should be non-empty: 5 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 3 datasets +Removed 1143 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.cityofchicago.org/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 2285 error(s) across 1114 datasets: + missing required field 'bureauCode': 1114 datasets + missing required field 'programCode': 1114 datasets + missing required field 'keyword': 46 datasets + dataset[N].description: '' should be non-empty: 11 datasets +Removed 1114 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.mo.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 850 error(s) across 277 datasets: + missing required field 'bureauCode': 277 datasets + missing required field 'programCode': 277 datasets + missing required field 'keyword': 191 datasets + dataset[N].description: '' should be non-empty: 105 datasets +Removed 277 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.baltimorecity.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3903 error(s) across 972 datasets: + missing required field 'bureauCode': 972 datasets + missing required field 'programCode': 972 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 932 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 511 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 251 datasets + dataset[N].theme: field is not null and expected type 'array': 251 datasets + dataset[N].keyword: [] should be non-empty: 7 datasets + dataset[N].keyword[N]: '' should be non-empty: 6 datasets + dataset[N].description: '' should be non-empty: 1 dataset +Removed 972 invalid dataset(s). 
0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.austintexas.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 7066 error(s) across 2839 datasets: + missing required field 'bureauCode': 2839 datasets + missing required field 'programCode': 2839 datasets + missing required field 'keyword': 1279 datasets + dataset[N].description: '' should be non-empty: 109 datasets +Removed 2839 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.somervillema.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 106 error(s) across 48 datasets: + missing required field 'bureauCode': 48 datasets + missing required field 'programCode': 48 datasets + missing required field 'keyword': 9 datasets + dataset[N].description: '' should be non-empty: 1 dataset +Removed 48 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://cos-data.seattle.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1405 error(s) across 688 datasets: + missing required field 'bureauCode': 688 datasets + missing required field 'programCode': 688 datasets + missing required field 'keyword': 25 datasets + dataset[N].description: '' should be non-empty: 3 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset +Removed 688 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.townofcary.org/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 269 error(s) across 78 datasets: + missing required field 'bureauCode': 78 datasets + missing required field 'hasEmail': 78 datasets + missing required field 'programCode': 78 datasets + dataset[N].publisher.name: expected type 'string': 14 datasets + dataset[N].description: expected type 'string': 13 datasets + missing required field 'keyword': 8 datasets +Removed 78 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.charlottenc.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1665 error(s) across 372 datasets: + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 372 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 372 datasets + missing required field 'bureauCode': 372 datasets + missing required field 'programCode': 372 datasets + dataset[N].description: '' should be non-empty: 74 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 31 datasets + dataset[N].theme: field is not null and expected type 'array': 31 datasets + dataset[N].modified: does not match any alternative: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 20 datasets + dataset[N].keyword: [] should be non-empty: 15 datasets + dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 5 datasets + dataset[N].keyword[N]: '' should be non-empty: 1 dataset +Removed 372 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://opendata-townofchapelhill.hub.arcgis.com/api/feed/dcat-us/1.1.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 468 error(s) across 123 datasets: + missing required field 'bureauCode': 123 datasets + missing required field 'programCode': 123 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 82 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 64 datasets + dataset[N].theme: field is not null and expected type 'array': 64 datasets + dataset[N].keyword[N]: '' should be non-empty: 6 datasets + dataset[N].description: '' should be non-empty: 5 datasets + dataset[N].keyword: [] should be non-empty: 1 dataset +Removed 123 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.va.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 303 error(s) across 1962 datasets: + missing required field 'keyword': 115 datasets + missing required field 'bureauCode': 61 datasets + missing required field 'programCode': 61 datasets + dataset[N].description: '' should be non-empty: 47 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 6 datasets + dataset[N].temporal: field is not null and does not match alternatives: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 4 datasets + dataset[N].bureauCode[N]: does not match pattern '[0-9]{3}:[0-9]{2}': 2 datasets + dataset[N].language: field is not null and does not match pattern 
'^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$': 2 datasets + dataset[N].programCode[N]: invalid program code format (expected '###:###'): 2 datasets + dataset[N].accrualPeriodicity: field is not null and does not match alternatives: + - value not in allowed values: ['irregular'] + - invalid ISO 8601 duration: 1 dataset + dataset[N].landingPage: field is not null and invalid format, expected 'uri': 1 dataset + dataset[N].references: field is not null and invalid format, expected 'uri': 1 dataset +Removed 205 invalid dataset(s). 1757 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://geohub-loudoungis.opendata.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 783 error(s) across 226 datasets: + missing required field 'bureauCode': 226 datasets + missing required field 'programCode': 226 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 144 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 92 datasets + dataset[N].theme: field is not null and expected type 'array': 92 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 2 datasets + dataset[N].keyword[N]: '' should be non-empty: 1 dataset +Removed 226 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.ct.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 2570 error(s) across 1213 datasets: + missing required field 'bureauCode': 1213 datasets + missing required field 'programCode': 1213 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 69 datasets + dataset[N].description: '' should be non-empty: 37 datasets + missing required field 'keyword': 36 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 2 datasets +Removed 1213 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.wa.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 4470 error(s) across 1954 datasets: + missing required field 'bureauCode': 1954 datasets + missing required field 'programCode': 1954 datasets + dataset[N].description: '' should be non-empty: 493 datasets + missing required field 'keyword': 65 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 4 datasets +Removed 1954 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.oregon.gov/data.json?version=2 +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 2319 error(s) across 981 datasets: + missing required field 'bureauCode': 981 datasets + missing required field 'programCode': 981 datasets + missing required field 'keyword': 260 datasets + dataset[N].description: '' should be non-empty: 87 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 10 datasets +Removed 981 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.providenceri.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 524 error(s) across 234 datasets: + missing required field 'bureauCode': 234 datasets + missing required field 'programCode': 234 datasets + missing required field 'keyword': 32 datasets + dataset[N].description: '' should be non-empty: 24 datasets +Removed 234 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://public-chesva.opendata.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 93 error(s) across 36 datasets: + missing required field 'bureauCode': 36 datasets + missing required field 'programCode': 36 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 11 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 5 datasets + dataset[N].theme: field is not null and expected type 'array': 5 datasets +Removed 36 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://www.ssa.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 984 error(s) across 2416 datasets: + dataset[N].isPartOf: expected type 'string': 911 datasets + dataset[N].temporal: field is not null and does not match alternatives: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 73 datasets +Removed 920 invalid dataset(s). 1496 valid dataset(s) remaining. 
+------------------------------------------------------ +Processing: https://cwbi-app.sec.usace.army.mil/pub/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 6 error(s) across 2 datasets: + dataset[N].modified: does not match any alternative: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 2 datasets + missing required field 'bureauCode': 2 datasets + missing required field 'programCode': 2 datasets +Removed 2 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://www.nrc.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 14 error(s) across 230 datasets: + dataset[N].modified: does not match any alternative: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 14 datasets +Removed 14 invalid dataset(s). 216 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://data.transportation.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 190 error(s) across 1760 datasets: + missing required field 'programCode': 62 datasets + missing required field 'keyword': 52 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 20 datasets + dataset[N].description: '' should be non-empty: 15 datasets + missing required field 'bureauCode': 15 datasets + dataset[N].temporal: field is not null and does not match alternatives: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 12 datasets + dataset[N].accrualPeriodicity: field is not null and does not match alternatives: + - value not in allowed values: ['irregular'] + - invalid ISO 8601 duration: 7 datasets + dataset[N].distribution: field is not null and invalid format, expected 'uri': 3 datasets + dataset[N].landingPage: field is not null and 
invalid format, expected 'uri': 2 datasets + dataset[N].describedBy: field is not null and invalid format, expected 'uri': 1 dataset + dataset[N].references: field is not null and invalid format, expected 'uri': 1 dataset +Removed 115 invalid dataset(s). 1645 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://geospatial-usace.opendata.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 369 error(s) across 361 datasets: + dataset[N].license: field is not null and invalid format, expected 'uri': 361 datasets + dataset[N].description: '' should be non-empty: 4 datasets + dataset[N].keyword[N]: '' should be non-empty: 3 datasets + dataset[N].keyword: [] should be non-empty: 1 dataset +Removed 361 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://opendata.maryland.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 6243 error(s) across 2451 datasets: + missing required field 'bureauCode': 2451 datasets + missing required field 'programCode': 2451 datasets + missing required field 'keyword': 1341 datasets +Removed 2451 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://ngda-transportation-geoplatform.hub.arcgis.com/api/feed/dcat-us/1.1.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 28 error(s) across 46 datasets: + dataset[N].spatial: field is not null and '' should be non-empty: 14 datasets + dataset[N].theme: field is not null and expected type 'array': 14 datasets +Removed 14 invalid dataset(s). 32 valid dataset(s) remaining. 
+------------------------------------------------------ +Processing: https://data.ferndalemi.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3802 error(s) across 1132 datasets: + missing required field 'bureauCode': 1132 datasets + missing required field 'programCode': 1132 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 858 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 305 datasets + dataset[N].theme: field is not null and expected type 'array': 305 datasets + dataset[N].description: '' should be non-empty: 49 datasets + dataset[N].keyword[N]: '' should be non-empty: 13 datasets + dataset[N].title: expected type 'string': 3 datasets + dataset[N].keyword: [] should be non-empty: 2 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset + dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 1 dataset + dataset[N].modified: does not match any alternative: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 1 dataset +Removed 1132 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data.tempe.gov/api/feed/dcat-us/1.1.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1267 error(s) across 621 datasets: + missing required field 'bureauCode': 621 datasets + missing required field 'programCode': 621 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 7 datasets + dataset[N].keyword[N]: '' should be non-empty: 6 datasets + dataset[N].modified: does not match any alternative: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 6 datasets + dataset[N].description: '' should be non-empty: 4 datasets + dataset[N].keyword: [] should be non-empty: 2 datasets +Removed 621 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.arlingtonva.us/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 656 error(s) across 185 datasets: + missing required field 'bureauCode': 185 datasets + missing required field 'keyword': 185 datasets + missing required field 'programCode': 185 datasets + dataset[N].modified: expected type 'string': 98 datasets + dataset[N].distribution: field is not null and invalid format, expected 'uri': 3 datasets +Removed 185 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://bloomington.data.socrata.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 732 error(s) across 278 datasets: + missing required field 'bureauCode': 278 datasets + missing required field 'programCode': 278 datasets + missing required field 'keyword': 151 datasets + dataset[N].description: '' should be non-empty: 25 datasets +Removed 278 invalid dataset(s). 0 valid dataset(s) remaining. 
+No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data-wake.opendata.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1889 error(s) across 521 datasets: + missing required field 'bureauCode': 521 datasets + missing required field 'programCode': 521 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 414 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 228 datasets + dataset[N].description: '' should be non-empty: 165 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 17 datasets + dataset[N].theme: field is not null and expected type 'array': 17 datasets + dataset[N].keyword: [] should be non-empty: 5 datasets + dataset[N].keyword[N]: '' should be non-empty: 1 dataset +Removed 521 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://www.cpsc.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1 error(s) across 8 datasets: + dataset[N].distribution: field is not null and invalid format, expected 'uri': 1 dataset +Removed 1 invalid dataset(s). 7 valid dataset(s) remaining. +------------------------------------------------------ +Processing: https://data.montgomerycountymd.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1102 error(s) across 499 datasets: + missing required field 'bureauCode': 499 datasets + missing required field 'programCode': 499 datasets + missing required field 'keyword': 74 datasets + dataset[N].description: '' should be non-empty: 30 datasets +Removed 499 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://www.opendataphilly.org/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 1323 error(s) across 462 datasets: + missing required field 'bureauCode': 462 datasets + missing required field 'programCode': 462 datasets + dataset[N].modified: expected type 'string': 373 datasets + dataset[N].description: '' should be non-empty: 19 datasets + dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 4 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 2 datasets + dataset[N].keyword[N]: expected type 'string': 1 dataset +Removed 462 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://federallabs.org/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 2 error(s) across 1 datasets: + missing required field 'bureauCode': 1 dataset + missing required field 'programCode': 1 dataset +Removed 1 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.lacity.org/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 866 error(s) across 383 datasets: + missing required field 'bureauCode': 383 datasets + missing required field 'programCode': 383 datasets + missing required field 'keyword': 83 datasets + dataset[N].description: '' should be non-empty: 17 datasets +Removed 383 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://opendata.cityofboise.org/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 298 error(s) across 89 datasets: + dataset[N].license: field is not null and invalid format, expected 'uri': 89 datasets + missing required field 'bureauCode': 89 datasets + missing required field 'programCode': 89 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 13 datasets + dataset[N].theme: field is not null and expected type 'array': 13 datasets + dataset[N].description: '' should be non-empty: 2 datasets + dataset[N].keyword[N]: '' should be non-empty: 2 datasets + dataset[N].keyword: [] should be non-empty: 1 dataset +Removed 89 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://opendata.dc.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3942 error(s) across 1867 datasets: + missing required field 'bureauCode': 1867 datasets + missing required field 'programCode': 1867 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 72 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 58 datasets + dataset[N].theme: field is not null and expected type 'array': 58 datasets + dataset[N].keyword[N]: '' should be non-empty: 8 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 7 datasets + dataset[N].keyword: [] should be non-empty: 3 datasets + dataset[N].description: '' should be non-empty: 2 datasets +Removed 1867 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://data-lakecountyil.opendata.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3527 error(s) across 1548 datasets: + missing required field 'bureauCode': 1548 datasets + missing required field 'programCode': 1548 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 113 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 105 datasets + dataset[N].theme: field is not null and expected type 'array': 105 datasets + dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 95 datasets + dataset[N].description: '' should be non-empty: 10 datasets + dataset[N].keyword[N]: '' should be non-empty: 3 datasets +Removed 1548 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://geodata.vermont.gov/api/feed/dcat-us/1.1.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 3664 error(s) across 1117 datasets: + missing required field 'bureauCode': 1117 datasets + missing required field 'programCode': 1117 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 950 datasets + dataset[N].contactPoint.hasEmail: invalid mailto URI format: 406 datasets + dataset[N].description: '' should be non-empty: 44 datasets + dataset[N].keyword: [] should be non-empty: 16 datasets + dataset[N].keyword[N]: '' should be non-empty: 9 datasets + dataset[N].title: expected type 'string': 5 datasets +Removed 1117 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://dataworks.siouxfalls.gov/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 473 error(s) across 236 datasets: + missing required field 'bureauCode': 236 datasets + missing required field 'programCode': 236 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 1 dataset +Removed 236 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://data.wprdc.org/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 2336 error(s) across 368 datasets: + missing required field 'bureauCode': 368 datasets + missing required field 'identifier': 368 datasets + missing required field 'keyword': 368 datasets + missing required field 'programCode': 368 datasets + missing required field 'fn': 362 datasets + missing required field 'hasEmail': 362 datasets + dataset[N].distribution: field is not null and missing required field 'mediaType': 139 datasets + missing required field 'description': 1 dataset +Removed 368 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://geocatalog-uidaho.hub.arcgis.com/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 477 error(s) across 191 datasets: + missing required field 'bureauCode': 191 datasets + missing required field 'programCode': 191 datasets + dataset[N].license: field is not null and invalid format, expected 'uri': 83 datasets + dataset[N].spatial: field is not null and '' should be non-empty: 6 datasets + dataset[N].theme: field is not null and expected type 'array': 6 datasets +Removed 191 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. 
+------------------------------------------------------ +Processing: https://rossi.urs-tally.com/Content/data.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 6 error(s) across 3 datasets: + missing required field 'bureauCode': 3 datasets + missing required field 'programCode': 3 datasets +Removed 3 invalid dataset(s). 0 valid dataset(s) remaining. +No valid datasets remain after filtering. +------------------------------------------------------ +Processing: https://louisville-metro-opendata-lojic.hub.arcgis.com/api/feed/dcat-us/1.1.json +Warning: catalog has invalid data, filtering it out: +v1.1 validation failed with 9 error(s) across 515 datasets: + dataset[N].keyword: [] should be non-empty: 4 datasets + dataset[N].modified: does not match any alternative: + - invalid ISO 8601 date/datetime + - invalid ISO 8601 repeating interval: 4 datasets + dataset[N].keyword[N]: '' should be non-empty: 1 dataset +Removed 9 invalid dataset(s). 506 valid dataset(s) remaining. +------------------------------------------------------ diff --git a/jsonschema/v1.1_definitions/dataset.json b/jsonschema/v1.1_definitions/non-federal_dataset.json similarity index 55% rename from jsonschema/v1.1_definitions/dataset.json rename to jsonschema/v1.1_definitions/non-federal_dataset.json index 90275fb..34e243d 100644 --- a/jsonschema/v1.1_definitions/dataset.json +++ b/jsonschema/v1.1_definitions/non-federal_dataset.json @@ -1,12 +1,9 @@ { - "$schema": "http://json-schema.org/draft-04/schema#", - "id": "https://project-open-data.cio.gov/v1.1/schema/dataset.json#", + "$schema": "https://json-schema.org/draft/2020-12/schema", "title": "Project Open Data Dataset", "description": "The metadata format for all federal open data. 
Validates a single JSON object entry (as opposed to entire Data.json catalog).", "type": "object", "required": [ - "bureauCode", - "programCode", "title", "description", "keyword", @@ -20,18 +17,12 @@ "@type": { "title": "Metadata Context", "description": "IRI for the JSON-LD data type. This should be dcat:Dataset for each Dataset", - "enum": [ - "dcat:Dataset" - ] + "const": "dcat:Dataset" }, "accessLevel": { "description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)", "title": "Public Access Level", - "enum": [ - "public", - "restricted public", - "non-public" - ] + "enum": ["public", "restricted public", "non-public"] }, "rights": { "title": "Rights", @@ -52,9 +43,7 @@ "description": "Frequency with which dataset is published.", "anyOf": [ { - "enum": [ - "irregular" - ] + "const": "irregular" }, { "type": "string", @@ -62,22 +51,37 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, "bureauCode": { "title": "Bureau Code", "description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{2}" - }, - "minItems": 1, - "uniqueItems": true + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "pattern": "[0-9]{3}:[0-9]{2}" + }, + "minItems": 1, + "uniqueItems": true + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "contactPoint": { - "$ref": "vcard.json" + "$ref": "#/$defs/vcard" }, "describedBy": { "title": "Data Dictionary", @@ -89,6 +93,10 @@ }, { "type": "null" + }, + { + "type": 
"string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -102,6 +110,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -115,6 +127,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -127,6 +143,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -134,7 +154,8 @@ "title": "Description", "description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.", "type": "string", - "minLength": 1 + "minLength": 1, + "maxLength": 10000 }, "distribution": { "title": "Distribution", @@ -143,13 +164,25 @@ { "type": "array", "items": { - "$ref": "distribution.json", - "minItems": 1, - "uniqueItems": true + "anyOf": [ + { + "minItems": 1, + "uniqueItems": true, + "$ref": "#/$defs/distribution" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] } }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -169,18 +202,32 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, "keyword": { "title": "Tags", "description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.", - "type": "array", - "items": { - "type": "string", - "minLength": 1 - }, - "minItems": 1 + "anyOf": [ + { + "type": "array", + "items": { + "type": "string", + "minLength": 1, + "maxLength": 1000 + }, + "minItems": 1, + "maxItems": 1000 + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] }, "landingPage": { "title": "Homepage URL", @@ -192,6 +239,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -199,15 +250,19 @@ "title": "Language", "description": 
"The language of the dataset.", "anyOf": [ + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + }, { "type": "array", "items": { "type": "string", "pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$" } - }, - { - "type": "null" } ] }, @@ -221,6 +276,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -239,6 +298,10 @@ { "type": "string", "pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -252,57 +315,71 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, "programCode": { "title": "Program Code", "description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. 
Use the format of 015:001", - "type": "array", - "items": { - "type": "string", - "pattern": "[0-9]{3}:[0-9]{3}" - }, - "minItems": 1, - "uniqueItems": true - }, - "publisher": { - "$ref": "organization.json" - }, - "references": { - "title": "Related Documents", - "description": "Related documents such as technical information about a dataset, developer documentation, etc.", "anyOf": [ { "type": "array", "items": { "type": "string", - "format": "uri" + "pattern": "[0-9]{3}:[0-9]{3}" }, "minItems": 1, "uniqueItems": true }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, - "spatial": { - "title": "Spatial", - "description": "The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.", + "publisher": { + "$ref": "#/$defs/organization" + }, + "references": { + "title": "Related Documents", + "description": "Related documents such as technical information about a dataset, developer documentation, etc.", "anyOf": [ { - "type": "string", - "minLength": 1 + "type": "array", + "items": { + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "minItems": 1, + "uniqueItems": true }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, - "systemOfRecords": { - "title": "System of Records", - "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "spatial": { + "title": "Spatial", + "description": "The range of spatial applicability of a dataset. 
Could include a spatial region like a bounding box or a named place.", + "anyOf": [ { "type": "string", @@ -330,6 +407,19 @@ } ] }, + "systemOfRecords": { + "title": "System of Records", + "description": "If the system is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, "temporal": { "title": "Temporal", "description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).", @@ -348,6 +438,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -358,6 +452,9 @@ { "type": "string", "minLength": 1 + }, + { + "type": "null" } ] }, @@ -376,6 +473,10 @@ }, { "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" } ] }, @@ -385,5 +486,248 @@ "type": "string", "minLength": 1 } + }, + "$id": "https://project-open-data.cio.gov/v1.1/schema/dataset.json#", + "$defs": { + "vcard": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Project Open Data ContactPoint vCard", + "description": "A Dataset ContactPoint as a vCard object", + "type": "object", + "required": ["fn", "hasEmail"], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type.
This should be vcard:Contact for contactPoint", + "const": "vcard:Contact" + }, + "fn": { + "title": "Contact Name", + "description": "A full formatted name, eg Firstname Lastname", + "type": "string", + "minLength": 1 + }, + "hasEmail": { + "title": "Email", + "description": "Email address for the contact", + "anyOf": [ + { + "pattern": "^mailto:[\\w\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:.-]+@[\\w.-]+\\.[\\w.-]+?$", + "type": "string" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + } + }, + "$id": "https://project-open-data.cio.gov/v1.1/schema/vcard.json#" + }, + "distribution": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Project Open Data Distribution", + "description": "Validates a single distribution object of a dataset (a downloadable file or an access point).", + "type": "object", + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type.
This should be dcat:Distribution for each Distribution", + "const": "dcat:Distribution" + }, + "downloadURL": { + "title": "Download URL", + "description": "URL providing direct access to a downloadable file of a dataset", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "mediaType": { + "title": "Media Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s downloadURL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "format": { + "title": "Format", + "description": "A human-readable description of the file format of a distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + } + ] + }, + "accessURL": { + "title": "Access URL", + "description": "URL providing indirect access to a dataset", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "description": { + "title": "Description", + "description": "Human-readable description of the distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1, + "maxLength": 10000 + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "title": { + "title": "Title", + "description": "Human-readable name of the distribution", + "anyOf": [ + { + "type": "string", + "minLength": 1 + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "conformsTo": { + "title": "Data Standard", + "description": "URI identifying a standardized specification the distribution conforms to", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + }, + { + "type":
"string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "describedBy": { + "title": "Data Dictionary", + "description": "URL to the data dictionary for the distribution found at the downloadURL", + "anyOf": [ + { + "type": "string", + "format": "uri" + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + }, + "describedByType": { + "title": "Data Dictionary Type", + "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s describedBy URL", + "anyOf": [ + { + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$", + "type": "string" + }, + { + "type": "null" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + } + }, + "$id": "https://project-open-data.cio.gov/v1.1/schema/distribution.json#", + "dependentSchemas": { + "downloadURL": { + "properties": { + "mediaType": { + "anyOf": [ + { + "type": "string", + "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$" + }, + { + "type": "string", + "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$" + } + ] + } + }, + "required": ["mediaType"] + } + } + }, + "organization": { + "$schema": "https://json-schema.org/draft/2020-12/schema", + "title": "Project Open Data Organization", + "description": "A Dataset Publisher Organization as a foaf:Agent object", + "type": "object", + "required": ["name"], + "properties": { + "@type": { + "title": "Metadata Context", + "description": "IRI for the JSON-LD data type. 
This should be org:Organization for each publisher", + "const": "org:Organization" + }, + "name": { + "title": "Publisher Name", + "description": "A full formatted name, eg Firstname Lastname", + "type": "string", + "minLength": 1 + }, + "subOrganizationOf": { + "title": "Parent Organization", + "$ref": "#" + } + }, + "$id": "https://project-open-data.cio.gov/v1.1/schema/organization.json#" + } } -} \ No newline at end of file +} diff --git a/jsonschema/validate_1_1.py b/jsonschema/validate_1_1.py new file mode 100644 index 0000000..a0042aa --- /dev/null +++ b/jsonschema/validate_1_1.py @@ -0,0 +1,328 @@ +#!/usr/bin/env python3 +"""Convert a valid DCAT-US v1.1 catalog to a valid DCAT-US v3.0 catalog.""" +import json +import re +from pathlib import Path + +import click +from curl_cffi import requests +from curl_cffi.requests.exceptions import RequestException +from jsonschema import Draft202012Validator +from referencing import Registry, Resource + + +V1_1_CATALOG_SCHEMA_ID = "https://project-open-data.cio.gov/v1.1/schema/catalog.json" +SCRIPT_DIR = Path(__file__).parent +V1_1_DEFINITIONS_DIR = SCRIPT_DIR / "v1.1_definitions" +PATTERN_DESCRIPTIONS = { + "mailto": "invalid mailto URI format", + "R\\/P": "invalid ISO 8601 duration", + r"[\+-]?\d{4}.*\/": "invalid ISO 8601 interval", + r"R\d*\/": "invalid ISO 8601 repeating interval", + "[0-9]{3}:[0-9]{3}": "invalid program code format (expected '###:###')", + "[0-9]{3}-[0-9]{9}": "invalid IT investment UII format (expected '###-#########')", + r"[\+-]?\d{4}(?!\d{2}": "invalid ISO 8601 date/datetime", +} + + +class CatalogFetchException(Exception): + pass + + +class CatalogValidationException(Exception): + pass + + +def _describe_pattern(pattern: str) -> str: + """Return a human-readable description for a known regex pattern, or None.""" + for substring, description in PATTERN_DESCRIPTIONS.items(): + if substring in pattern: + return description + return None + + +def format_path(path): + """Format a jsonschema path as 
a readable string like 'subject[0].inScheme'.""" + if not path: + return "(root)" + parts = [] + for p in path: + if isinstance(p, int): + # Array index - append to previous part + if parts: + parts[-1] = f"{parts[-1]}[{p}]" + else: + parts.append(f"[{p}]") + else: + parts.append(str(p)) + return ".".join(parts) + + +def format_validation_errors(errors, indent=0): + """Summarize validation errors grouped by type with occurrence counts.""" + prefix = " " * indent + counts = {} + + for error in errors: + summary = summarize_error(error) + if not summary: + continue + # Normalize array indices to [N] so errors from different items group together + key = re.sub(r'\[\d+\]', '[N]', summary) + counts[key] = counts.get(key, 0) + 1 + + if not counts: + return "" + + # Sort by count descending, then alphabetically for stable output + sorted_items = sorted(counts.items(), key=lambda x: (-x[1], x[0])) + + lines = [] + for msg, count in sorted_items: + noun = "dataset" if count == 1 else "datasets" + lines.append(f"{prefix}{msg}: {count} {noun}") + + return "\n".join(lines) + + +def summarize_error(error, prefix="", is_suberror=False): + """Summarize a single error into a human-readable string.""" + path = format_path(error.path) + + # Handle anyOf/oneOf errors by finding meaningful sub-errors + if error.validator in ("anyOf", "oneOf") and error.context: + meaningful = find_meaningful_errors(error.context) + + if not meaningful: + return f"{prefix}{path}: field is not null and does not match any allowed type" + + has_null_alternative = any(is_null_type_error(e) for e in error.context) + + summaries = [] + for sub_error in meaningful: + sub_summary = summarize_error(sub_error, prefix="", is_suberror=True) + if sub_summary: + summaries.append(sub_summary) + + # Collapse repeated identical sub-errors to a single bullet + unique_summaries = list(dict.fromkeys(summaries)) + + if has_null_alternative and unique_summaries: + intro = f"{path}: field is not null and " + if 
len(unique_summaries) == 1: + return f"{prefix}{intro}{unique_summaries[0]}" + else: + return f"{prefix}{intro}does not match alternatives:\n" + "\n".join( + f"{prefix} - {s}" for s in unique_summaries + ) + elif unique_summaries: + if len(unique_summaries) == 1: + return f"{prefix}{path}: {unique_summaries[0]}" + else: + return f"{prefix}{path}: does not match any alternative:\n" + "\n".join( + f"{prefix} - {s}" for s in unique_summaries + ) + + # Handle $ref errors - find the expected class + if "$ref" in error.schema: + class_name = extract_schema_name(error.schema) + if error.context: + meaningful = find_meaningful_errors(error.context) + if meaningful: + sub_summaries = [summarize_error(e, prefix="", is_suberror=True) for e in meaningful] + sub_summaries = [s for s in sub_summaries if s] + if sub_summaries: + if class_name: + return f"does not conform to {class_name}: {'; '.join(sub_summaries)}" + return "; ".join(sub_summaries) + if class_name: + return f"does not conform to {class_name}" + + # Handle required field errors + if error.validator == "required": + missing = error.validator_value + if isinstance(missing, list): + missing_fields = [f for f in missing if f in error.message] + if missing_fields: + return f"missing required field '{missing_fields[0]}'" + if "is a required property" in error.message: + field = error.message.split("'")[1] + return f"missing required field '{field}'" + return error.message + + # Handle type errors + if error.validator == "type": + expected = error.validator_value + if isinstance(expected, list): + expected = " or ".join(expected) + if is_suberror: + return f"expected type '{expected}'" + return f"{prefix}{path}: expected type '{expected}'" + + # Handle enum errors + if error.validator == "enum": + if is_suberror: + return f"value not in allowed values: {error.validator_value}" + return f"{prefix}{path}: value not in allowed values: {error.validator_value}" + + # Handle pattern errors + if error.validator == "pattern": + 
description = _describe_pattern(error.validator_value) + msg = description if description else f"does not match pattern '{error.validator_value}'" + if is_suberror: + return msg + return f"{prefix}{path}: {msg}" + + # Handle format errors + if error.validator == "format": + msg = f"invalid format, expected '{error.validator_value}'" + if is_suberror: + return msg + return f"{prefix}{path}: {msg}" + + # Handle maxLength errors - omit the value to allow grouping + if error.validator == "maxLength": + msg = f"value is too long (max {error.validator_value} characters)" + if is_suberror: + return msg + return f"{prefix}{path}: {msg}" + + # Default: use the message, prepending path if available + if not is_suberror and path and path != "(root)": + return f"{prefix}{path}: {error.message}" + return f"{prefix}{error.message}" + + +def find_meaningful_errors(errors): + """Filter errors to find the meaningful ones, skipping null-type failures.""" + meaningful = [] + for error in errors: + if is_null_type_error(error): + continue + meaningful.append(error) + return meaningful if meaningful else list(errors) + + +def is_null_type_error(error): + """Check if this error is just 'type is not null'.""" + return (error.validator == "type" and + error.validator_value == "null") + + +def extract_schema_name(schema): + """Extract a human-readable schema/class name from a schema definition.""" + if isinstance(schema, dict): + if "$ref" in schema: + # Extract class name from ref like "/dcat-us/3.0.0/definitions/concept" + ref = schema["$ref"] + return ref.split("/")[-1].title() + if "title" in schema: + return schema["title"] + return None + + +def load_schema_registry(definitions_dir: Path) -> Registry: + registry = Registry() + for schema_file in definitions_dir.glob("*.json"): + with schema_file.open() as f: + resource = Resource.from_contents(json.load(f)) + registry = resource @ registry + return registry + + +def fetch_dcat_catalog(url: str) -> dict: + """Fetch a DCAT-US v1.1 
catalog to validate.""" + # Some target servers (e.g. usda.gov) reject non-browser TLS/HTTP2 fingerprints, so + # we impersonate a real browser using curl_cffi. + try: + response = requests.get(url, timeout=60, impersonate="safari17_0") + response.raise_for_status() + except RequestException as e: + raise CatalogFetchException(f"Request failed: {type(e).__name__}: {e!r}") from e + + try: + text = response.content.decode("utf-8-sig") + text = text.lstrip("\ufeff") + except UnicodeDecodeError: + text = response.content.decode("cp1252") + + try: + data = json.loads(text) + if isinstance(data, list): + raise CatalogFetchException("Response is a JSON array, not a catalog object") + return data + except ValueError as e: + raise CatalogFetchException(f"Response was not valid JSON: {e}") from e + + +def validate_catalog(schema_id: str, registry: Registry, catalog: dict) -> None: + """Validate a catalog and raise CatalogValidationException if invalid.""" + validator = Draft202012Validator( + {"$ref": schema_id}, + registry=registry, + format_checker=Draft202012Validator.FORMAT_CHECKER, + ) + errors = list(validator.iter_errors(catalog)) + if errors: + version_number = "v1.1" if "v1.1" in schema_id else "v3.0" + dataset_count = len(catalog.get("dataset", [])) + raise CatalogValidationException( + f"{version_number} validation failed with {len(errors)} error(s) across {dataset_count} datasets:\n" + + format_validation_errors(errors, indent=2) + ) + + +def filter_invalid_datasets(schema_id: str, registry: Registry, catalog: dict) -> tuple[dict, int]: + """ + Return a copy of the catalog with invalid datasets removed, plus the count removed. + + Works by validating each dataset individually against the catalog schema. + Datasets whose index appears in any top-level error path are dropped. 
+ """ + validator = Draft202012Validator( + {"$ref": schema_id}, + registry=registry, + format_checker=Draft202012Validator.FORMAT_CHECKER, + ) + errors = list(validator.iter_errors(catalog)) + + # Collect the indices of datasets implicated in at least one error. + # jsonschema paths look like: deque(['dataset', 3, 'title', ...]) + bad_indices = set() + for error in errors: + path = list(error.absolute_path) + if len(path) >= 2 and path[0] == "dataset" and isinstance(path[1], int): + bad_indices.add(path[1]) + + datasets = catalog.get("dataset", []) + filtered = [ds for i, ds in enumerate(datasets) if i not in bad_indices] + removed = len(datasets) - len(filtered) + + return {**catalog, "dataset": filtered}, removed + + +@click.command() +@click.option("-u", "--url", help="URL of DCAT-US v1.1 catalog to be converted", required=True) +def main(url): + v1_1_registry = load_schema_registry(V1_1_DEFINITIONS_DIR) + try: + catalog_to_convert = fetch_dcat_catalog(url) + except CatalogFetchException as e: + click.echo(f"There was an error fetching a DCAT-US v1.1 catalog to convert: {e}", err=True) + return 1 + + try: + validate_catalog(V1_1_CATALOG_SCHEMA_ID, v1_1_registry, catalog_to_convert) + except CatalogValidationException as e: + click.echo(f"Warning: catalog has invalid data, filtering it out:\n{e}", err=True) + catalog_to_convert, removed = filter_invalid_datasets( + V1_1_CATALOG_SCHEMA_ID, v1_1_registry, catalog_to_convert + ) + remaining = len(catalog_to_convert.get("dataset", [])) + click.echo(f"Removed {removed} invalid dataset(s). {remaining} valid dataset(s) remaining.", err=True) + if remaining == 0: + click.echo("No valid datasets remain after filtering.", err=True) + return 1 + +if __name__ == "__main__": + main(standalone_mode=False)