diff --git a/jsonschema/convert_dcat_1_1_to_3_0.py b/jsonschema/convert_dcat_1_1_to_3_0.py
index a0c00ee..7c2a9ff 100755
--- a/jsonschema/convert_dcat_1_1_to_3_0.py
+++ b/jsonschema/convert_dcat_1_1_to_3_0.py
@@ -205,10 +205,11 @@ def fetch_dcat_catalog(url: str) -> dict:
response = requests.get(url, timeout=60, impersonate="safari17_0")
response.raise_for_status()
except RequestException as e:
- raise CatalogFetchException(f"Request failed: {e}") from e
+ raise CatalogFetchException(f"Request failed: {type(e).__name__}: {e!r}") from e
try:
- text = response.content.decode("utf-8")
+ text = response.content.decode("utf-8-sig")
+ text = text.lstrip("\ufeff")
except UnicodeDecodeError:
text = response.content.decode("cp1252")
diff --git a/jsonschema/tests/harvest_source_urls.csv b/jsonschema/tests/harvest_source_urls.csv
new file mode 100644
index 0000000..e0b8092
--- /dev/null
+++ b/jsonschema/tests/harvest_source_urls.csv
@@ -0,0 +1,129 @@
+url,status,has_valid_subset,notes
+https://www.eac.gov/data.json,converts,n/a
+https://data.ca.gov/data.json,invalid v1.1,yes
+https://open.gsa.gov/data.json,converts,n/a
+https://healthdata.gov/data.json,invalid v1.1,yes
+https://data.kingcounty.gov/data.json?version=2,invalid v1.1,yes
+https://data.hartford.gov/data.json,invalid v1.1,yes
+https://data-soa-dnr.opendata.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,no
+https://ddi.doi.gov/onrr-data.json,converts,n/a
+https://datainventory.doi.gov/data.json,fetch error,n/a
+https://www.imls.gov/sites/default/files/data.json,invalid v1.1,yes
+https://ddi.doi.gov/bia-data.json,converts,n/a
+https://ddi.doi.gov/bsee-data.json,invalid v1.1,yes
+https://portal.opentopography.org/geoportal/csw/discovery?Request=GetCapabilities&Service=CSW&Version=2.0.2,fetch error,n/a
+https://www.sba.gov/data.json,converts,n/a
+https://ddi.doi.gov/fws-data.json,converts,n/a
+https://ddi.doi.gov/osmre-data.json,invalid v1.1,no
+https://ddi.doi.gov/usgs-data.json,invalid v1.1,yes
+https://data.honolulu.gov/data.json?version=2,invalid v1.1,yes
+https://data-fairfaxcountygis.opendata.arcgis.com/data.json,invalid v1.1,yes
+https://ddi.doi.gov/doios-data.json,invalid v1.1,yes
+https://edg.epa.gov/data/nongeo_data.json,invalid v1.1,yes
+https://opendata.fcc.gov/data.json,invalid v1.1,yes
+https://nycopendata.socrata.com/data.json?version=2,invalid v1.1,yes
+https://www.ncua.gov/data.json,converts,n/a
+https://www.fec.gov/data.json,converts,n/a
+https://www.federalreserve.gov/PDC/data.json,converts,n/a
+https://opendata.hawaii.gov/data.json,invalid v1.1,yes
+https://img.exim.gov/s3fs-public/dataset/vbhv-d8am/data.json,converts,n/a
+https://ddi.doi.gov/blm-data.json,converts,n/a
+https://ddi.doi.gov/nps-data.json,converts,n/a
+https://data.brla.gov/data.json,invalid v1.1,yes
+https://www.commerce.gov/sites/default/files/data.json,fetch error,n/a
+https://data.ed.gov/data.json,invalid v1.1,yes
+https://data.iowa.gov/data.json,fetch error,n/a
+https://www.usda.gov/sites/default/files/documents/data.json,fetch error,n/a
+https://ddi.doi.gov/boem-data.json,converts,n/a
+https://ddi.doi.gov/usbr-data.json,converts,n/a
+https://datacatalog.cookcountyil.gov/data.json?version=2,invalid v1.1,yes
+https://data.nola.gov/data.json,invalid v1.1,yes
+https://data.sfgov.org/data.json?version=2,invalid v1.1,yes
+https://www.state.gov/data.json,converts,n/a
+https://data.ok.gov/data.json?version=2,invalid v1.1,yes
+https://data.ny.gov/data.json?version=2,invalid v1.1,yes
+https://www.cftc.gov/sites/default/files/CFTC-ODI-metadata-v2.json,converts,n/a
+https://data.cityofchicago.org/data.json?version=2,invalid v1.1,yes
+https://www.huduser.gov/data/data.json,converts,n/a
+https://fhfa.gov/data/data.json,converts,n/a
+https://its.ntia.gov/data.json,converts,n/a
+https://www.dol.gov/data.json,converts,n/a
+https://data.mo.gov/data.json?version=2,invalid v1.1,yes
+https://data.baltimorecity.gov/data.json,invalid v1.1,yes
+https://data.austintexas.gov/data.json,invalid v1.1,yes
+https://data.americorps.gov/data.json,converts,n/a
+https://www.consumerfinance.gov/data.json,converts,n/a
+https://data.somervillema.gov/data.json?version=2,invalid v1.1,yes
+https://cos-data.seattle.gov/data.json,invalid v1.1,yes
+https://data.townofcary.org/data.json,invalid v1.1,no
+https://www.archives.gov/files/data.json,converts,n/a
+https://www.usitc.gov/data.json,converts,n/a
+https://wa-node.gis.washington.edu/geoportal/csw/discovery,fetch error,n/a
+https://www.bls.gov/data.json,converts,n/a
+https://www.opm.gov/data.json,converts,n/a
+https://data.charlottenc.gov/data.json,invalid v1.1,no
+https://opendata-townofchapelhill.hub.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,yes
+https://data.mcc.gov/data.json,converts,n/a
+https://data.va.gov/data.json,invalid v1.1,yes
+https://www.frtib.gov/data.json,converts,n/a
+https://geohub-loudoungis.opendata.arcgis.com/data.json,invalid v1.1,yes
+https://data.ct.gov/data.json,invalid v1.1,yes
+https://data.wa.gov/data.json?version=2,invalid v1.1,yes
+https://data.oregon.gov/data.json?version=2,invalid v1.1,yes
+https://www.mspb.gov/data.json,converts,n/a
+https://data.providenceri.gov/data.json,invalid v1.1,yes
+https://public-chesva.opendata.arcgis.com/data.json,invalid v1.1,yes
+https://www.eeoc.gov/data.json,converts,n/a
+https://www.ssa.gov/data.json,invalid v1.1,yes
+https://www.ftc.gov/data.json,converts,n/a
+https://www.nsf.gov/data.json,converts,n/a
+https://cwbi-app.sec.usace.army.mil/pub/data.json,invalid v1.1,no
+https://www.nrc.gov/data.json,invalid v1.1,yes
+https://data.transportation.gov/data.json,invalid v1.1,yes
+https://geospatial-usace.opendata.arcgis.com/data.json,invalid v1.1,no
+https://opendata.maryland.gov/data.json,invalid v1.1,yes
+https://ngda-transportation-geoplatform.hub.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,yes
+https://gisportal.ibwc.gov/agsportal/,fetch error,n/a
+https://data.ferndalemi.gov/data.json,invalid v1.1,yes
+https://data.tempe.gov/api/feed/dcat-us/1.1.json,invalid v1.1,yes
+https://data.arlingtonva.us/data.json,invalid v1.1,no
+https://bloomington.data.socrata.com/data.json,invalid v1.1,yes
+https://data-wake.opendata.arcgis.com/data.json,invalid v1.1,yes
+https://data.ntsb.gov/data.json,converts,n/a
+https://secure.rrb.gov/data.json,converts,n/a
+https://www.dhs.gov/data.json,converts,n/a
+https://www.cpsc.gov/data.json,invalid v1.1,yes
+https://data.montgomerycountymd.gov/data.json,invalid v1.1,yes
+https://www.nitrd.gov/data.json,converts,n/a
+https://www.archive.arm.gov/metadata/data.json,converts,n/a
+https://data.usaid.gov/data.json,fetch error,n/a
+https://pbgc.gov/data.json,converts,n/a
+https://www.opendataphilly.org/data.json,invalid v1.1,yes
+https://www.nist.gov/sites/default/files/data.json,invalid v3.0,n/a,v1.1 is valid but uses reserved `replaces` field name
+https://www.treasury.gov/data.json,converts,n/a
+https://pasteur.epa.gov/metadata.json,converts,n/a
+https://max.gov/data.json,converts,n/a
+https://www.justice.gov/data.json,converts,n/a
+https://data.nasa.gov/data.json,fetch error,n/a
+https://federallabs.org/data.json,converts,n/a
+https://www.sec.gov/data.json,converts,n/a
+https://data.lacity.org/data.json,invalid v1.1,yes
+https://nehopendatastorage.blob.core.windows.net/nehopendata/data.json,converts,n/a
+https://openei.org/data.json,converts,n/a
+https://www.energy.gov/data.json,converts,n/a
+https://www.defense.gov/data.json,fetch error,n/a
+https://opendata.cityofboise.org/data.json,invalid v1.1,no
+https://opendata.dc.gov/data.json,invalid v1.1,yes
+https://data-lakecountyil.opendata.arcgis.com/data.json,invalid v1.1,yes
+https://geodata.vermont.gov/api/feed/dcat-us/1.1.json,invalid v1.1,yes
+https://dataworks.siouxfalls.gov/data.json,invalid v1.1,yes
+https://data.srcity.org/data.json,fetch error,n/a
+https://mapcontext.com/npswaf/sample2.json,fetch error,n/a
+https://data.wprdc.org/data.json,invalid v1.1,no
+https://apps.usgs.gov/fgdc/WAF_JSON/combined.json,converts,n/a
+https://geocatalog-uidaho.hub.arcgis.com/data.json,invalid v1.1,yes
+https://opendurham.nc.gov/data.json,fetch error,n/a
+https://rossi.urs-tally.com/Content/data.json,converts,n/a
+https://louisville-metro-opendata-lojic.hub.arcgis.com/api/feed/dcat-us/1.1.json,invalid v1.1,yes
+https://raw.githubusercontent.com/gbinal/data/master/datasets/hhs_cas.json,fetch error,n/a
+https://www.loc.gov/data.json,converts,n/a
diff --git a/jsonschema/tests/validate_1_1_failure_details.log b/jsonschema/tests/validate_1_1_failure_details.log
new file mode 100644
index 0000000..2c94d7e
--- /dev/null
+++ b/jsonschema/tests/validate_1_1_failure_details.log
@@ -0,0 +1,757 @@
+Processing: https://data.ca.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 9160 error(s) across 4457 datasets:
+ missing required field 'bureauCode': 4457 datasets
+ missing required field 'programCode': 4457 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 157 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 81 datasets
+ dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 3 datasets
+ dataset[N].distribution: field is not null and invalid format, expected 'uri': 3 datasets
+ dataset[N].theme: field is not null and ['COVID-19', 'COVID-19'] has non-unique elements: 2 datasets
+Removed 4457 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://healthdata.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 902 error(s) across 20461 datasets:
+ dataset[N].accrualPeriodicity: field is not null and does not match alternatives:
+ - value not in allowed values: ['irregular']
+ - invalid ISO 8601 duration: 217 datasets
+ missing required field 'keyword': 201 datasets
+ dataset[N].temporal: field is not null and does not match alternatives:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 200 datasets
+ dataset[N].rights: field is not null and value is too long (max 255 characters): 66 datasets
+ dataset[N].describedBy: field is not null and invalid format, expected 'uri': 54 datasets
+ dataset[N].references: field is not null and invalid format, expected 'uri': 31 datasets
+ dataset[N].description: '' should be non-empty: 26 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 23 datasets
+ missing required field 'bureauCode': 22 datasets
+ missing required field 'programCode': 22 datasets
+ dataset[N].programCode[N]: invalid program code format (expected '###:###'): 16 datasets
+ dataset[N].landingPage: field is not null and invalid format, expected 'uri': 12 datasets
+ dataset[N].conformsTo: field is not null and invalid format, expected 'uri': 9 datasets
+ dataset[N].primaryITInvestmentUII: field is not null and invalid IT investment UII format (expected '###-#########'): 2 datasets
+ dataset[N].references: field is not null and ['https://www.cdc.gov/nchs/data/nvsr/nvsr51/nvsr51_12.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr62/nvsr62_09.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr63/nvsr63_04.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr62/nvsr62_09.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr64/nvsr64_01.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr64/nvsr64_12.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr66/nvsr66_01.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr67/nvsr67_01.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr67/nvsr67_08-508.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr68/nvsr68_13-508.pdf', 'https://www.cdc.gov/nchs/data/nvsr/nvsr70/nvsr70-02-508.pdf'] has non-unique elements: 1 dataset
+Removed 548 invalid dataset(s). 19913 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://data.kingcounty.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 748 error(s) across 341 datasets:
+ missing required field 'bureauCode': 341 datasets
+ missing required field 'programCode': 341 datasets
+ missing required field 'keyword': 45 datasets
+ dataset[N].description: '' should be non-empty: 21 datasets
+Removed 341 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.hartford.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 322 error(s) across 128 datasets:
+ missing required field 'bureauCode': 128 datasets
+ missing required field 'programCode': 128 datasets
+ dataset[N].keyword: [] should be non-empty: 23 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 21 datasets
+ dataset[N].theme: field is not null and expected type 'array': 21 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 128 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data-soa-dnr.opendata.arcgis.com/api/feed/dcat-us/1.1.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1455 error(s) across 460 datasets:
+ dataset[N].license: field is not null and invalid format, expected 'uri': 460 datasets
+ missing required field 'bureauCode': 460 datasets
+ missing required field 'programCode': 460 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 27 datasets
+ dataset[N].theme: field is not null and expected type 'array': 27 datasets
+ dataset[N].description: '' should be non-empty: 10 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 8 datasets
+ dataset[N].keyword: [] should be non-empty: 2 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset
+Removed 460 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://ddi.doi.gov/onrr-data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 14 error(s) across 14 datasets:
+ missing required field 'programCode': 14 datasets
+Removed 14 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://www.imls.gov/sites/default/files/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1 error(s) across 59 datasets:
+ dataset[N].distribution: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 1 invalid dataset(s). 58 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://ddi.doi.gov/bsee-data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 188 error(s) across 95 datasets:
+ missing required field 'programCode': 95 datasets
+ missing required field 'modified': 80 datasets
+ dataset[N].distribution: field is not null and missing required field 'mediaType': 13 datasets
+Removed 95 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://ddi.doi.gov/osmre-data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 30 error(s) across 10 datasets:
+ missing required field 'bureauCode': 10 datasets
+ missing required field 'modified': 10 datasets
+ missing required field 'programCode': 10 datasets
+Removed 10 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://ddi.doi.gov/usgs-data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 37392 error(s) across 37341 datasets:
+ missing required field 'programCode': 37341 datasets
+ dataset[N].distribution: field is not null and invalid format, expected 'uri': 51 datasets
+Removed 37341 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.honolulu.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1197 error(s) across 402 datasets:
+ missing required field 'bureauCode': 402 datasets
+ missing required field 'programCode': 402 datasets
+ dataset[N].description: '' should be non-empty: 328 datasets
+ missing required field 'keyword': 65 datasets
+Removed 402 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data-fairfaxcountygis.opendata.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 443 error(s) across 148 datasets:
+ missing required field 'bureauCode': 148 datasets
+ missing required field 'programCode': 148 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 147 datasets
+Removed 148 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://ddi.doi.gov/doios-data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3 error(s) across 2 datasets:
+ missing required field 'programCode': 2 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset
+Removed 2 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://edg.epa.gov/data/nongeo_data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 6 error(s) across 449 datasets:
+ missing required field 'keyword': 4 datasets
+ missing required field 'bureauCode': 1 dataset
+ missing required field 'programCode': 1 dataset
+Removed 4 invalid dataset(s). 445 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://opendata.fcc.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 10 error(s) across 60 datasets:
+ missing required field 'keyword': 6 datasets
+ dataset[N].description: '' should be non-empty: 2 datasets
+ missing required field 'bureauCode': 1 dataset
+ missing required field 'programCode': 1 dataset
+Removed 6 invalid dataset(s). 54 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://nycopendata.socrata.com/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 5991 error(s) across 2704 datasets:
+ missing required field 'bureauCode': 2704 datasets
+ missing required field 'programCode': 2704 datasets
+ missing required field 'keyword': 533 datasets
+ dataset[N].description: '' should be non-empty: 50 datasets
+Removed 2704 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://opendata.hawaii.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3491 error(s) across 976 datasets:
+ missing required field 'bureauCode': 976 datasets
+ missing required field 'programCode': 976 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 822 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 453 datasets
+ missing required field 'keyword': 224 datasets
+ missing required field 'description': 39 datasets
+ dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 976 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.brla.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 519 error(s) across 254 datasets:
+ missing required field 'bureauCode': 254 datasets
+ missing required field 'programCode': 254 datasets
+ missing required field 'keyword': 6 datasets
+ dataset[N].description: '' should be non-empty: 5 datasets
+Removed 254 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.ed.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1 error(s) across 906 datasets:
+ missing required field 'contactPoint': 1 dataset
+Removed 1 invalid dataset(s). 905 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://ddi.doi.gov/usbr-data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 8745 error(s) across 8745 datasets:
+ missing required field 'programCode': 8745 datasets
+Removed 8745 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://datacatalog.cookcountyil.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1397 error(s) across 625 datasets:
+ missing required field 'bureauCode': 625 datasets
+ missing required field 'programCode': 625 datasets
+ dataset[N].description: '' should be non-empty: 105 datasets
+ missing required field 'keyword': 42 datasets
+Removed 625 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.nola.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 544 error(s) across 230 datasets:
+ missing required field 'bureauCode': 230 datasets
+ missing required field 'programCode': 230 datasets
+ missing required field 'keyword': 63 datasets
+ dataset[N].description: '' should be non-empty: 21 datasets
+Removed 230 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.sfgov.org/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1771 error(s) across 703 datasets:
+ missing required field 'bureauCode': 703 datasets
+ missing required field 'programCode': 703 datasets
+ missing required field 'keyword': 313 datasets
+ dataset[N].theme: field is not null and '' should be non-empty: 35 datasets
+ dataset[N].description: '' should be non-empty: 17 datasets
+Removed 703 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.ok.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 902 error(s) across 389 datasets:
+ missing required field 'bureauCode': 389 datasets
+ missing required field 'programCode': 389 datasets
+ missing required field 'keyword': 60 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 51 datasets
+ missing required field 'description': 13 datasets
+Removed 389 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.ny.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 2344 error(s) across 1143 datasets:
+ missing required field 'bureauCode': 1143 datasets
+ missing required field 'programCode': 1143 datasets
+ missing required field 'keyword': 50 datasets
+ dataset[N].description: '' should be non-empty: 5 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 3 datasets
+Removed 1143 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.cityofchicago.org/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 2285 error(s) across 1114 datasets:
+ missing required field 'bureauCode': 1114 datasets
+ missing required field 'programCode': 1114 datasets
+ missing required field 'keyword': 46 datasets
+ dataset[N].description: '' should be non-empty: 11 datasets
+Removed 1114 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.mo.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 850 error(s) across 277 datasets:
+ missing required field 'bureauCode': 277 datasets
+ missing required field 'programCode': 277 datasets
+ missing required field 'keyword': 191 datasets
+ dataset[N].description: '' should be non-empty: 105 datasets
+Removed 277 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.baltimorecity.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3903 error(s) across 972 datasets:
+ missing required field 'bureauCode': 972 datasets
+ missing required field 'programCode': 972 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 932 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 511 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 251 datasets
+ dataset[N].theme: field is not null and expected type 'array': 251 datasets
+ dataset[N].keyword: [] should be non-empty: 7 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 6 datasets
+ dataset[N].description: '' should be non-empty: 1 dataset
+Removed 972 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.austintexas.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 7066 error(s) across 2839 datasets:
+ missing required field 'bureauCode': 2839 datasets
+ missing required field 'programCode': 2839 datasets
+ missing required field 'keyword': 1279 datasets
+ dataset[N].description: '' should be non-empty: 109 datasets
+Removed 2839 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.somervillema.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 106 error(s) across 48 datasets:
+ missing required field 'bureauCode': 48 datasets
+ missing required field 'programCode': 48 datasets
+ missing required field 'keyword': 9 datasets
+ dataset[N].description: '' should be non-empty: 1 dataset
+Removed 48 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://cos-data.seattle.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1405 error(s) across 688 datasets:
+ missing required field 'bureauCode': 688 datasets
+ missing required field 'programCode': 688 datasets
+ missing required field 'keyword': 25 datasets
+ dataset[N].description: '' should be non-empty: 3 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset
+Removed 688 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.townofcary.org/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 269 error(s) across 78 datasets:
+ missing required field 'bureauCode': 78 datasets
+ missing required field 'hasEmail': 78 datasets
+ missing required field 'programCode': 78 datasets
+ dataset[N].publisher.name: expected type 'string': 14 datasets
+ dataset[N].description: expected type 'string': 13 datasets
+ missing required field 'keyword': 8 datasets
+Removed 78 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.charlottenc.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1665 error(s) across 372 datasets:
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 372 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 372 datasets
+ missing required field 'bureauCode': 372 datasets
+ missing required field 'programCode': 372 datasets
+ dataset[N].description: '' should be non-empty: 74 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 31 datasets
+ dataset[N].theme: field is not null and expected type 'array': 31 datasets
+ dataset[N].modified: does not match any alternative:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 20 datasets
+ dataset[N].keyword: [] should be non-empty: 15 datasets
+ dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 5 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 1 dataset
+Removed 372 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://opendata-townofchapelhill.hub.arcgis.com/api/feed/dcat-us/1.1.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 468 error(s) across 123 datasets:
+ missing required field 'bureauCode': 123 datasets
+ missing required field 'programCode': 123 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 82 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 64 datasets
+ dataset[N].theme: field is not null and expected type 'array': 64 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 6 datasets
+ dataset[N].description: '' should be non-empty: 5 datasets
+ dataset[N].keyword: [] should be non-empty: 1 dataset
+Removed 123 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.va.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 303 error(s) across 1962 datasets:
+ missing required field 'keyword': 115 datasets
+ missing required field 'bureauCode': 61 datasets
+ missing required field 'programCode': 61 datasets
+ dataset[N].description: '' should be non-empty: 47 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 6 datasets
+ dataset[N].temporal: field is not null and does not match alternatives:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 4 datasets
+ dataset[N].bureauCode[N]: does not match pattern '[0-9]{3}:[0-9]{2}': 2 datasets
+ dataset[N].language: field is not null and does not match pattern '^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$': 2 datasets
+ dataset[N].programCode[N]: invalid program code format (expected '###:###'): 2 datasets
+ dataset[N].accrualPeriodicity: field is not null and does not match alternatives:
+ - value not in allowed values: ['irregular']
+ - invalid ISO 8601 duration: 1 dataset
+ dataset[N].landingPage: field is not null and invalid format, expected 'uri': 1 dataset
+ dataset[N].references: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 205 invalid dataset(s). 1757 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://geohub-loudoungis.opendata.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 783 error(s) across 226 datasets:
+ missing required field 'bureauCode': 226 datasets
+ missing required field 'programCode': 226 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 144 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 92 datasets
+ dataset[N].theme: field is not null and expected type 'array': 92 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 2 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 1 dataset
+Removed 226 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.ct.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 2570 error(s) across 1213 datasets:
+ missing required field 'bureauCode': 1213 datasets
+ missing required field 'programCode': 1213 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 69 datasets
+ dataset[N].description: '' should be non-empty: 37 datasets
+ missing required field 'keyword': 36 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 2 datasets
+Removed 1213 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.wa.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 4470 error(s) across 1954 datasets:
+ missing required field 'bureauCode': 1954 datasets
+ missing required field 'programCode': 1954 datasets
+ dataset[N].description: '' should be non-empty: 493 datasets
+ missing required field 'keyword': 65 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 4 datasets
+Removed 1954 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.oregon.gov/data.json?version=2
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 2319 error(s) across 981 datasets:
+ missing required field 'bureauCode': 981 datasets
+ missing required field 'programCode': 981 datasets
+ missing required field 'keyword': 260 datasets
+ dataset[N].description: '' should be non-empty: 87 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 10 datasets
+Removed 981 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.providenceri.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 524 error(s) across 234 datasets:
+ missing required field 'bureauCode': 234 datasets
+ missing required field 'programCode': 234 datasets
+ missing required field 'keyword': 32 datasets
+ dataset[N].description: '' should be non-empty: 24 datasets
+Removed 234 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://public-chesva.opendata.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 93 error(s) across 36 datasets:
+ missing required field 'bureauCode': 36 datasets
+ missing required field 'programCode': 36 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 11 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 5 datasets
+ dataset[N].theme: field is not null and expected type 'array': 5 datasets
+Removed 36 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://www.ssa.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 984 error(s) across 2416 datasets:
+ dataset[N].isPartOf: expected type 'string': 911 datasets
+ dataset[N].temporal: field is not null and does not match alternatives:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 73 datasets
+Removed 920 invalid dataset(s). 1496 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://cwbi-app.sec.usace.army.mil/pub/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 6 error(s) across 2 datasets:
+ dataset[N].modified: does not match any alternative:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 2 datasets
+ missing required field 'bureauCode': 2 datasets
+ missing required field 'programCode': 2 datasets
+Removed 2 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://www.nrc.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 14 error(s) across 230 datasets:
+ dataset[N].modified: does not match any alternative:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 14 datasets
+Removed 14 invalid dataset(s). 216 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://data.transportation.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 190 error(s) across 1760 datasets:
+ missing required field 'programCode': 62 datasets
+ missing required field 'keyword': 52 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 20 datasets
+ dataset[N].description: '' should be non-empty: 15 datasets
+ missing required field 'bureauCode': 15 datasets
+ dataset[N].temporal: field is not null and does not match alternatives:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 12 datasets
+ dataset[N].accrualPeriodicity: field is not null and does not match alternatives:
+ - value not in allowed values: ['irregular']
+ - invalid ISO 8601 duration: 7 datasets
+ dataset[N].distribution: field is not null and invalid format, expected 'uri': 3 datasets
+ dataset[N].landingPage: field is not null and invalid format, expected 'uri': 2 datasets
+ dataset[N].describedBy: field is not null and invalid format, expected 'uri': 1 dataset
+ dataset[N].references: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 115 invalid dataset(s). 1645 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://geospatial-usace.opendata.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 369 error(s) across 361 datasets:
+ dataset[N].license: field is not null and invalid format, expected 'uri': 361 datasets
+ dataset[N].description: '' should be non-empty: 4 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 3 datasets
+ dataset[N].keyword: [] should be non-empty: 1 dataset
+Removed 361 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://opendata.maryland.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 6243 error(s) across 2451 datasets:
+ missing required field 'bureauCode': 2451 datasets
+ missing required field 'programCode': 2451 datasets
+ missing required field 'keyword': 1341 datasets
+Removed 2451 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://ngda-transportation-geoplatform.hub.arcgis.com/api/feed/dcat-us/1.1.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 28 error(s) across 46 datasets:
+ dataset[N].spatial: field is not null and '' should be non-empty: 14 datasets
+ dataset[N].theme: field is not null and expected type 'array': 14 datasets
+Removed 14 invalid dataset(s). 32 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://data.ferndalemi.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3802 error(s) across 1132 datasets:
+ missing required field 'bureauCode': 1132 datasets
+ missing required field 'programCode': 1132 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 858 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 305 datasets
+ dataset[N].theme: field is not null and expected type 'array': 305 datasets
+ dataset[N].description: '' should be non-empty: 49 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 13 datasets
+ dataset[N].title: expected type 'string': 3 datasets
+ dataset[N].keyword: [] should be non-empty: 2 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 1 dataset
+ dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 1 dataset
+ dataset[N].modified: does not match any alternative:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 1 dataset
+Removed 1132 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.tempe.gov/api/feed/dcat-us/1.1.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1267 error(s) across 621 datasets:
+ missing required field 'bureauCode': 621 datasets
+ missing required field 'programCode': 621 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 7 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 6 datasets
+ dataset[N].modified: does not match any alternative:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 6 datasets
+ dataset[N].description: '' should be non-empty: 4 datasets
+ dataset[N].keyword: [] should be non-empty: 2 datasets
+Removed 621 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.arlingtonva.us/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 656 error(s) across 185 datasets:
+ missing required field 'bureauCode': 185 datasets
+ missing required field 'keyword': 185 datasets
+ missing required field 'programCode': 185 datasets
+ dataset[N].modified: expected type 'string': 98 datasets
+ dataset[N].distribution: field is not null and invalid format, expected 'uri': 3 datasets
+Removed 185 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://bloomington.data.socrata.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 732 error(s) across 278 datasets:
+ missing required field 'bureauCode': 278 datasets
+ missing required field 'programCode': 278 datasets
+ missing required field 'keyword': 151 datasets
+ dataset[N].description: '' should be non-empty: 25 datasets
+Removed 278 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data-wake.opendata.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1889 error(s) across 521 datasets:
+ missing required field 'bureauCode': 521 datasets
+ missing required field 'programCode': 521 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 414 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 228 datasets
+ dataset[N].description: '' should be non-empty: 165 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 17 datasets
+ dataset[N].theme: field is not null and expected type 'array': 17 datasets
+ dataset[N].keyword: [] should be non-empty: 5 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 1 dataset
+Removed 521 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://www.cpsc.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1 error(s) across 8 datasets:
+ dataset[N].distribution: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 1 invalid dataset(s). 7 valid dataset(s) remaining.
+------------------------------------------------------
+Processing: https://data.montgomerycountymd.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1102 error(s) across 499 datasets:
+ missing required field 'bureauCode': 499 datasets
+ missing required field 'programCode': 499 datasets
+ missing required field 'keyword': 74 datasets
+ dataset[N].description: '' should be non-empty: 30 datasets
+Removed 499 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://www.opendataphilly.org/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 1323 error(s) across 462 datasets:
+ missing required field 'bureauCode': 462 datasets
+ missing required field 'programCode': 462 datasets
+ dataset[N].modified: expected type 'string': 373 datasets
+ dataset[N].description: '' should be non-empty: 19 datasets
+ dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 4 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 2 datasets
+ dataset[N].keyword[N]: expected type 'string': 1 dataset
+Removed 462 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://federallabs.org/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 2 error(s) across 1 datasets:
+ missing required field 'bureauCode': 1 dataset
+ missing required field 'programCode': 1 dataset
+Removed 1 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.lacity.org/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 866 error(s) across 383 datasets:
+ missing required field 'bureauCode': 383 datasets
+ missing required field 'programCode': 383 datasets
+ missing required field 'keyword': 83 datasets
+ dataset[N].description: '' should be non-empty: 17 datasets
+Removed 383 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://opendata.cityofboise.org/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 298 error(s) across 89 datasets:
+ dataset[N].license: field is not null and invalid format, expected 'uri': 89 datasets
+ missing required field 'bureauCode': 89 datasets
+ missing required field 'programCode': 89 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 13 datasets
+ dataset[N].theme: field is not null and expected type 'array': 13 datasets
+ dataset[N].description: '' should be non-empty: 2 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 2 datasets
+ dataset[N].keyword: [] should be non-empty: 1 dataset
+Removed 89 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://opendata.dc.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3942 error(s) across 1867 datasets:
+ missing required field 'bureauCode': 1867 datasets
+ missing required field 'programCode': 1867 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 72 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 58 datasets
+ dataset[N].theme: field is not null and expected type 'array': 58 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 8 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 7 datasets
+ dataset[N].keyword: [] should be non-empty: 3 datasets
+ dataset[N].description: '' should be non-empty: 2 datasets
+Removed 1867 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data-lakecountyil.opendata.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3527 error(s) across 1548 datasets:
+ missing required field 'bureauCode': 1548 datasets
+ missing required field 'programCode': 1548 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 113 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 105 datasets
+ dataset[N].theme: field is not null and expected type 'array': 105 datasets
+ dataset[N].distribution: field is not null and [N].accessURL: field is not null and invalid format, expected 'uri': 95 datasets
+ dataset[N].description: '' should be non-empty: 10 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 3 datasets
+Removed 1548 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://geodata.vermont.gov/api/feed/dcat-us/1.1.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 3664 error(s) across 1117 datasets:
+ missing required field 'bureauCode': 1117 datasets
+ missing required field 'programCode': 1117 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 950 datasets
+ dataset[N].contactPoint.hasEmail: invalid mailto URI format: 406 datasets
+ dataset[N].description: '' should be non-empty: 44 datasets
+ dataset[N].keyword: [] should be non-empty: 16 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 9 datasets
+ dataset[N].title: expected type 'string': 5 datasets
+Removed 1117 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://dataworks.siouxfalls.gov/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 473 error(s) across 236 datasets:
+ missing required field 'bureauCode': 236 datasets
+ missing required field 'programCode': 236 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 1 dataset
+Removed 236 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://data.wprdc.org/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 2336 error(s) across 368 datasets:
+ missing required field 'bureauCode': 368 datasets
+ missing required field 'identifier': 368 datasets
+ missing required field 'keyword': 368 datasets
+ missing required field 'programCode': 368 datasets
+ missing required field 'fn': 362 datasets
+ missing required field 'hasEmail': 362 datasets
+ dataset[N].distribution: field is not null and missing required field 'mediaType': 139 datasets
+ missing required field 'description': 1 dataset
+Removed 368 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://geocatalog-uidaho.hub.arcgis.com/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 477 error(s) across 191 datasets:
+ missing required field 'bureauCode': 191 datasets
+ missing required field 'programCode': 191 datasets
+ dataset[N].license: field is not null and invalid format, expected 'uri': 83 datasets
+ dataset[N].spatial: field is not null and '' should be non-empty: 6 datasets
+ dataset[N].theme: field is not null and expected type 'array': 6 datasets
+Removed 191 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://rossi.urs-tally.com/Content/data.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 6 error(s) across 3 datasets:
+ missing required field 'bureauCode': 3 datasets
+ missing required field 'programCode': 3 datasets
+Removed 3 invalid dataset(s). 0 valid dataset(s) remaining.
+No valid datasets remain after filtering.
+------------------------------------------------------
+Processing: https://louisville-metro-opendata-lojic.hub.arcgis.com/api/feed/dcat-us/1.1.json
+Warning: catalog has invalid data, filtering it out:
+v1.1 validation failed with 9 error(s) across 515 datasets:
+ dataset[N].keyword: [] should be non-empty: 4 datasets
+ dataset[N].modified: does not match any alternative:
+ - invalid ISO 8601 date/datetime
+ - invalid ISO 8601 repeating interval: 4 datasets
+ dataset[N].keyword[N]: '' should be non-empty: 1 dataset
+Removed 9 invalid dataset(s). 506 valid dataset(s) remaining.
+------------------------------------------------------
diff --git a/jsonschema/v1.1_definitions/dataset.json b/jsonschema/v1.1_definitions/non-federal_dataset.json
similarity index 55%
rename from jsonschema/v1.1_definitions/dataset.json
rename to jsonschema/v1.1_definitions/non-federal_dataset.json
index 90275fb..34e243d 100644
--- a/jsonschema/v1.1_definitions/dataset.json
+++ b/jsonschema/v1.1_definitions/non-federal_dataset.json
@@ -1,12 +1,9 @@
{
- "$schema": "http://json-schema.org/draft-04/schema#",
- "id": "https://project-open-data.cio.gov/v1.1/schema/dataset.json#",
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
"title": "Project Open Data Dataset",
"description": "The metadata format for all federal open data. Validates a single JSON object entry (as opposed to entire Data.json catalog).",
"type": "object",
"required": [
- "bureauCode",
- "programCode",
"title",
"description",
"keyword",
@@ -20,18 +17,12 @@
"@type": {
"title": "Metadata Context",
"description": "IRI for the JSON-LD data type. This should be dcat:Dataset for each Dataset",
- "enum": [
- "dcat:Dataset"
- ]
+ "const": "dcat:Dataset"
},
"accessLevel": {
"description": "The degree to which this dataset could be made publicly-available, regardless of whether it has been made available. Choices: public (Data asset is or could be made publicly available to all without restrictions), restricted public (Data asset is available under certain use restrictions), or non-public (Data asset is not available to members of the public)",
"title": "Public Access Level",
- "enum": [
- "public",
- "restricted public",
- "non-public"
- ]
+ "enum": ["public", "restricted public", "non-public"]
},
"rights": {
"title": "Rights",
@@ -52,9 +43,7 @@
"description": "Frequency with which dataset is published.",
"anyOf": [
{
- "enum": [
- "irregular"
- ]
+ "const": "irregular"
},
{
"type": "string",
@@ -62,22 +51,37 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
"bureauCode": {
"title": "Bureau Code",
"description": "Federal agencies, combined agency and bureau code from OMB Circular A-11, Appendix C in the format of 015:010.",
- "type": "array",
- "items": {
- "type": "string",
- "pattern": "[0-9]{3}:[0-9]{2}"
- },
- "minItems": 1,
- "uniqueItems": true
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "pattern": "[0-9]{3}:[0-9]{2}"
+ },
+ "minItems": 1,
+ "uniqueItems": true
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
},
"contactPoint": {
- "$ref": "vcard.json"
+ "$ref": "#/$defs/vcard"
},
"describedBy": {
"title": "Data Dictionary",
@@ -89,6 +93,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -102,6 +110,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -115,6 +127,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -127,6 +143,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -134,7 +154,8 @@
"title": "Description",
"description": "Human-readable description (e.g., an abstract) with sufficient detail to enable a user to quickly understand whether the asset is of interest.",
"type": "string",
- "minLength": 1
+ "minLength": 1,
+ "maxLength": 10000
},
"distribution": {
"title": "Distribution",
@@ -143,13 +164,25 @@
{
"type": "array",
"items": {
- "$ref": "distribution.json",
- "minItems": 1,
- "uniqueItems": true
+ "anyOf": [
+ {
+ "minItems": 1,
+ "uniqueItems": true,
+ "$ref": "#/$defs/distribution"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
}
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -169,18 +202,32 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
"keyword": {
"title": "Tags",
"description": "Tags (or keywords) help users discover your dataset; please include terms that would be used by technical and non-technical users.",
- "type": "array",
- "items": {
- "type": "string",
- "minLength": 1
- },
- "minItems": 1
+ "anyOf": [
+ {
+ "type": "array",
+ "items": {
+ "type": "string",
+ "minLength": 1,
+ "maxLength": 1000
+ },
+ "minItems": 1,
+ "maxItems": 1000
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
},
"landingPage": {
"title": "Homepage URL",
@@ -192,6 +239,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -199,15 +250,19 @@
"title": "Language",
"description": "The language of the dataset.",
"anyOf": [
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ },
{
"type": "array",
"items": {
"type": "string",
"pattern": "^(((([A-Za-z]{2,3}(-([A-Za-z]{3}(-[A-Za-z]{3}){0,2}))?)|[A-Za-z]{4}|[A-Za-z]{5,8})(-([A-Za-z]{4}))?(-([A-Za-z]{2}|[0-9]{3}))?(-([A-Za-z0-9]{5,8}|[0-9][A-Za-z0-9]{3}))*(-([0-9A-WY-Za-wy-z](-[A-Za-z0-9]{2,8})+))*(-(x(-[A-Za-z0-9]{1,8})+))?)|(x(-[A-Za-z0-9]{1,8})+)|((en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE)|(art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu|zh-hakka|zh-min|zh-min-nan|zh-xiang)))$"
}
- },
- {
- "type": "null"
}
]
},
@@ -221,6 +276,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -239,6 +298,10 @@
{
"type": "string",
"pattern": "^(R\\d*\\/)?([\\+-]?\\d{4}(?!\\d{2}\\b))((-?)((0[1-9]|1[0-2])(\\4([12]\\d|0[1-9]|3[01]))?|W([0-4]\\d|5[0-2])(-?[1-7])?|(00[1-9]|0[1-9]\\d|[12]\\d{2}|3([0-5]\\d|6[1-6])))([T\\s]((([01]\\d|2[0-3])((:?)[0-5]\\d)?|24\\:?00)([\\.,]\\d+(?!:))?)?(\\18[0-5]\\d([\\.,]\\d+)?)?([zZ]|([\\+-])([01]\\d|2[0-3]):?([0-5]\\d)?)?)?)?(\\/)P(?:\\d+(?:\\.\\d+)?Y)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?W)?(?:\\d+(?:\\.\\d+)?D)?(?:T(?:\\d+(?:\\.\\d+)?H)?(?:\\d+(?:\\.\\d+)?M)?(?:\\d+(?:\\.\\d+)?S)?)?$"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -252,57 +315,71 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
"programCode": {
"title": "Program Code",
"description": "Federal agencies, list the primary program related to this data asset, from the Federal Program Inventory. Use the format of 015:001",
- "type": "array",
- "items": {
- "type": "string",
- "pattern": "[0-9]{3}:[0-9]{3}"
- },
- "minItems": 1,
- "uniqueItems": true
- },
- "publisher": {
- "$ref": "organization.json"
- },
- "references": {
- "title": "Related Documents",
- "description": "Related documents such as technical information about a dataset, developer documentation, etc.",
"anyOf": [
{
"type": "array",
"items": {
"type": "string",
- "format": "uri"
+ "pattern": "[0-9]{3}:[0-9]{3}"
},
"minItems": 1,
"uniqueItems": true
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
- "spatial": {
- "title": "Spatial",
- "description": "The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.",
+ "publisher": {
+ "$ref": "#/$defs/organization"
+ },
+ "references": {
+ "title": "Related Documents",
+ "description": "Related documents such as technical information about a dataset, developer documentation, etc.",
"anyOf": [
{
- "type": "string",
- "minLength": 1
+ "type": "array",
+ "items": {
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "uri"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "minItems": 1,
+ "uniqueItems": true
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
- "systemOfRecords": {
- "title": "System of Records",
- "description": "If the systems is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.",
+ "spatial": {
+ "title": "Spatial",
+ "description": "The range of spatial applicability of a dataset. Could include a spatial region like a bounding box or a named place.",
"anyOf": [
{
"type": "string",
@@ -330,6 +407,19 @@
}
]
},
+ "systemOfRecords": {
+ "title": "System of Records",
+ "description": "If the system is designated as a system of records under the Privacy Act of 1974, provide the URL to the System of Records Notice related to this dataset.",
+ "anyOf": [
+ {
+ "type": "string",
+ "minLength": 1
+ },
+ {
+ "type": "null"
+ }
+ ]
+ },
"temporal": {
"title": "Temporal",
"description": "The range of temporal applicability of a dataset (i.e., a start and end date of applicability for the data).",
@@ -348,6 +438,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -358,6 +452,9 @@
{
"type": "string",
"minLength": 1
+ },
+ {
+ "type": "null"
}
]
},
@@ -376,6 +473,10 @@
},
{
"type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
}
]
},
@@ -385,5 +486,248 @@
"type": "string",
"minLength": 1
}
+ },
+ "$id": "https://project-open-data.cio.gov/v1.1/schema/dataset.json#",
+ "$defs": {
+ "vcard": {
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "title": "Project Open Data ContactPoint vCard",
+ "description": "A Dataset ContactPoint as a vCard object",
+ "type": "object",
+ "required": ["fn", "hasEmail"],
+ "properties": {
+ "@type": {
+ "title": "Metadata Context",
+ "description": "IRI for the JSON-LD data type. This should be vcard:Contact for contactPoint",
+ "const": "vcard:Contact"
+ },
+ "fn": {
+ "title": "Contact Name",
+ "description": "A full formatted name, eg Firstname Lastname",
+ "type": "string",
+ "minLength": 1
+ },
+ "hasEmail": {
+ "title": "Email",
+ "description": "Email address for the contact",
+ "anyOf": [
+ {
+ "pattern": "^mailto:[\\w\\_\\~\\!\\$\\&\\'\\(\\)\\*\\+\\,\\;\\=\\:.-]+@[\\w.-]+\\.[\\w.-]+?$",
+ "type": "string"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ }
+ },
+ "$id": "https://project-open-data.cio.gov/v1.1/schema/vcard.json#"
+ },
+ "distribution": {
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "title": "Project Open Data Distribution",
+ "description": "A distribution of a dataset: a downloadable file (downloadURL) or an indirect access point (accessURL), plus metadata describing it.",
+ "type": "object",
+ "properties": {
+ "@type": {
+ "title": "Metadata Context",
+ "description": "IRI for the JSON-LD data type. This should be dcat:Distribution for each Distribution",
+ "const": "dcat:Distribution"
+ },
+ "downloadURL": {
+ "title": "Download URL",
+ "description": "URL providing direct access to a downloadable file of a dataset",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "uri"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "mediaType": {
+ "title": "Media Type",
+ "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s downloadURL",
+ "anyOf": [
+ {
+ "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$",
+ "type": "string"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "format": {
+ "title": "Format",
+ "description": "A human-readable description of the file format of a distribution",
+ "anyOf": [
+ {
+ "type": "string",
+ "minLength": 1
+ },
+ {
+ "type": "null"
+ }
+ ]
+ },
+ "accessURL": {
+ "title": "Access URL",
+ "description": "URL providing indirect access to a dataset",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "uri"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "description": {
+ "title": "Description",
+ "description": "Human-readable description of the distribution",
+ "anyOf": [
+ {
+ "type": "string",
+ "minLength": 1,
+ "maxLength": 10000
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "title": {
+ "title": "Title",
+ "description": "Human-readable name of the distribution",
+ "anyOf": [
+ {
+ "type": "string",
+ "minLength": 1
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "conformsTo": {
+ "title": "Data Standard",
+ "description": "URI identifying the data standard that the distribution conforms to",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "uri"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "describedBy": {
+ "title": "Data Dictionary",
+ "description": "URL to the data dictionary for the distribution found at the downloadURL",
+ "anyOf": [
+ {
+ "type": "string",
+ "format": "uri"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ },
+ "describedByType": {
+ "title": "Data Dictionary Type",
+ "description": "The machine-readable file format (IANA Media Type or MIME Type) of the distribution’s describedBy URL",
+ "anyOf": [
+ {
+ "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$",
+ "type": "string"
+ },
+ {
+ "type": "null"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ }
+ },
+ "$id": "https://project-open-data.cio.gov/v1.1/schema/distribution.json#",
+ "dependentSchemas": {
+ "downloadURL": {
+ "properties": {
+ "mediaType": {
+ "anyOf": [
+ {
+ "type": "string",
+ "pattern": "^[-\\w]+/[-\\w]+(\\.[-\\w]+)*([+][-\\w]+)?$"
+ },
+ {
+ "type": "string",
+ "pattern": "^(\\[\\[REDACTED).*?(\\]\\])$"
+ }
+ ]
+ }
+ },
+ "required": ["mediaType"]
+ }
+ }
+ },
+ "organization": {
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "title": "Project Open Data Organization",
+ "description": "A Dataset Publisher Organization as a foaf:Agent object",
+ "type": "object",
+ "required": ["name"],
+ "properties": {
+ "@type": {
+ "title": "Metadata Context",
+ "description": "IRI for the JSON-LD data type. This should be org:Organization for each publisher",
+ "const": "org:Organization"
+ },
+ "name": {
+ "title": "Publisher Name",
+ "description": "The name of the publishing organization",
+ "type": "string",
+ "minLength": 1
+ },
+ "subOrganizationOf": {
+ "title": "Parent Organization",
+ "$ref": "#"
+ }
+ },
+ "$id": "https://project-open-data.cio.gov/v1.1/schema/organization.json#"
+ }
}
-}
\ No newline at end of file
+}
diff --git a/jsonschema/validate_1_1.py b/jsonschema/validate_1_1.py
new file mode 100644
index 0000000..a0042aa
--- /dev/null
+++ b/jsonschema/validate_1_1.py
@@ -0,0 +1,328 @@
+#!/usr/bin/env python3
+"""Validate a DCAT-US v1.1 catalog against the v1.1 JSON Schema and filter out invalid datasets."""
+import json
+import re
+from pathlib import Path
+
+import click
+from curl_cffi import requests
+from curl_cffi.requests.exceptions import RequestException
+from jsonschema import Draft202012Validator
+from referencing import Registry, Resource
+
+
+V1_1_CATALOG_SCHEMA_ID = "https://project-open-data.cio.gov/v1.1/schema/catalog.json"
+SCRIPT_DIR = Path(__file__).parent
+V1_1_DEFINITIONS_DIR = SCRIPT_DIR / "v1.1_definitions"  # directory of schema files passed to load_schema_registry
+PATTERN_DESCRIPTIONS = {  # regex fragment -> friendly message; _describe_pattern returns the first fragment found, so order matters
+    "mailto": "invalid mailto URI format",
+    "R\\/P": "invalid ISO 8601 duration",
+    r"[\+-]?\d{4}.*\/": "invalid ISO 8601 interval",
+    r"R\d*\/": "invalid ISO 8601 repeating interval",
+    "[0-9]{3}:[0-9]{3}": "invalid program code format (expected '###:###')",
+    "[0-9]{3}-[0-9]{9}": "invalid IT investment UII format (expected '###-#########')",
+    r"[\+-]?\d{4}(?!\d{2}": "invalid ISO 8601 date/datetime",
+}
+
+
+class CatalogFetchException(Exception):
+    """Raised by fetch_dcat_catalog when the catalog cannot be downloaded or parsed as JSON."""
+
+
+class CatalogValidationException(Exception):
+    """Raised by validate_catalog when the catalog has schema validation errors."""
+
+
+def _describe_pattern(pattern: str) -> str:
+    """Look up the friendly message for a schema regex; None when no known fragment occurs in it."""
+    hits = (
+        message for fragment, message in PATTERN_DESCRIPTIONS.items() if fragment in pattern
+    )
+    return next(hits, None)
+
+
+def format_path(path):
+    """Render a jsonschema error path as dotted text, e.g. 'subject[0].inScheme'."""
+    if not path:
+        return "(root)"
+    segments = []
+    for element in path:
+        if not isinstance(element, int):
+            segments.append(str(element))  # property name starts a new dotted segment
+            continue
+        subscript = f"[{element}]"
+        if segments:
+            segments[-1] += subscript  # array index attaches to the preceding segment
+        else:
+            segments.append(subscript)
+    return ".".join(segments)
+
+
+def format_validation_errors(errors, indent=0):
+    """Collapse validation errors into one line per distinct message, each with its tally."""
+    pad = " " * indent
+    tally = {}
+
+    for err in errors:
+        text = summarize_error(err)
+        if text:
+            # Replace concrete array positions with [N] so the same problem in
+            # different items is counted under a single message
+            generic = re.sub(r'\[\d+\]', '[N]', text)
+            tally[generic] = tally.get(generic, 0) + 1
+
+    if not tally:
+        return ""
+
+    # Most frequent first; ties are broken alphabetically to keep output stable
+    ranked = sorted(tally.items(), key=lambda pair: (-pair[1], pair[0]))
+
+    lines = [
+        f"{pad}{message}: {count} {'dataset' if count == 1 else 'datasets'}"
+        for message, count in ranked
+    ]
+
+    return "\n".join(lines)
+
+
+def summarize_error(error, prefix="", is_suberror=False):
+    """Turn one jsonschema validation error into readable text; leaf messages omit the field path when is_suberror is True."""
+    path = format_path(error.path)
+
+    # Handle anyOf/oneOf errors by finding meaningful sub-errors
+    if error.validator in ("anyOf", "oneOf") and error.context:
+        meaningful = find_meaningful_errors(error.context)
+
+        if not meaningful:
+            return f"{prefix}{path}: field is not null and does not match any allowed type"
+
+        has_null_alternative = any(is_null_type_error(e) for e in error.context)
+
+        summaries = []
+        for sub_error in meaningful:
+            sub_summary = summarize_error(sub_error, prefix="", is_suberror=True)
+            if sub_summary:
+                summaries.append(sub_summary)
+
+        # Collapse repeated identical sub-errors to a single bullet
+        unique_summaries = list(dict.fromkeys(summaries))
+
+        if has_null_alternative and unique_summaries:
+            intro = f"{path}: field is not null and "
+            if len(unique_summaries) == 1:
+                return f"{prefix}{intro}{unique_summaries[0]}"
+            else:
+                return f"{prefix}{intro}does not match alternatives:\n" + "\n".join(
+                    f"{prefix} - {s}" for s in unique_summaries
+                )
+        elif unique_summaries:
+            if len(unique_summaries) == 1:
+                return f"{prefix}{path}: {unique_summaries[0]}"
+            else:
+                return f"{prefix}{path}: does not match any alternative:\n" + "\n".join(
+                    f"{prefix} - {s}" for s in unique_summaries
+                )
+
+    # Handle $ref errors - find the expected class (boolean schemas have no keywords to inspect)
+    if isinstance(error.schema, dict) and "$ref" in error.schema:
+        class_name = extract_schema_name(error.schema)
+        if error.context:
+            meaningful = find_meaningful_errors(error.context)
+            if meaningful:
+                sub_summaries = [summarize_error(e, prefix="", is_suberror=True) for e in meaningful]
+                sub_summaries = [s for s in sub_summaries if s]
+                if sub_summaries:
+                    if class_name:
+                        return f"does not conform to {class_name}: {'; '.join(sub_summaries)}"
+                    return "; ".join(sub_summaries)
+        if class_name:
+            return f"does not conform to {class_name}"
+
+    # Handle required field errors (reported without the path so they group across datasets)
+    if error.validator == "required":
+        missing = error.validator_value
+        if isinstance(missing, list):
+            # Match the whole message: a substring test would report 'title' when 'subtitle' is missing
+            missing_fields = [f for f in missing if f"{f!r} is a required property" == error.message]
+            if missing_fields:
+                return f"missing required field '{missing_fields[0]}'"
+        if "is a required property" in error.message:
+            field = error.message.split("'")[1]
+            return f"missing required field '{field}'"
+        return error.message
+
+    # Handle type errors
+    if error.validator == "type":
+        expected = error.validator_value
+        if isinstance(expected, list):
+            expected = " or ".join(expected)
+        if is_suberror:
+            return f"expected type '{expected}'"
+        return f"{prefix}{path}: expected type '{expected}'"
+
+    # Handle enum errors
+    if error.validator == "enum":
+        if is_suberror:
+            return f"value not in allowed values: {error.validator_value}"
+        return f"{prefix}{path}: value not in allowed values: {error.validator_value}"
+
+    # Handle pattern errors - prefer a friendly description over the raw regex
+    if error.validator == "pattern":
+        description = _describe_pattern(error.validator_value)
+        msg = description if description else f"does not match pattern '{error.validator_value}'"
+        if is_suberror:
+            return msg
+        return f"{prefix}{path}: {msg}"
+
+    # Handle format errors
+    if error.validator == "format":
+        msg = f"invalid format, expected '{error.validator_value}'"
+        if is_suberror:
+            return msg
+        return f"{prefix}{path}: {msg}"
+
+    # Handle maxLength errors - omit the offending value so identical failures group together
+    if error.validator == "maxLength":
+        msg = f"value is too long (max {error.validator_value} characters)"
+        if is_suberror:
+            return msg
+        return f"{prefix}{path}: {msg}"
+
+    # Default: use the message, prepending path if available
+    if not is_suberror and path and path != "(root)":
+        return f"{prefix}{path}: {error.message}"
+    return f"{prefix}{error.message}"
+
+
+def find_meaningful_errors(errors):
+    """Drop 'expected null' failures; if nothing else is left, fall back to every error."""
+    kept = [
+        candidate
+        for candidate in errors
+        if not is_null_type_error(candidate)
+    ]
+    return kept if kept else list(errors)
+
+
+def is_null_type_error(error):
+    """Return True only when the error is a failed ``"type": "null"`` check."""
+    is_other_failure = error.validator != "type" or error.validator_value != "null"
+    return not is_other_failure
+
+
+def extract_schema_name(schema):
+    """Derive a display name for a schema from its $ref (preferred) or its title; None otherwise."""
+    if not isinstance(schema, dict):
+        return None
+    if "$ref" in schema:
+        # The last path segment of the ref is the name: "#/$defs/organization" -> "Organization"
+        final_segment = schema["$ref"].split("/")[-1]
+        return final_segment.title()
+    # dict.get yields None when the schema has no title either
+    return schema.get("title")
+
+
+def load_schema_registry(definitions_dir: Path) -> Registry:
+    """Return a Registry holding every ``*.json`` schema found directly in definitions_dir."""
+    registry = Registry()
+    for schema_file in definitions_dir.glob("*.json"):
+        with schema_file.open(encoding="utf-8") as f:  # JSON is UTF-8; don't rely on the locale default
+            registry = Resource.from_contents(json.load(f)) @ registry
+    return registry
+
+
+def fetch_dcat_catalog(url: str) -> dict:
+    """Fetch url and return the parsed JSON object; raise CatalogFetchException on any HTTP, JSON, or shape failure."""
+    # Some target servers (e.g. usda.gov) reject non-browser TLS/HTTP2 fingerprints, so
+    # we impersonate a real browser using curl_cffi.
+    try:
+        response = requests.get(url, timeout=60, impersonate="safari17_0")
+        response.raise_for_status()
+    except RequestException as e:
+        raise CatalogFetchException(f"Request failed: {type(e).__name__}: {e!r}") from e
+
+    try:
+        text = response.content.decode("utf-8-sig").lstrip("\ufeff")
+    except UnicodeDecodeError:
+        text = response.content.decode("cp1252", errors="replace")  # cp1252 has undefined bytes; never crash here
+
+    try:
+        data = json.loads(text)
+    except ValueError as e:
+        raise CatalogFetchException(f"Response was not valid JSON: {e}") from e
+    if not isinstance(data, dict):
+        kind = "array" if isinstance(data, list) else type(data).__name__
+        raise CatalogFetchException(f"Response is a JSON {kind}, not a catalog object")
+    return data
+
+
+def validate_catalog(schema_id: str, registry: Registry, catalog: dict) -> None:
+    """Validate catalog against schema_id; raise CatalogValidationException with a grouped error summary if invalid."""
+    validator = Draft202012Validator(
+        {"$ref": schema_id}, registry=registry,
+        format_checker=Draft202012Validator.FORMAT_CHECKER,
+    )
+    errors = list(validator.iter_errors(catalog))
+    if errors:
+        version_number = "v1.1" if "v1.1" in schema_id else "v3.0"
+        datasets = catalog.get("dataset")  # may be null or a non-list in an invalid catalog
+        dataset_count = len(datasets) if isinstance(datasets, list) else 0
+        raise CatalogValidationException(
+            f"{version_number} validation failed with {len(errors)} error(s) across {dataset_count} datasets:\n"
+            + format_validation_errors(errors, indent=2)
+        )
+
+
+def filter_invalid_datasets(schema_id: str, registry: Registry, catalog: dict) -> tuple[dict, int]:
+    """
+    Return a copy of the catalog with invalid datasets removed, plus the count removed.
+
+    The whole catalog is validated once; any dataset whose index appears in an error
+    path is dropped. Errors outside "dataset" (catalog-level fields) are not repaired.
+    """
+    validator = Draft202012Validator(
+        {"$ref": schema_id},
+        registry=registry,
+        format_checker=Draft202012Validator.FORMAT_CHECKER,
+    )
+    datasets = catalog.get("dataset")
+    if not isinstance(datasets, list):
+        # Missing, null, or wrongly typed "dataset": there is nothing to filter by index
+        return {**catalog, "dataset": []}, 0
+
+    # Collect the indices of datasets implicated in at least one error.
+    # jsonschema paths look like: deque(['dataset', 3, 'title', ...])
+    bad_indices = set()
+    for error in validator.iter_errors(catalog):
+        path = list(error.absolute_path)
+        if len(path) >= 2 and path[0] == "dataset" and isinstance(path[1], int):
+            bad_indices.add(path[1])
+
+    filtered = [ds for i, ds in enumerate(datasets) if i not in bad_indices]
+    return {**catalog, "dataset": filtered}, len(datasets) - len(filtered)
+
+
+@click.command()
+@click.option("-u", "--url", help="URL of DCAT-US v1.1 catalog to be converted", required=True)
+def main(url):
+    # Fetch the catalog, validate it, and on failure report and drop the invalid datasets.
+    # Returns 1 if fetching fails or no dataset survives. (Kept as a comment: a docstring would alter --help.)
+    registry = load_schema_registry(V1_1_DEFINITIONS_DIR)
+    try:
+        catalog = fetch_dcat_catalog(url)
+    except CatalogFetchException as fetch_error:
+        click.echo(f"There was an error fetching a DCAT-US v1.1 catalog to convert: {fetch_error}", err=True)
+        return 1
+
+    try:
+        validate_catalog(V1_1_CATALOG_SCHEMA_ID, registry, catalog)
+    except CatalogValidationException as invalid:
+        click.echo(f"Warning: catalog has invalid data, filtering it out:\n{invalid}", err=True)
+        catalog, removed = filter_invalid_datasets(V1_1_CATALOG_SCHEMA_ID, registry, catalog)
+        remaining = len(catalog.get("dataset", []))
+        click.echo(f"Removed {removed} invalid dataset(s). {remaining} valid dataset(s) remaining.", err=True)
+        if remaining == 0:
+            click.echo("No valid datasets remain after filtering.", err=True)
+            return 1
+
+if __name__ == "__main__":
+    raise SystemExit(main(standalone_mode=False))  # with standalone_mode=False click returns main's value; use it as the exit status