diff --git a/CHANGELOG.md b/CHANGELOG.md index 9df5ebe..1f35ee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,18 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.8.0] - 2026-03-03 + +### Added +- New endpoint `POST /dataset/{dataset_id}/publish` to copy datasets from local catalog to PRE-CKAN + - Copies dataset metadata and all associated resources + - Proper error handling for disabled PRE-CKAN and duplicate names + - Unit tests for all scenarios +- New `PRE_CKAN_ORGANIZATION` environment variable + - When set, overrides the owner_org when publishing to PRE-CKAN + - Required when PRE-CKAN API credentials are tied to a specific organization + - Local catalog can use any organization; PRE-CKAN uses the configured one + ## [0.7.2] - 2026-02-23 ### Added diff --git a/README.md b/README.md index f2a5d96..e74f39d 100644 --- a/README.md +++ b/README.md @@ -182,6 +182,13 @@ PRE_CKAN_URL=http://XX.XX.XX.XXX:5000/ # Obtain this from the NDP team or your Pre-CKAN user profile PRE_CKAN_API_KEY= +# Organization for Pre-CKAN publishing (Optional) +# When set, all datasets published to PRE-CKAN will use this organization, +# regardless of their original owner_org in the local catalog. +# Required when your PRE-CKAN API key is tied to a specific organization. 
+# Format: ep-XXXXXXXXXXXXXXXXXXXXXXXX (assigned by NDP) +PRE_CKAN_ORGANIZATION= + # ============================================== # STREAMING CONFIGURATION # ============================================== @@ -329,6 +336,7 @@ CKAN_API_KEY=your-local-ckan-api-key PRE_CKAN_ENABLED=True PRE_CKAN_URL=https://preckan.nationaldataplatform.org PRE_CKAN_API_KEY=your-ndp-preckan-api-key +PRE_CKAN_ORGANIZATION=ep-your-assigned-org-id ``` ## 🔒 Group-Based Access Control diff --git a/api/config/ckan_settings.py b/api/config/ckan_settings.py index 80050cb..3f8c8be 100644 --- a/api/config/ckan_settings.py +++ b/api/config/ckan_settings.py @@ -15,6 +15,7 @@ class Settings(BaseSettings): pre_ckan_url: str = "https://ndp-test.sdsc.edu/catalog2" pre_ckan_api_key: str = "" pre_ckan_verify_ssl: bool = True + pre_ckan_organization: str = "" def _get_session(self, verify_ssl: bool) -> requests.Session: """Create a requests session with SSL verification setting.""" diff --git a/api/routes/register_routes/__init__.py b/api/routes/register_routes/__init__.py index 5ec7e02..f79ff24 100644 --- a/api/routes/register_routes/__init__.py +++ b/api/routes/register_routes/__init__.py @@ -8,6 +8,7 @@ from .post_s3 import router as post_s3_router from .post_service import router as post_service_router from .post_url import router as post_url_router +from .publish_dataset import router as publish_dataset_router router = APIRouter() @@ -17,3 +18,4 @@ router.include_router(post_s3_router) router.include_router(post_service_router) router.include_router(post_general_dataset_router) +router.include_router(publish_dataset_router) diff --git a/api/routes/register_routes/publish_dataset.py b/api/routes/register_routes/publish_dataset.py new file mode 100644 index 0000000..4ca2ca9 --- /dev/null +++ b/api/routes/register_routes/publish_dataset.py @@ -0,0 +1,167 @@ +# api/routes/register_routes/publish_dataset.py + +"""Endpoint for publishing datasets from local catalog to PRE-CKAN.""" + +import logging +from 
logger = logging.getLogger(__name__)

router = APIRouter()


@router.post(
    "/dataset/{dataset_id}/publish",
    response_model=dict,
    status_code=status.HTTP_201_CREATED,
    summary="Publish dataset from local catalog to PRE-CKAN",
    description=(
        "Publish (copy) a dataset from the local catalog to PRE-CKAN.\n\n"
        "### Behavior\n"
        "1. Fetches the dataset from the local catalog\n"
        "2. Creates the dataset in PRE-CKAN with the same metadata\n"
        "3. Creates all associated resources in PRE-CKAN\n\n"
        "### Requirements\n"
        "- PRE-CKAN must be enabled in the configuration\n"
        "- The dataset must exist in the local catalog\n"
        "- The dataset name must not already exist in PRE-CKAN\n\n"
        "### Authorization\n"
        "This endpoint requires authentication.\n\n"
        "### Example Response\n"
        "```json\n"
        "{\n"
        ' "id": "12345678-abcd-efgh-ijkl-1234567890ab",\n'
        ' "message": "Dataset published to PRE-CKAN successfully"\n'
        "}\n"
        "```\n"
    ),
    responses={
        201: {
            "description": "Dataset published successfully",
            "content": {
                "application/json": {
                    "example": {
                        "id": "12345678-abcd-efgh-ijkl-1234567890ab",
                        "message": "Dataset published to PRE-CKAN successfully",
                    }
                }
            },
        },
        400: {
            "description": "Bad Request",
            "content": {
                "application/json": {
                    "examples": {
                        "preckan_disabled": {
                            "summary": "PRE-CKAN disabled",
                            "value": {
                                "detail": "PRE-CKAN is disabled and cannot be used."
                            },
                        },
                        "duplicate": {
                            "summary": "Dataset already exists",
                            "value": {
                                "detail": (
                                    "A dataset with name 'my-dataset' "
                                    "already exists in PRE-CKAN."
                                )
                            },
                        },
                    }
                }
            },
        },
        401: {
            "description": "Unauthorized - Authentication required",
            "content": {
                "application/json": {"example": {"detail": "Invalid or expired token"}}
            },
        },
        404: {
            "description": "Dataset not found in local catalog",
            "content": {
                "application/json": {
                    "example": {"detail": "Dataset not found in local catalog: ..."}
                }
            },
        },
        500: {
            "description": "Internal server error",
            "content": {
                "application/json": {
                    "example": {"detail": "Error creating dataset in PRE-CKAN: ..."}
                }
            },
        },
    },
)
async def publish_dataset_endpoint(
    dataset_id: str = Path(..., description="ID or name of the dataset to publish"),
    user_info: Dict[str, Any] = Depends(get_user_for_write_operation),
):
    """
    Publish a dataset from the local catalog to PRE-CKAN.

    Delegates the copy to ``publish_dataset_to_preckan`` and translates the
    exceptions it raises into HTTP errors.

    Parameters
    ----------
    dataset_id : str
        The ID or name of the dataset to publish.
    user_info : Dict[str, Any]
        Value produced by the ``get_user_for_write_operation`` dependency;
        forwarded unchanged to the service.

    Returns
    -------
    dict
        ``{"id": <new dataset ID in PRE-CKAN>, "message": <success text>}``.

    Raises
    ------
    HTTPException
        - 404: the service raised ``ValueError`` whose message contains
          "not found" (case-insensitive)
        - 400: any other ``ValueError`` from the service (for example
          PRE-CKAN disabled or duplicate dataset name)
        - 500: any other exception raised during publication
    """
    # Keep the try body to the single call that can raise, so the success
    # path below is never mistaken for a publication failure.
    try:
        new_dataset_id = publish_dataset_to_preckan(
            dataset_id=dataset_id,
            user_info=user_info,
        )
    except ValueError as exc:
        error_msg = str(exc)
        # The service signals every client-side problem with ValueError; the
        # message text is the only discriminator between 404 and 400.
        status_code = (
            status.HTTP_404_NOT_FOUND
            if "not found" in error_msg.lower()
            else status.HTTP_400_BAD_REQUEST
        )
        # Chain the cause so the original error stays visible in tracebacks.
        raise HTTPException(status_code=status_code, detail=error_msg) from exc
    except Exception as exc:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception(f"Error publishing dataset to PRE-CKAN: {exc}")
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error publishing dataset: {str(exc)}",
        ) from exc

    logger.info(
        f"Dataset '{dataset_id}' published to PRE-CKAN "
        f"with new ID: {new_dataset_id}"
    )

    return {
        "id": new_dataset_id,
        "message": "Dataset published to PRE-CKAN successfully",
    }
logger = logging.getLogger(__name__)

# Fields to exclude when copying dataset to PRE-CKAN
EXCLUDED_FIELDS = {
    "id",
    "revision_id",
    "metadata_created",
    "metadata_modified",
    "creator_user_id",
    "state",
    "type",
    "num_resources",
    "num_tags",
    "relationships_as_subject",
    "relationships_as_object",
    "isopen",
    "organization",
}

# Resource-level fields stripped before a resource is re-created in PRE-CKAN.
# "package_id" is in this set, so the new dataset ID can be passed explicitly
# to resource_create without clashing with a copied value.
_RESOURCE_EXCLUDED_FIELDS = frozenset(
    {
        "id",
        "package_id",
        "revision_id",
        "created",
        "metadata_modified",
        "state",
        "position",
        "datastore_active",
        "url_type",
        "hash",
        "size",
        "cache_url",
        "cache_last_updated",
        "last_modified",
    }
)


def _clean_resource(resource: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``resource`` without excluded fields or None values."""
    return {
        key: value
        for key, value in resource.items()
        if key not in _RESOURCE_EXCLUDED_FIELDS and value is not None
    }


def _resolve_owner_org(dataset_dict: Dict[str, Any], local_repository: Any) -> None:
    """Set ``dataset_dict["owner_org"]`` for PRE-CKAN, modifying it in place."""
    if ckan_settings.pre_ckan_organization:
        # A configured organization always wins over the dataset's own one.
        original_org = dataset_dict.get("owner_org", "none")
        dataset_dict["owner_org"] = ckan_settings.pre_ckan_organization
        logger.info(
            f"Using PRE-CKAN organization '{ckan_settings.pre_ckan_organization}' "
            f"(original: '{original_org}')"
        )
        return

    owner_org = dataset_dict.get("owner_org")
    if not owner_org:
        return

    try:
        org = local_repository.organization_show(id=owner_org)
        # Use org name instead of UUID for PRE-CKAN compatibility
        dataset_dict["owner_org"] = org.get("name", owner_org)
        logger.info(
            f"Resolved owner_org '{owner_org}' to '{dataset_dict['owner_org']}'"
        )
    except Exception:
        # Best effort: if the lookup fails, keep the original value.
        logger.warning(f"Could not resolve owner_org '{owner_org}', using as-is")


def publish_dataset_to_preckan(
    dataset_id: str,
    user_info: Optional[Dict[str, Any]] = None,
) -> str:
    """
    Publish a dataset from local catalog to PRE-CKAN.

    Fetches the dataset from the local catalog, strips the fields listed in
    ``EXCLUDED_FIELDS`` (and any ``None`` values), creates the dataset in
    PRE-CKAN, and then creates each of its resources one by one. A resource
    that fails to be created is logged and skipped; it does not abort the
    publication.

    Parameters
    ----------
    dataset_id : str
        The ID or name of the dataset to publish.
    user_info : Optional[Dict[str, Any]]
        Accepted for interface compatibility; this function does not read it.

    Returns
    -------
    str
        The ID of the newly created dataset in PRE-CKAN.

    Raises
    ------
    ValueError
        If PRE-CKAN is disabled, the dataset cannot be fetched from the local
        catalog, the dataset name is already in use in PRE-CKAN, or the
        target organization does not exist in PRE-CKAN.
    Exception
        For any other error while creating the dataset in PRE-CKAN.
    """
    if not ckan_settings.pre_ckan_enabled:
        raise ValueError("PRE-CKAN is disabled and cannot be used.")

    local_repository = catalog_settings.local_catalog
    preckan_repository = CKANRepository(ckan_settings.pre_ckan)

    try:
        dataset = local_repository.package_show(id=dataset_id)
    except Exception as exc:
        # Any fetch failure is reported as "not found"; chaining keeps the
        # real cause available to whoever inspects the traceback.
        raise ValueError(f"Dataset not found in local catalog: {str(exc)}") from exc

    # Prepare dataset dict for PRE-CKAN (exclude system fields)
    dataset_dict = {
        key: value
        for key, value in dataset.items()
        if key not in EXCLUDED_FIELDS and value is not None
    }

    _resolve_owner_org(dataset_dict, local_repository)

    # Resources are created separately, after the dataset exists.
    cleaned_resources = [
        _clean_resource(resource) for resource in dataset_dict.pop("resources", [])
    ]

    try:
        new_dataset = preckan_repository.package_create(**dataset_dict)
        new_dataset_id = new_dataset["id"]
        logger.info(f"Dataset created in PRE-CKAN with ID: {new_dataset_id}")
    except Exception as exc:
        error_msg = str(exc)
        # Known failures are surfaced as ValueError, recognised by message text.
        if "That name is already in use" in error_msg:
            raise ValueError(
                f"A dataset with name '{dataset_dict.get('name')}' "
                "already exists in PRE-CKAN."
            ) from exc
        if "Organization does not exist" in error_msg:
            raise ValueError(
                f"Organization '{dataset_dict.get('owner_org')}' "
                "does not exist in PRE-CKAN. Create it first."
            ) from exc
        raise Exception(f"Error creating dataset in PRE-CKAN: {error_msg}") from exc

    failed = 0
    for resource in cleaned_resources:
        try:
            preckan_repository.resource_create(package_id=new_dataset_id, **resource)
            logger.info(
                f"Resource '{resource.get('name', 'unnamed')}' "
                f"created in PRE-CKAN dataset {new_dataset_id}"
            )
        except Exception as exc:
            failed += 1
            logger.warning(
                f"Failed to create resource in PRE-CKAN: {str(exc)}. "
                "Continuing with remaining resources."
            )

    if failed:
        # The return value cannot express partial success, so summarise it
        # in the log for operators.
        logger.warning(
            f"{failed} of {len(cleaned_resources)} resources could not be "
            f"created in PRE-CKAN dataset {new_dataset_id}"
        )

    return new_dataset_id