diff --git a/Makefile b/Makefile index 7e5a24a1d..d663a6c0f 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ SECRETS_DIR=${BUILD_DIR}/secrets DB_PASSWORD_PATH=${SECRETS_DIR}/db_password PGPASS_PATH=${SECRETS_DIR}/.pgpass SECRET_KEY_PATH=${SECRETS_DIR}/django_secret_key -EXT_SECRETS=hcaptcha_secret github_client_secret orcid_client_secret discourse_api_key discourse_sso_secret mail_api_key datacite_api_password +EXT_SECRETS=hcaptcha_secret github_client_secret orcid_client_secret discourse_api_key discourse_sso_secret mail_api_key datacite_api_password github_integration_app_private_key github_integration_app_webhook_secret GENERATED_SECRETS=$(DB_PASSWORD_PATH) $(PGPASS_PATH) $(SECRET_KEY_PATH) ENVREPLACE := deploy/scripts/envreplace diff --git a/base.yml b/base.yml index 374bbc4ce..4061cd90f 100644 --- a/base.yml +++ b/base.yml @@ -67,6 +67,8 @@ services: - django_secret_key - github_client_secret - orcid_client_secret + - github_integration_app_private_key + - github_integration_app_webhook_secret - hcaptcha_secret - mail_api_key volumes: @@ -98,6 +100,10 @@ secrets: file: ./build/secrets/django_secret_key github_client_secret: file: ./build/secrets/github_client_secret + github_integration_app_private_key: + file: ./build/secrets/github_integration_app_private_key + github_integration_app_webhook_secret: + file: ./build/secrets/github_integration_app_webhook_secret hcaptcha_secret: file: ./build/secrets/hcaptcha_secret mail_api_key: diff --git a/deploy/conf/.env.template b/deploy/conf/.env.template index ce64ac44e..470b0d2ef 100644 --- a/deploy/conf/.env.template +++ b/deploy/conf/.env.template @@ -42,6 +42,12 @@ ORCID_CLIENT_ID= DATACITE_API_USERNAME= DATACITE_DRY_RUN="true" # allowed values: "true" or "false" +# github integration app +GITHUB_INTEGRATION_APP_ID= +GITHUB_INTEGRATION_APP_NAME= +GITHUB_INTEGRATION_APP_INSTALLATION_ID= +GITHUB_MODEL_LIBRARY_ORG_NAME= + # test TEST_USER_ID=10000000 TEST_USERNAME=__test_user__ diff --git a/django/Dockerfile 
b/django/Dockerfile index c78ac5ec5..0330b57d5 100644 --- a/django/Dockerfile +++ b/django/Dockerfile @@ -43,9 +43,9 @@ RUN --mount=type=cache,target=/var/lib/apt,sharing=locked \ && update-alternatives --install /usr/bin/python python /usr/bin/python3 1000 \ && python -m venv ${VIRTUAL_ENV} \ && apt-get upgrade -q -y -o Dpkg::Options::="--force-confold" \ - && mkdir -p /etc/service/django \ - && touch /etc/service/django/run /etc/postgresql-backup-pre \ - && chmod a+x /etc/service/django/run /etc/postgresql-backup-pre \ + && mkdir -p /etc/service/django /etc/service/huey \ + && touch /etc/service/django/run /etc/service/huey/run /etc/postgresql-backup-pre \ + && chmod a+x /etc/service/django/run /etc/service/huey/run /etc/postgresql-backup-pre \ && apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* WORKDIR /code @@ -61,5 +61,13 @@ COPY ./deploy/cron.weekly/* /etc/cron.weekly/ COPY ./deploy/db/autopostgresqlbackup.conf /etc/default/autopostgresqlbackup COPY ./deploy/db/postgresql-backup-pre /etc/ COPY ${RUN_SCRIPT} /etc/service/django/run +COPY ./deploy/huey.sh /etc/service/huey/run COPY . 
/code + +# FIXME: replace with install from pypi +# upgrading pip because of some bug with the debian patched version +RUN python3 -m pip install --upgrade pip +RUN pip3 install git+https://github.com/sgfost/codemeticulous.git +RUN pip3 install -r /tmp/requirements.txt + CMD ["/sbin/my_init"] diff --git a/django/core/huey.py b/django/core/huey.py new file mode 100644 index 000000000..a6c4d8769 --- /dev/null +++ b/django/core/huey.py @@ -0,0 +1,13 @@ +from django_redis import get_redis_connection +from huey import RedisHuey + + +class DjangoRedisHuey(RedisHuey): + """Huey subclass that uses the existing connection pool + from the django-redis cache backend + """ + + def __init__(self, *args, **kwargs): + connection = get_redis_connection("default") + kwargs["connection_pool"] = connection.connection_pool + super().__init__(*args, **kwargs) diff --git a/django/core/models.py b/django/core/models.py index 0d158d0d7..881e52df0 100644 --- a/django/core/models.py +++ b/django/core/models.py @@ -464,6 +464,14 @@ def github_url(self): """ return self.get_social_account_profile_url("github") + @property + def github_username(self): + github_account = self.get_social_account("github") + if github_account: + return github_account.extra_data.get("login") + else: + return None + def get_social_account_profile_url(self, provider_name): social_acct = self.get_social_account(provider_name) if social_acct: diff --git a/django/core/serializers.py b/django/core/serializers.py index 3c663131a..434523866 100644 --- a/django/core/serializers.py +++ b/django/core/serializers.py @@ -69,10 +69,8 @@ def create(model_cls, validated_data, context): def update(serializer_update, instance, validated_data): tags = TagSerializer(many=True, data=validated_data.pop("tags")) - instance = serializer_update(instance, validated_data) set_tags(instance, tags) - instance.save() - return instance + return serializer_update(instance, validated_data) class EditableSerializerMixin(serializers.Serializer): 
diff --git a/django/core/settings/defaults.py b/django/core/settings/defaults.py index d509b3994..021b95fbb 100644 --- a/django/core/settings/defaults.py +++ b/django/core/settings/defaults.py @@ -118,6 +118,7 @@ def is_test(self): "django_extensions", "django_vite", "guardian", + "huey.contrib.djhuey", "rest_framework", "rest_framework_swagger", "robots", @@ -396,6 +397,11 @@ def is_test(self): "handlers": ["console", "comsesfile"], "propagate": False, }, + "huey": { + "level": "INFO", + "handlers": ["console", "comsesfile"], + "propagate": False, + }, }, } @@ -479,10 +485,19 @@ def is_test(self): "LOCATION": "unix:///shared/redis/redis.sock", "OPTIONS": { "CLIENT_CLASS": "django_redis.client.DefaultClient", + "CONNECTION_POOL_KWARGS": {"max_connections": 20}, }, } } +HUEY = { + "name": "comses", + "huey_class": "core.huey.DjangoRedisHuey", + "immediate": False, # always run tasks in the background (for now), if removed it will default to DEBUG + # FIXME: this should generally be True in development, the huey consumer WILL NOT + # automatically reload when the code changes when False +} + # SSO, user registration, and django-allauth configuration, see # https://django-allauth.readthedocs.io/en/latest/configuration.html # ACCOUNT_ADAPTER = 'core.adapter.AccountAdapter' @@ -501,12 +516,22 @@ def is_test(self): ACCOUNT_CHANGE_EMAIL = True ORCID_CLIENT_ID = os.getenv("ORCID_CLIENT_ID", "") - ORCID_CLIENT_SECRET = read_secret("orcid_client_secret") - GITHUB_CLIENT_ID = os.getenv("GITHUB_CLIENT_ID", "") GITHUB_CLIENT_SECRET = read_secret("github_client_secret") +GITHUB_INTEGRATION_APP_ID = int(os.getenv("GITHUB_INTEGRATION_APP_ID") or 0) +GITHUB_INTEGRATION_APP_NAME = os.getenv("GITHUB_INTEGRATION_APP_NAME", "") +GITHUB_INTEGRATION_APP_PRIVATE_KEY = read_secret("github_integration_app_private_key") +GITHUB_INTEGRATION_APP_INSTALLATION_ID = int( + os.getenv("GITHUB_INTEGRATION_APP_INSTALLATION_ID") or 0 +) +GITHUB_INTEGRATION_APP_WEBHOOK_SECRET = 
read_secret("github_integration_app_webhook_secret") +GITHUB_MODEL_LIBRARY_ORG_NAME = os.getenv("GITHUB_MODEL_LIBRARY_ORG_NAME", "") +GITHUB_INDIVIDUAL_FILE_SIZE_LIMIT = os.getenv( + "GITHUB_INDIVIDUAL_FILE_SIZE_LIMIT", 100 * 1024 * 1024 +) + TEST_BASIC_AUTH_PASSWORD = os.getenv("TEST_BASIC_AUTH_PASSWORD", "test password") TEST_USER_ID = os.getenv("TEST_USER_ID", 1000000) TEST_USERNAME = os.getenv("TEST_USERNAME", "__test_user__") diff --git a/django/core/settings/e2e.py b/django/core/settings/e2e.py index a7bc69622..7c9e72a67 100644 --- a/django/core/settings/e2e.py +++ b/django/core/settings/e2e.py @@ -7,7 +7,7 @@ SHARE_DIR = path.realpath("/shared/e2e") LIBRARY_ROOT = path.join(SHARE_DIR, "library") LIBRARY_PREVIOUS_ROOT = path.join(SHARE_DIR, ".latest") -REPOSITORY_ROOT = path.join(BASE_DIR, "repository") +REPOSITORY_ROOT = path.join(SHARE_DIR, "repository") BACKUP_ROOT = path.join(SHARE_DIR, "backups") BORG_ROOT = path.join(BACKUP_ROOT, "repo") EXTRACT_ROOT = path.join(SHARE_DIR, "extract") diff --git a/django/core/settings/staging.py b/django/core/settings/staging.py index 9821e5557..74332965c 100644 --- a/django/core/settings/staging.py +++ b/django/core/settings/staging.py @@ -181,5 +181,10 @@ "handlers": ["comsesfile"], "propagate": False, }, + "huey": { + "level": "WARNING", + "handlers": ["comsesfile"], + "propagate": False, + }, }, } diff --git a/django/core/settings/test.py b/django/core/settings/test.py index be685f1fd..f699cc030 100644 --- a/django/core/settings/test.py +++ b/django/core/settings/test.py @@ -18,7 +18,7 @@ SHARE_DIR = path.realpath("library/tests/tmp") LIBRARY_ROOT = path.join(SHARE_DIR, "library") LIBRARY_PREVIOUS_ROOT = path.join(SHARE_DIR, ".latest") -REPOSITORY_ROOT = path.join(BASE_DIR, "repository") +REPOSITORY_ROOT = path.join(SHARE_DIR, "repository") BACKUP_ROOT = path.join(SHARE_DIR, "backups") BORG_ROOT = path.join(BACKUP_ROOT, "repo") EXTRACT_ROOT = path.join(SHARE_DIR, "extract") diff --git a/django/core/tests/base.py 
b/django/core/tests/base.py index b8e5b495c..e8f06ee64 100644 --- a/django/core/tests/base.py +++ b/django/core/tests/base.py @@ -173,5 +173,13 @@ def initialize_test_shared_folders(): ) +def clear_test_shared_folder(dir=settings.REPOSITORY_ROOT): + for fs in os.scandir(dir): + if fs.is_dir(): + shutil.rmtree(os.path.join(dir, fs.name), ignore_errors=True) + elif fs.is_file(): + os.remove(os.path.join(dir, fs.name)) + + def destroy_test_shared_folders(): shutil.rmtree(settings.SHARE_DIR, ignore_errors=True) diff --git a/django/curator/fs.py b/django/curator/fs.py index 5d8eec0f8..6a197fda2 100644 --- a/django/curator/fs.py +++ b/django/curator/fs.py @@ -8,6 +8,8 @@ def fsck(queryset): results = OrderedDict() for release in queryset: + if release.is_imported: + continue rfsc = CodebaseReleaseFileConsistencyChecker(release) errors = rfsc.check() if errors: diff --git a/django/curator/tests/test_dump_restore.py b/django/curator/tests/test_dump_restore.py index 0d232ebb9..b7b71a39b 100644 --- a/django/curator/tests/test_dump_restore.py +++ b/django/curator/tests/test_dump_restore.py @@ -19,7 +19,7 @@ from core.tests.base import EventFactory, JobFactory from library.fs import import_archive from library.models import Codebase -from library.tests.base import CodebaseFactory +from library.tests.base import CodebaseFactory, TEST_SAMPLES_DIR logger = logging.getLogger(__name__) @@ -51,7 +51,7 @@ def setUp(self): fs_api = self.release.get_fs_api() import_archive( codebase_release=self.release, - nested_code_folder_name="library/tests/archives/nestedcode", + nested_code_folder_name=TEST_SAMPLES_DIR / "archives" / "nestedcode", fs_api=fs_api, ) diff --git a/django/deploy/huey.sh b/django/deploy/huey.sh new file mode 100755 index 000000000..ee3d365a2 --- /dev/null +++ b/django/deploy/huey.sh @@ -0,0 +1,2 @@ +#!/bin/bash +exec /code/manage.py run_huey diff --git a/django/library/fs.py b/django/library/fs.py index f3352e581..f34206704 100644 --- a/django/library/fs.py +++ 
b/django/library/fs.py @@ -1,4 +1,7 @@ +from abc import ABC, abstractmethod import json +import requests +import yaml import logging import mimetypes import os @@ -6,11 +9,15 @@ import shutil import tarfile import zipfile +import filecmp +from contextlib import contextmanager +from packaging.version import Version from enum import Enum from functools import total_ordering from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional +from typing import Callable, Optional +from git import Actor, GitCommandError, InvalidGitRepositoryError, Repo import bagit import rarfile @@ -18,6 +25,7 @@ from django.core.files.storage import FileSystemStorage from django.core.files.uploadedfile import File from django.urls import reverse +from django.utils import timezone from rest_framework.exceptions import ValidationError from core import fs @@ -38,13 +46,14 @@ class StagingDirectories(Enum): aip = 3 -class FileCategoryDirectories(Enum): +class FileCategories(Enum): code = 1 data = 2 docs = 3 media = 4 originals = 5 results = 6 + metadata = 7 @total_ordering @@ -73,25 +82,25 @@ def downgrade(self, minimum=0): ACCEPT_ALL_REGEX = re.compile(r".*") MIMETYPE_MATCHER = { - FileCategoryDirectories.code: ACCEPT_ALL_REGEX, - FileCategoryDirectories.data: ACCEPT_ALL_REGEX, - FileCategoryDirectories.docs: re.compile( + FileCategories.code: ACCEPT_ALL_REGEX, + FileCategories.data: ACCEPT_ALL_REGEX, + FileCategories.docs: re.compile( r"text/markdown|application/pdf|text/plain|text/x-rtf|application/vnd.oasis.opendocument.text" ), - FileCategoryDirectories.media: re.compile(r"image/.*|video/.*"), - FileCategoryDirectories.originals: ACCEPT_ALL_REGEX, - FileCategoryDirectories.results: ACCEPT_ALL_REGEX, + FileCategories.media: re.compile(r"image/.*|video/.*"), + FileCategories.originals: ACCEPT_ALL_REGEX, + FileCategories.results: ACCEPT_ALL_REGEX, } -def get_category(name) -> FileCategoryDirectories: +def get_category(name) -> FileCategories: category_name = 
Path(name).parts[0] try: - return FileCategoryDirectories[category_name] + return FileCategories[category_name] except KeyError: raise ValidationError( "Target folder name {} invalid. Must be one of {}".format( - category_name, list(d.name for d in FileCategoryDirectories) + category_name, list(d.name for d in FileCategories) ) ) @@ -223,7 +232,7 @@ def validate(self): msgs.append(self.validate_file(filename, content)) return msgs - def list(self, category: Optional[FileCategoryDirectories] = None, absolute=False): + def list(self, category: Optional[FileCategories] = None, absolute=False): path = Path(self.location) if category is not None: path = path.joinpath(category.name) @@ -266,7 +275,7 @@ def log_save(self, name, content): msgs.append(self.error(e)) return msgs - def clear_category(self, category: FileCategoryDirectories): + def clear_category(self, category: FileCategories): shutil.rmtree(os.path.join(self.location, category.name), ignore_errors=True) def clear(self): @@ -285,7 +294,7 @@ def log_delete(self, name): class CodebaseReleaseOriginalStorage(CodebaseReleaseStorage): stage = StagingDirectories.originals - def get_existing_archive_name(self, category: FileCategoryDirectories): + def get_existing_archive_name(self, category: FileCategories): for p in self.list(category): if p.is_file() and fs.is_archive(p): return str(p) @@ -341,17 +350,15 @@ def make_bag(self, metadata): class CodebaseReleaseAipStorage(CodebaseReleaseStorage): """Places files from the sip folder into aip""" + stage = StagingDirectories.aip + def import_sip(self, sip_storage: CodebaseReleaseSipStorage): shutil.copytree(sip_storage.location, self.location) -class CodebaseReleaseFsApi: +class BaseCodebaseReleaseFsApi(ABC): """ - Interface to maintain files associated with a codebase - - FIXME: This is not currently protected against concurrent file access but only the submitter can edit files - associated with a codebase release at the moment. 
Will need to implement file locks if/when this assumption fails to - hold + Base interface to maintain files associated with a codebase release """ def __init__( @@ -360,11 +367,11 @@ def __init__( system_file_presence_message_level=MessageLevels.error, mimetype_mismatch_message_level=MessageLevels.error, ): + self.release = codebase_release self.uuid = str(codebase_release.codebase.uuid) self.identifier = codebase_release.codebase.identifier self.version_number = codebase_release.version_number self.release_id = codebase_release.id - self.codemeta = codebase_release.codemeta self.bagit_info = codebase_release.bagit_info self.mimetype_mismatch_message_level = mimetype_mismatch_message_level @@ -405,10 +412,26 @@ def originals_dir(self): def sip_dir(self): return self.rootdir.joinpath("sip") + @property + def codemeta_contents(self) -> str: + return self.release.codemeta_json_str + @property def codemeta_path(self): return self.sip_contents_dir.joinpath("codemeta.json") + @property + def cff_contents(self) -> str: + return self.release.cff_yaml_str + + @property + def cff_path(self): + return self.sip_contents_dir.joinpath("CITATION.cff") + + @property + def license_path(self): + return self.sip_contents_dir.joinpath("LICENSE") + @property def sip_contents_dir(self): return self.sip_dir.joinpath("data") @@ -445,27 +468,7 @@ def get_stage_storage(self, stage: StagingDirectories): else: raise ValueError(f"StageDirectories values {stage} not valid") - def get_sip_list_url(self, category: FileCategoryDirectories): - return reverse( - "library:codebaserelease-sip-files-list", - kwargs={ - "identifier": str(self.identifier), - "version_number": self.version_number, - "category": category.name, - }, - ) - - def get_originals_list_url(self, category: FileCategoryDirectories): - return reverse( - "library:codebaserelease-original-files-list", - kwargs={ - "identifier": str(self.identifier), - "version_number": self.version_number, - "category": category.name, - }, - ) - - 
def get_absolute_url(self, category: FileCategoryDirectories, relpath: Path): + def get_absolute_url(self, category: FileCategories, relpath: Path): return reverse( "library:codebaserelease-original-files-detail", kwargs={ @@ -497,7 +500,10 @@ def initialize( mimetype_mismatch_message_level=MessageLevels.error, bagit_info=None, ): - fs_api = CodebaseReleaseFsApi( + """Initialize a new FS Api instance for a codebase release, including creating + the SIP directory and bagging the contents if it does not already exist + """ + fs_api = cls( codebase_release, system_file_presence_message_level=system_file_presence_message_level, mimetype_mismatch_message_level=mimetype_mismatch_message_level, @@ -511,30 +517,55 @@ def initialize( def create_or_update_codemeta(self, force=False): """ Returns True if a codemeta.json file was created, False otherwise - :param metadata: an optional dictionary with codemeta properties - :return: """ path = self.codemeta_path if force or not path.exists(): with path.open(mode="w", encoding="utf-8") as codemeta_out: - json.dump(self.codemeta.to_dict(), codemeta_out) + codemeta_out.write(self.codemeta_contents) return True return False - def get_codemeta_json(self): - return self.codemeta.to_json() + def create_or_update_citation_cff(self, force=False): + """ + Returns True if a CITATION.cff file was created, False otherwise + """ + path = self.cff_path + try: + cff_contents = self.cff_contents + except Exception as e: + logger.exception( + f"error generating CITATION.cff for release {self.release}: {e}" + ) + return False + if force or not path.exists(): + with path.open(mode="w", encoding="utf-8") as cff_out: + cff_out.write(cff_contents) + return True + return False - def build_published_archive(self, force=False): + def create_or_update_license(self, force=False): """ - FIXME: some of this should be moved to an async processing task. 
+ Returns True if a LICENSE file was created, False otherwise """ + path = self.license_path + if self.release.license and (force or not path.exists()): + with path.open(mode="w", encoding="utf-8") as license_out: + license_out.write(self.release.license_text) + return True + return False + + def build_published_archive(self, force=False): self.create_or_update_codemeta(force=force) + self.create_or_update_citation_cff(force=force) + self.create_or_update_license(force=force) bag = self.get_or_create_sip_bag(self.bagit_info) self.validate_bagit(bag) self.build_archive(force=force) def build_review_archive(self): self.create_or_update_codemeta(force=True) + self.create_or_update_citation_cff(force=True) + self.create_or_update_license(force=True) shutil.make_archive( str(self.review_archivepath.with_suffix("")), format="zip", @@ -564,19 +595,100 @@ def archive_size(self): def review_archive_size(self): return self.review_archivepath.stat().st_size - def clear_category(self, category: FileCategoryDirectories): - originals_storage = self.get_originals_storage() - originals_storage.clear_category(category) - sip_storage = self.get_sip_storage() - sip_storage.clear_category(category) + @abstractmethod + def list(self, stage: StagingDirectories, category: Optional[FileCategories]): + pass + + @abstractmethod + def list_sip_contents(self, path=None) -> dict: + pass + + @abstractmethod + def check_category_file_exists(self, category: FileCategories) -> bool: + """returns True if at least one file with the given category exists + in the sip storage, False otherwise + """ + pass + + def get_or_create_sip_bag(self, bagit_info=None): + sip_dir = str(self.sip_dir) + logger.info("creating bagit metadata at %s", sip_dir) + bag = fs.make_bag(sip_dir, bagit_info) + bag.save(manifests=True) + return bag + + def build_aip(self, sip_dir: Optional[str] = None): + logger.info("building aip") + if sip_dir is None: + sip_dir = str(self.sip_dir) + shutil.rmtree(str(self.aip_dir), 
ignore_errors=True) + shutil.copytree(sip_dir, str(self.aip_dir)) + + def build_archive_at_dest(self, dest): + logger.info("building archive") + self.build_aip() + if self.aip_contents_dir.exists(): + with zipfile.ZipFile(dest, "w") as archive: + for root_path, dirs, file_paths in os.walk(str(self.aip_contents_dir)): + for file_path in file_paths: + path = Path(root_path, file_path) + archive.write( + str(path), + arcname=str(path.relative_to(self.aip_contents_dir)), + ) + logger.info("building archive succeeded") + return True + else: + logger.error("building archive failed - no aip directory") + return False + + def build_archive(self, force=False): + if not self.archivepath.exists() or force: + self.build_archive_at_dest(dest=str(self.archivepath)) - def list( - self, stage: StagingDirectories, category: Optional[FileCategoryDirectories] + def create_or_update_metadata_files(self, force=False): + self.create_or_update_codemeta(force=force) + self.create_or_update_citation_cff(force=force) + self.create_or_update_license(force=force) + + def rebuild_metadata(self): + self.create_or_update_metadata_files(force=True) + # only rebuild the archive package if it already exists + if self.aip_dir.exists(): + self.build_archive(force=True) + + +class CodebaseReleaseFsApi(BaseCodebaseReleaseFsApi): + """ + File system API for managing a non-imported (regular, directly uploaded) codebase release. + + NOTE: This is not currently protected against concurrent file access but only the submitter can edit files + associated with a codebase release at the moment. 
Will need to implement file locks if/when this assumption fails to + hold + """ + + def __init__( + self, + codebase_release, + system_file_presence_message_level=MessageLevels.error, + mimetype_mismatch_message_level=MessageLevels.error, ): + if codebase_release.is_imported: + raise ValueError("CodebaseRelease must be a non-imported release") + super().__init__( + codebase_release, + system_file_presence_message_level, + mimetype_mismatch_message_level, + ) + + def list(self, stage, category): stage_storage = self.get_stage_storage(stage) return [str(p) for p in stage_storage.list(category)] def list_sip_contents(self, path=None): + """recursively build a tree representing the SIP contents. + Each node includes a label (file name), path (relative to sip contents), and category + """ if path is None: path = self.sip_contents_dir name = "archive-project-root" @@ -587,41 +699,42 @@ def list_sip_contents(self, path=None): if p.is_dir(): contents["contents"].append(self.list_sip_contents(p)) else: - contents["contents"].append({"label": p.name}) + try: + rel_parent = p.parent.relative_to(self.sip_contents_dir) + category_str = ( + str(rel_parent) + if rel_parent != Path(".") + else FileCategories.metadata.name + ) + except ValueError: + # parent is not a subdirectory of sip_contents_dir + category_str = FileCategories.metadata.name + contents["contents"].append( + { + "label": p.name, + "path": str(p.relative_to(self.sip_contents_dir)), + "category": category_str, + } + ) return contents + def check_category_file_exists(self, category): + sip_storage = self.get_sip_storage() + category_dir_exists = sip_storage.exists(category.name) + category_dir_list = list(sip_storage.list(category)) + return category_dir_exists and bool(category_dir_list) + def retrieve( self, stage: StagingDirectories, - category: FileCategoryDirectories, + category: FileCategories, relpath: Path, ): stage_storage = self.get_stage_storage(stage) relpath = Path(category.name, relpath) return 
stage_storage.open(str(relpath)) - def delete(self, category: FileCategoryDirectories, relpath: Path): - originals_storage = self.get_originals_storage() - sip_storage = self.get_sip_storage() - relpath = Path(category.name, relpath) - logs = MessageGroup() - if originals_storage.is_archive_directory(category): - self.clear_category(category) - else: - if not originals_storage.exists(str(relpath)): - logs.append( - create_fs_message( - f"No file at path {relpath} to delete", - StagingDirectories.originals, - MessageLevels.error, - ) - ) - return logs - logs.append(sip_storage.log_delete(str(relpath))) - logs.append(originals_storage.log_delete(str(relpath))) - return logs - - def _add_to_sip(self, name, content, category: FileCategoryDirectories): + def _add_to_sip(self, name, content, category: FileCategories): sip_storage = self.get_sip_storage() filename = self.originals_dir.joinpath(name) if fs.is_archive(name): @@ -630,20 +743,40 @@ def _add_to_sip(self, name, content, category: FileCategoryDirectories): else: return sip_storage.log_save(name=name, content=content) - def add_category(self, category: FileCategoryDirectories, src): - logger.info("adding category %s", category.name) - originals_storage = self.get_originals_storage() + def build_sip(self) -> MessageGroup: + logger.info("building sip") + originals_storage = self.get_originals_storage(self.originals_dir) + sip_storage = self.get_sip_storage() + sip_storage.clear() + msgs = self._create_msg_group() - for dirpath, dirnames, filenames in os.walk(src): - for filename in filenames: - filename = os.path.join(dirpath, filename) - name = os.path.join(category.name, str(Path(filename).relative_to(src))) - logger.debug("adding file %s", name) - with open(filename, "rb") as content: - msgs.append(originals_storage.log_save(name, content)) + for name in originals_storage.list(): + path = self.originals_dir.joinpath(name) + logger.debug("adding file: %s", path.relative_to(self.originals_dir)) + category = 
get_category(Path(name).parts[0]) + with File(path.open("rb")) as f: + msgs.append( + self._add_to_sip(name=str(name), content=f, category=category) + ) + return msgs - def add(self, category: FileCategoryDirectories, content, name=None): + def rebuild(self) -> MessageGroup: + """rebuild the submission package and archive if it already exists""" + msgs = self.build_sip() + self.create_or_update_metadata_files(force=True) + # only rebuild the archive package if it already exists + if self.aip_dir.exists(): + self.build_archive(force=True) + return msgs + + def clear_category(self, category: FileCategories): + originals_storage = self.get_originals_storage() + originals_storage.clear_category(category) + sip_storage = self.get_sip_storage() + sip_storage.clear_category(category) + + def add(self, category: FileCategories, content, name=None): if name is None: name = os.path.join(category.name, content.name) else: @@ -669,7 +802,7 @@ def copy_originals(self, source_release): self.identifier, ) source_fs_api = source_release.get_fs_api() - for category in FileCategoryDirectories: + for category in FileCategories: source_files = source_fs_api.list(StagingDirectories.originals, category) for relpath in source_files: with source_fs_api.retrieve( @@ -677,67 +810,606 @@ def copy_originals(self, source_release): ) as file_content: self.add(category, file_content, name=relpath) - def get_or_create_sip_bag(self, bagit_info=None): - sip_dir = str(self.sip_dir) - logger.info("creating bagit metadata at %s", sip_dir) - bag = fs.make_bag(sip_dir, bagit_info) - bag.save(manifests=True) - return bag + def delete(self, category: FileCategories, relpath: Path): + originals_storage = self.get_originals_storage() + sip_storage = self.get_sip_storage() + relpath = Path(category.name, relpath) + logs = MessageGroup() + if originals_storage.is_archive_directory(category): + self.clear_category(category) + else: + if not originals_storage.exists(str(relpath)): + logs.append( + 
create_fs_message( + f"No file at path {relpath} to delete", + StagingDirectories.originals, + MessageLevels.error, + ) + ) + return logs + logs.append(sip_storage.log_delete(str(relpath))) + logs.append(originals_storage.log_delete(str(relpath))) + return logs - def build_sip(self, originals_dir: Optional[str] = None): - logger.info("building sip") - if originals_dir is None: - originals_dir = self.originals_dir - originals_storage = self.get_originals_storage(originals_dir) + +class CategoryManifestManager: + def __init__(self, imported_release_package): + self.imported_release_package = imported_release_package + + @property + def data(self) -> dict: + return self.imported_release_package.category_manifest + + def build(self, file_list: list[Path]): + """generate a manifest from scratch from a list of files (normally sip.list()). + This overwrites the existing manifest + """ + manifest = {} + for name in file_list: + manifest[str(name)] = self._guess_file_category(name) + self.update(manifest) + + def _guess_file_category(self, name: Path) -> str: + """return an appropriate category name for a file based on its extension. 
+ currently defaults to code for all files except pdfs, which can be reasonably assumed to be docs + """ + if name.suffix == ".pdf": + return FileCategories.docs.name + return FileCategories.code.name + + def update(self, manifest): + """save the manifest to the imported release package""" + self.imported_release_package.category_manifest = manifest + self.imported_release_package.save() + + def update_file_category(self, name, category: FileCategories): + manifest = self.data + if name not in manifest: + raise ValueError(f"file {name} not in manifest") + manifest[name] = category.name + self.update(manifest) + + def remove_file(self, name): + manifest = self.data + del manifest[name] + self.update(manifest) + + def add_file(self, name, category: FileCategories = FileCategories.code): + manifest = self.data + manifest[name] = category.name + self.update(manifest) + + def fix_from_list(self, file_list: list[Path]): + """update the manifest to match the file list. This will add any files in the file list that are not in the + manifest, and remove any files in the manifest that are not in the file list + """ + manifest = self.data + for name in file_list: + if name not in manifest: + manifest[name] = self._guess_file_category(name) + for name in list(manifest.keys()): + if name not in file_list: + del manifest[name] + self.update(manifest) + + +class ImportedCodebaseReleaseFsApi(BaseCodebaseReleaseFsApi): + """ + File system API for managing an imported (i.e. from a GitHub release) codebase release. + + NOTE: This is not currently protected against concurrent file access but only the submitter can edit files + associated with a codebase release at the moment. 
Will need to implement file locks if/when this assumption fails to + hold + """ + + def __init__( + self, + codebase_release, + system_file_presence_message_level=MessageLevels.error, + mimetype_mismatch_message_level=MessageLevels.error, + ): + self.imported_release_package = codebase_release.imported_release_package + if not self.imported_release_package: + raise ValueError("CodebaseRelease must be an imported release") + super().__init__( + codebase_release, + system_file_presence_message_level, + mimetype_mismatch_message_level, + ) + self.imported_release_package = codebase_release.imported_release_package + self.manifest = CategoryManifestManager(self.imported_release_package) + + def list(self, stage=StagingDirectories.sip, category=None): + if category is not None: + return [ + str(relpath) + for relpath, cat in self.manifest.data.items() + if cat == category.name + ] + else: + return list(self.manifest.data.keys()) + + def list_sip_contents(self, path=None): + """recursively build a tree representing the SIP contents. 
def download_archive(self, download_url: str, installation_token: str) -> Path:
    """Download a release package archive into the originals storage.

    The originals storage directory is created if needed and cleared before
    the download starts. The archive is streamed to disk in 8 KiB chunks.

    The file name is taken from the ``Content-Disposition`` response header
    when it carries a usable ``filename=`` value; otherwise it falls back to
    ``<tag_name>.zip`` of the imported release package. In both cases only a
    bare file name is used, so neither the remote header nor a tag containing
    ``/`` can place the file outside the originals directory.

    :param download_url: URL of the archive to fetch
    :param installation_token: token sent as ``Authorization: Bearer ...``
    :returns: path of the downloaded archive
    :raises requests.HTTPError: if the server answers with an error status
    """

    def filename_from_header(content_disposition):
        """Return a bare file name from a Content-Disposition value, or None."""
        match = re.search(r'filename="?([^";]+)', content_disposition or "")
        if not match:
            return None
        # keep only the last path component of whatever the server sent
        candidate = match.group(1).strip().replace("\\", "/").rsplit("/", 1)[-1]
        return candidate if candidate not in ("", ".", "..") else None

    originals_storage = self.get_originals_storage()
    os.makedirs(originals_storage.location, exist_ok=True)
    originals_storage.clear()
    headers = {
        "Authorization": f"Bearer {installation_token}",
    }
    # (connect, read) timeouts: without them a stalled server blocks forever;
    # the context manager releases the streamed connection in every case
    with requests.get(
        download_url, headers=headers, stream=True, timeout=(10, 120)
    ) as response:
        response.raise_for_status()
        filename = filename_from_header(response.headers.get("content-disposition"))
        if filename is None:
            tag_name = str(self.imported_release_package.tag_name)
            # tags such as "release/1.0" must not turn into sub-directories
            filename = f"{tag_name.replace('/', '_')}.zip"
        file_path = Path(originals_storage.location) / filename
        with file_path.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    logger.info(f"downloaded imported release archive to {file_path}")
    return file_path
+ + returns a tuple of dicts representing extracted metadata from known metadata files found in the archive, + currently: (codemeta.json, CITATION.cff) + + NOTE: currently only supports zip archives + """ + if download_url is None: + download_url = self.imported_release_package.download_url + archive_path = self.download_archive(download_url, installation_token) + self.extract_to_sip(archive_path) + sip_contents = list(self.get_sip_storage().list()) + self.manifest.build(sip_contents) + return self._extract_metadata_files(sip_contents) + + def _extract_metadata_files(self, sip_contents) -> tuple[dict, dict]: + """searches the extracted archive for known metadata files and returns their contents + + returns a tuple of dicts, currently: (codemeta.json, CITATION.cff) + """ + + def find_file(file_list, target: str) -> Path | None: + """ + search for a target file in the provided list of paths. + target is case-insensitive + """ + # check files in the root first + for f in file_list: + if len(f.parts) == 1 and f.name.lower() == target.lower(): + return f + for f in file_list: + if f.name.lower() == target.lower(): + return f + return None + + codemeta_path = find_file(sip_contents, "codemeta.json") + cff_path = find_file(sip_contents, "CITATION.cff") + codemeta = None + cff = None + + if codemeta_path: + try: + with self.get_sip_storage().open(str(codemeta_path), mode="r") as f: + file_content = f.read() + parsed = json.loads(file_content) + codemeta = parsed if isinstance(parsed, dict) else None + except Exception: + codemeta = None + + if cff_path: + try: + with self.get_sip_storage().open(str(cff_path), mode="r") as f: + file_content = f.read() + parsed = yaml.safe_load(file_content) + cff = parsed if isinstance(parsed, dict) else None + except Exception: + cff = None + + return codemeta, cff + + +class CodebaseGitRepositoryApi: + """ + Manage a (local) git repository mirror of a codebase + """ + + FILE_SIZE_LIMIT = settings.GITHUB_INDIVIDUAL_FILE_SIZE_LIMIT + 
@contextmanager
def use_temporary_repo(self, from_existing=False):
    """Run git operations against a temporary copy of the repository.

    While the ``with`` block runs, ``self.repo_dir`` points at a fresh
    temporary directory. If the block finishes without raising, the original
    directory is replaced by the temporary one (removed, then copied back).
    If the block raises, nothing is copied back and the original directory is
    left as it was.

    ``self.repo_dir`` is restored to the original path in every case, so the
    object never keeps pointing at the deleted temporary directory after a
    failure.

    NOTE: the copy back is a remove-then-copy, so it is not atomic at the
    file-system level.

    :param from_existing: if True, seed the temporary directory with a copy
        of the current repository and connect to it via
        ``initialize(should_exist=True)``
    """
    original_repo_dir = self.repo_dir
    try:
        with TemporaryDirectory() as tmpdir:
            self.repo_dir = Path(tmpdir)
            if from_existing:
                shutil.copytree(original_repo_dir, self.repo_dir, dirs_exist_ok=True)
                self.initialize(should_exist=True)
            yield
            # only reached when the managed block completed without raising
            if original_repo_dir.exists():
                shutil.rmtree(original_repo_dir)
            shutil.copytree(self.repo_dir, original_repo_dir, dirs_exist_ok=True)
    finally:
        self.repo_dir = original_repo_dir
def commit_release(self, release, tag=True):
    """Commit the staged release and optionally tag it.

    Call this only after every file of the release has been added to the
    index. When the default branch already exists it is checked out first so
    the commit lands there; on the very first commit there is no such branch
    yet and nothing is checked out.

    The message is ``Release <version>`` followed by the release notes and
    one ``Co-authored-by`` trailer per co-author. A co-author's address is the
    GitHub no-reply address when their user has a GitHub username, and is
    left empty otherwise.

    :param release: the release being committed
    :param tag: when true, create a tag named after the version number
    :returns: the commit returned by the index
    """
    if self.DEFAULT_BRANCH_NAME in self.repo.heads:
        self.checkout_main()

    trailers = []
    for release_contributor in release.coauthor_release_contributors:
        person = release_contributor.contributor
        address = ""
        if person.user and person.user.member_profile.github_username:
            address = (
                f"{person.user.member_profile.github_username}@users.noreply.github.com"
            )
        trailers.append(f"\nCo-authored-by: {person.name} <{address}>")

    header = f"Release {release.version_number}\n\n{release.release_notes.raw}\n"
    commit = self.repo.index.commit(
        message=header + "".join(trailers),
        committer=self.committer,
        author=self.author,
        author_date=release.last_published_on,
    )
    if tag:
        self.repo.create_tag(f"{release.version_number}")
    return commit
points to the + # same thing as main) and should merge back into main + release_branch = self.repo.heads[release_branch_name] + main_branch = self.repo.heads[self.DEFAULT_BRANCH_NAME] + merge_into_main = (main_branch.commit == release_branch.commit) and ( + main_branch.commit == self.repo.head.commit + ) + + self.repo.git.checkout(release_branch_name) + self.add_release_files(release) + self.add_readme(release) + + # check for changes before committing + if not self.repo.is_dirty(): + return None + + commit_msg = f"Update metadata for release {release.version_number}" + self.repo.index.commit( + message=commit_msg, + committer=self.committer, + author=self.author, + author_date=timezone.now(), + ) + if merge_into_main: + self.checkout_main() + try: + self.repo.git.merge("--ff-only", release_branch_name) + except Exception as e: + logger.error( + f"Unexpected divergence when trying to merge {release_branch_name} into {self.DEFAULT_BRANCH_NAME}: {e}" + ) + self.checkout_main() + + return Repo(self.repo_dir) + + def append_releases(self, releases=None) -> Repo: + """ + add new releases to the git repository. 
+ releases must be newer/higher than the latest mirrored release so that they can be added on top + + this should only be used if no releases have been removed or otherwise modified since these require + rewriting history and this method strictly appends new releases + + :param releases: list of releases to append, if None, all unmirrored releases will be appended + """ + self.check_file_sizes(self.codebase) + if not releases: + releases = self.mirror.unbuilt_releases + if not releases: + # nothing to do, return the existing repo + return Repo(self.repo_dir) + with self.use_temporary_repo(from_existing=True): + # make sure the releases are higher than the latest mirrored release + if not all( + Version(release.version_number) + > Version(self.mirror.latest_built_release.version_number) + for release in releases + ): + raise ValueError( + "Releases must be higher than the latest mirrored release to append" + ) + # make sure the releases are ordered by version number + releases = sorted(releases, key=lambda r: Version(r.version_number)) + # append releases to the git repo by adding files, committing, and creating a branch + for release in releases: + self.add_release_files(release) + self.add_readme(release) + commit = self.commit_release(release) + self.create_release_branch(release, commit) + self.checkout_main() + # record newly mirrored releases and update timestamp + self.mirror.mark_releases_built(releases) + return Repo(self.repo_dir) + + def build(self) -> Repo: + """ + builds or rebuilds the git repository from codebase releases + + this will create an entirely new repository and should only be used if we are creating the + mirror for the first time or need to rebuild the entire history + """ + self.check_file_sizes(self.codebase) + releases = self.codebase.ordered_releases_list(internal_only=True) + if not releases: + raise ValidationError("Must have at least one public release to build from") + with self.use_temporary_repo(): + self.initialize() + for 
def dirs_equal(self, dir1: Path, dir2: Path, ignore=(".git",)):
    """Return True if two directory trees have the same contents.

    The trees are walked with ``filecmp.dircmp``. They are unequal as soon as
    any level has a name that exists on one side only, a file that differs, a
    file that could not be compared, or a name that is a file on one side and
    a directory on the other.

    ``ignore`` is applied at every level of the walk, not only at the top.
    Passing ``None`` falls back to ``filecmp``'s own default ignore list.

    NOTE: ``dircmp`` compares files shallowly (by ``os.stat`` signature
    first), so this is a cheap check rather than a byte-for-byte guarantee.

    :param dir1: first directory
    :param dir2: second directory
    :param ignore: names to skip in every directory (default: just ``.git``)
    """
    # dircmp concatenates this with an internal list, so it must be a list
    ignore_names = None if ignore is None else list(ignore)
    pending = [(Path(dir1), Path(dir2))]
    while pending:
        left, right = pending.pop()
        comparison = filecmp.dircmp(left, right, ignore=ignore_names)
        if (
            comparison.left_only
            or comparison.right_only
            or comparison.diff_files
            or comparison.funny_files
            # same name but different type (file vs. directory)
            or comparison.common_funny
        ):
            return False
        pending.extend(
            (left / subdir, right / subdir) for subdir in comparison.common_dirs
        )
    return True
def extract_zip_without_top_dir(zip_path: Path, extract_to: Path):
    """Extract a zip archive into ``extract_to``, dropping a wrapping top-level directory.

    If every entry of the archive lives under one and the same top-level
    directory (as in GitHub "zipball" archives), that directory is stripped
    so its contents end up directly in ``extract_to``. Archives without such
    a wrapper (several top-level entries, or a single top-level file) are
    extracted unchanged.

    The wrapper is detected on whole path components, never on a shared
    character prefix, so ``data1.txt`` + ``data2.txt`` are not mistaken for
    members of a directory called ``data``.

    :param zip_path: path of the zip archive
    :param extract_to: destination directory, created as needed
    :raises ValueError: if a member name contains a ``..`` component; this is
        checked for all members before anything is written
    """
    extract_to = Path(extract_to)
    with zipfile.ZipFile(zip_path, "r") as archive:
        entries = []
        for member in archive.infolist():
            # zip member names always use "/"; dropping empty and "." parts
            # also turns absolute names into relative ones
            parts = [p for p in member.filename.split("/") if p not in ("", ".")]
            if ".." in parts:
                raise ValueError(
                    f"Archive contains an unsafe path: {member.filename}"
                )
            if parts:
                entries.append((member, parts))

        top_level_names = {parts[0] for _, parts in entries}
        strip_top = len(top_level_names) == 1 and not any(
            len(parts) == 1 and not member.is_dir() for member, parts in entries
        )

        for member, parts in entries:
            relative_parts = parts[1:] if strip_top else parts
            if not relative_parts:  # the wrapper directory entry itself
                continue
            target_path = extract_to.joinpath(*relative_parts)
            if member.is_dir():
                target_path.mkdir(parents=True, exist_ok=True)
            else:
                target_path.parent.mkdir(parents=True, exist_ok=True)
                # stream instead of loading the whole member into memory
                with archive.open(member) as src, target_path.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
def validate_format(self):
    """Check that ``self.repo_name`` is an acceptable repository name.

    Rules, checked in this order:

    * only ASCII letters, digits, ``.``, ``-`` and ``_`` (at least one character)
    * at most 100 characters
    * must not end with ``.git``
    * must not contain ``github``

    The last two rules ignore letter case, so ``GitHub-model`` and
    ``model.GIT`` are rejected just like their lower-case spellings.

    :raises ValueError: with a user-facing message for the first rule violated
    """
    name = self.repo_name
    if not re.fullmatch(r"[A-Za-z0-9_.-]+", name):
        raise ValueError(
            "The repository name can only contain ASCII letters, digits, and the characters ., -, and _"
        )
    if not (1 <= len(name) <= 100):
        raise ValueError("Repository name is too long (maximum is 100 characters)")
    lowered = name.lower()
    if lowered.endswith(".git"):
        raise ValueError("Repository name cannot end with '.git'")
    if "github" in lowered:
        raise ValueError("Repository name cannot contain 'github'")
GitHubApi.get_existing_repo( + token, + full_name, + ) + try: + # this should raise a 404 if the repository is empty + github_repo.get_contents("") + raise ValueError( + f"Repository at https://github.com/{full_name} is not empty" + ) + except GithubException as e: + if e.status == 404: + return True + raise e + + +class GitHubApi: + """Functionality for interacting with a remote Github repository + and Github API + """ + + def __init__( + self, + codebase: Codebase, + remote: CodebaseGitRemote, + local_repo: GitRepo, + ): + self.codebase = codebase + self.remote = remote + self.local_repo = local_repo + self._github_repo = None + + @property + def repo_owner(self): + return self.remote.owner + + @property + def repo_name(self): + return self.remote.repo_name + + @property + def is_user_repo(self): + return self.remote.is_user_repo + + @property + def github_repo(self) -> GithubRepo: + if not self._github_repo: + full_name = f"{self.repo_owner}/{self.repo_name}" + self._github_repo = self.get_existing_repo( + self.installation_access_token, + full_name, + ) + return self._github_repo + + @property + def installation_access_token(self): + if self.is_user_repo: + return self.get_user_installation_access_token(self.remote.installation) + return self.get_org_installation_access_token() + + @staticmethod + def get_user_installation_access_token( + installation: GithubIntegrationAppInstallation | None, + ) -> str | None: + if not installation: + return None + auth = Auth.AppAuth( + settings.GITHUB_INTEGRATION_APP_ID, + settings.GITHUB_INTEGRATION_APP_PRIVATE_KEY, + ) + integration = GithubIntegration(auth=auth) + installation_auth = integration.get_access_token(installation.installation_id) + return installation_auth.token + + @classmethod + def get_org_installation_access_token(cls) -> str: + cached_token = cache.get(INSTALLATION_ACCESS_TOKEN_REDIS_KEY) + if cached_token: + return cached_token + return cls.refresh_org_installation_access_token() + + @staticmethod + def 
def get_or_create_repo(self) -> GithubRepo:
    """Return the GitHub repository for this remote, creating it if allowed.

    The existing repository is returned when ``self.github_repo`` can be
    resolved. If that lookup fails, a repository is created in the
    organization via ``_create_org_repo`` and cached on ``self._github_repo``.
    User-owned repositories are never created here.

    Only ordinary errors (``Exception``) from the lookup trigger the
    fallback; ``KeyboardInterrupt`` and ``SystemExit`` propagate untouched.

    :raises ValueError: if the repository cannot be resolved and it is user-owned
    """
    try:
        return self.github_repo
    except Exception:
        if self.is_user_repo:
            raise ValueError("User-owned repositories must be created beforehand")
        self._github_repo = self._create_org_repo()
        return self._github_repo
GithubRepo: + """attempt to get an existing repository for the authenticated user or organization""" + github = Github(access_token) + try: + return github.get_repo(full_name) + except: + raise ValueError( + f"Github repository https://github.com/{full_name} does not exist or is inaccessible" + ) + + def _create_org_repo(self): + """create a new repository in the CoMSES model library organization + + this function requires the `repo` scope for the installation access token + """ + token = self.installation_access_token + github = Github(token) + org = github.get_organization(settings.GITHUB_MODEL_LIBRARY_ORG_NAME) + repo = org.create_repo( + name=self.repo_name, + description=f"Mirror of {self.codebase.permanent_url}", + ) + return repo + + def _push_to_url(self, local_repo: GitRepo, push_url: str) -> str: + if "origin" not in local_repo.remotes: + local_repo.create_remote("origin", push_url) + else: + local_repo.remotes["origin"].set_url(push_url) + # https://gitpython.readthedocs.io/en/stable/reference.html#git.remote.PushInfo + remote = local_repo.remote(name="origin") + result_all = remote.push(all=True) + result_tags = remote.push(tags=True) + timestamp = f"[{timezone.now().isoformat()}]:\n" + summaries = [] + success_mask = PushInfo.NEW_HEAD | PushInfo.FAST_FORWARD | PushInfo.UP_TO_DATE + for info in result_all: + if info: # result will be None if the push failed entirely + if info.flags & success_mask: + summaries.append(f"branch ({info.local_ref}): successfully pushed") + else: + summaries.append( + f"branch ({info.local_ref}): did not push, likely due to changes in GitHub repository" + ) + if not summaries: + return timestamp + "push failed entirely" + return timestamp + "\n".join(summaries) + + +class GitHubReleaseImporter: + def __init__(self, payload: dict): + # https://docs.github.com/en/webhooks/webhook-events-and-payloads?actionType=released#release + github_action = payload.get("action") + if github_action == "released": + # release was published, 
def import_new_release(self) -> bool:
    """Create a new unpublished codebase release from the GitHub release payload.

    Steps:

    1. refuse if a release imported from the same GitHub release id exists
    2. take the version number from the release tag, falling back to the
       release name
    3. refuse if no version is found or that version already exists
    4. create the ``ImportedReleasePackage`` and ``CodebaseRelease`` in one
       transaction and add the codebase submitter as a contributor
    5. import the package contents and metadata

    The payload's ``tag_name`` / ``name`` may be present but null, in which
    case ``dict.get(key, "")`` still returns ``None``; such values are
    treated as empty strings before they reach ``extract_semver`` or are
    stored as the display name.

    :returns: True on success; False (after logging the reason through
        ``log_failure``) otherwise
    """
    # make sure the release doesn't already exist as imported release
    if self.codebase.releases.filter(
        imported_release_package__uid=self.github_release_id
    ).exists():
        return self.log_failure("Release already exists")

    # determine version number, make sure it doesn't already exist
    tag_name = self.github_release.get("tag_name") or ""
    display_name = self.github_release.get("name") or ""
    version_number = self.extract_semver(tag_name) or self.extract_semver(
        display_name
    )
    if not version_number:
        return self.log_failure(
            "Missing a semantic version number (X.X.X) in the release tag or name"
        )
    if self.codebase.releases.filter(version_number=version_number).exists():
        return self.log_failure(
            f"Release with version {version_number} already exists"
        )

    # create a new imported codebase release
    with transaction.atomic():
        package = ImportedReleasePackage.objects.create(
            uid=self.github_release_id,
            service=ImportedReleasePackage.Services.GITHUB,
            name=self.github_release.get("tag_name"),
            display_name=display_name,
            html_url=self.github_release.get("html_url", ""),
            download_url=self.github_release.get("zipball_url", ""),
            extra_data=self.github_release,
        )
        release = CodebaseRelease.objects.create(
            codebase=self.codebase,
            submitter=self.codebase.submitter,
            status=CodebaseRelease.Status.UNPUBLISHED,
            share_uuid=uuid.uuid4(),
            version_number=version_number,
            imported_release_package=package,
        )
        # add submitter as a release contributor automatically
        contributor, _created = Contributor.from_user(self.codebase.submitter)
        release.add_contributor(contributor)

    return self._import_package_and_metadata(release)
self._import_package_and_metadata(release) + + def _import_package_and_metadata(self, release) -> bool: + # import the release package + fs_api = release.get_fs_api() + codemeta, cff = fs_api.import_release_package(self.installation_token) + + # extract metadata from the release package and save it to the release + release_fields = ReleaseMetadataConverter( + codemeta, cff, self.repository, self.github_release + ).convert() + license_spdx_id = release_fields.pop("license_spdx_id", None) + platforms = release_fields.pop("platforms", []) + programming_languages = release_fields.pop("programming_languages", []) + for key, value in release_fields.items(): + setattr(release, key, value) + license = License.objects.filter(name=license_spdx_id).first() + if license: + release.license = license + if platforms: + release.platform_tags.add(*platforms) + if programming_languages: + release.programming_languages.add(*programming_languages) + release.save() + + return self.log_success() + + def extract_semver(self, value) -> str | None: + match = re.search(r"v?(\d+\.\d+\.\d+)", value) + return match.group(1) if match else None + + def log_failure(self, message: str): + self._log( + f"Failed to {'' if self.is_new_github_release else 're-'}import release {self.github_release.get('name')}:\n{message}" + ) + return False + + def log_success(self): + self._log( + f"Successfully {'' if self.is_new_github_release else 're-'}imported release {self.github_release.get('name')}" + ) + return True + + def _log(self, message: str): + timestamp = f"[{timezone.now().isoformat()}]:\n" + self.remote.last_import_log = timestamp + message + self.remote.save() diff --git a/django/library/jinja2/library/codebases/git.jinja b/django/library/jinja2/library/codebases/git.jinja new file mode 100644 index 000000000..cd91ab61e --- /dev/null +++ b/django/library/jinja2/library/codebases/git.jinja @@ -0,0 +1,41 @@ +{% extends "base.jinja" %} +{% from "common.jinja" import breadcrumb %} + +{% block title 
%}GitHub Sync{% endblock %} + +{% block introduction %}
+ GitHub Sync allows you to connect your model in the CoMSES Model Library (CML) to a GitHub + repository. +
++ When creating a new repository here, a git repo will be automatically built from the public + releases of your model and pushed to GitHub. This will be updated every time you + publish a new release or update the metadata of an existing one, until it is disabled. +
++ Changes made to synced repositories on GitHub can be automatically pulled back into the CML by + enabling importing from GitHub and creating a new release on GitHub. This works + similarly to the Zenodo GitHub integration. +
++ Learn more about how it works +
+