From c6e5b56f4aff898f3bbc2214631bd09d9697f91c Mon Sep 17 00:00:00 2001 From: sgfost Date: Thu, 9 Jan 2025 14:18:07 -0700 Subject: [PATCH 01/66] feat: add model for tracking git mirror of a codebase --- ...3_codebasegitmirror_codebase_git_mirror.py | 69 ++++++++++++++++ django/library/models.py | 79 +++++++++++++++++++ 2 files changed, 148 insertions(+) create mode 100644 django/library/migrations/0033_codebasegitmirror_codebase_git_mirror.py diff --git a/django/library/migrations/0033_codebasegitmirror_codebase_git_mirror.py b/django/library/migrations/0033_codebasegitmirror_codebase_git_mirror.py new file mode 100644 index 000000000..3d8510af1 --- /dev/null +++ b/django/library/migrations/0033_codebasegitmirror_codebase_git_mirror.py @@ -0,0 +1,69 @@ +# Generated by Django 4.2.17 on 2025-01-09 20:57 + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ("library", "0032_license_text_codemeta_snapshot"), + ] + + operations = [ + migrations.CreateModel( + name="CodebaseGitMirror", + fields=[ + ( + "id", + models.AutoField( + auto_created=True, + primary_key=True, + serialize=False, + verbose_name="ID", + ), + ), + ("date_created", models.DateTimeField(auto_now_add=True)), + ("last_modified", models.DateTimeField(auto_now=True)), + ("repository_name", models.CharField(max_length=100, unique=True)), + ( + "remote_url", + models.URLField( + blank=True, help_text="URL of mirrored remote repository" + ), + ), + ("last_local_update", models.DateTimeField(blank=True, null=True)), + ("last_remote_update", models.DateTimeField(blank=True, null=True)), + ( + "user_access_token", + models.CharField(blank=True, max_length=200, null=True), + ), + ( + "organization_login", + models.CharField(blank=True, max_length=100, null=True), + ), + ( + "local_releases", + models.ManyToManyField( + related_name="+", to="library.codebaserelease" + ), + ), + ( + "remote_releases", + 
models.ManyToManyField( + related_name="+", to="library.codebaserelease" + ), + ), + ], + ), + migrations.AddField( + model_name="codebase", + name="git_mirror", + field=models.OneToOneField( + null=True, + on_delete=django.db.models.deletion.SET_NULL, + related_name="codebase", + to="library.codebasegitmirror", + ), + ), + ] diff --git a/django/library/models.py b/django/library/models.py index e91846749..51da5fbe3 100644 --- a/django/library/models.py +++ b/django/library/models.py @@ -616,6 +616,64 @@ def updated_after(self, start_date, end_date=None, **kwargs): return new_codebases, updated_codebases, releases +class CodebaseGitMirror(models.Model): + """ + Keeps track of a git repository and its GitHub remote that were created + from a Codebase using the mirror (read-only archiving) workflow + """ + + # is_active = models.BooleanField(default=True) + date_created = models.DateTimeField(auto_now_add=True) + last_modified = models.DateTimeField(auto_now=True) + repository_name = models.CharField(max_length=100, unique=True) + remote_url = models.URLField( + blank=True, + help_text=_("URL of mirrored remote repository"), + ) + # keep track of timestamp and releases that have been mirrored locally + last_local_update = models.DateTimeField(null=True, blank=True) + local_releases = models.ManyToManyField("CodebaseRelease", related_name="+") + # keep track of timestamp and releases that have been synced to the remote + last_remote_update = models.DateTimeField(null=True, blank=True) + remote_releases = models.ManyToManyField("CodebaseRelease", related_name="+") + user_access_token = models.CharField(max_length=200, null=True, blank=True) + organization_login = models.CharField(max_length=100, null=True, blank=True) + + @property + def latest_local_release(self): + return max(self.local_releases.all(), key=lambda r: Version(r.version_number)) + + @property + def latest_remote_release(self): + return max(self.remote_releases.all(), key=lambda r: 
Version(r.version_number)) + + @property + def unmirrored_local_releases(self): + return self.codebase.public_releases().exclude( + id__in=self.local_releases.values_list("id", flat=True) + ) + + @property + def unmirrored_remote_releases(self): + return self.local_releases.exclude( + id__in=self.remote_releases.values_list("id", flat=True) + ) + + def update_local_releases(self, new_releases: models.QuerySet | list): + if self.local_releases.exists(): + self.local_releases.add(*new_releases) + else: + self.local_releases.set(new_releases) + self.last_local_update = timezone.now() + self.save() + + def update_remote_releases(self): + releases = self.local_releases.all() + self.remote_releases.set(releases) + self.last_remote_update = timezone.now() + self.save() + + @add_to_comses_permission_whitelist class Codebase(index.Indexed, ModeratedContent, ClusterableModel): """ @@ -655,6 +713,13 @@ class Codebase(index.Indexed, ModeratedContent, ClusterableModel): on_delete=models.SET_NULL, ) + git_mirror = models.OneToOneField( + "CodebaseGitMirror", + null=True, + related_name="codebase", + on_delete=models.SET_NULL, + ) + repository_url = models.URLField( blank=True, help_text=_( @@ -833,6 +898,16 @@ def base_library_dir(self): def base_git_dir(self): return pathlib.Path(settings.REPOSITORY_ROOT, str(self.uuid)) + def create_git_mirror(self, repository_name, **kwargs): + if not self.git_mirror: + git_mirror = CodebaseGitMirror.objects.create( + repository_name=repository_name, + **kwargs, + ) + self.git_mirror = git_mirror + self.save() + return self.git_mirror + @property def publication_year(self): return ( @@ -1911,6 +1986,10 @@ def publish(self): schedule_mint_public_doi( self.id, dry_run=settings.DEPLOY_ENVIRONMENT.is_development ) + if self.codebase.git_mirror: + from .tasks import update_mirrored_codebase + + transaction.on_commit(lambda: update_mirrored_codebase(self.codebase.id)) def _publish(self): if not self.live: From 
43147940588b5a1829f5dd4d639770f69df8de05 Mon Sep 17 00:00:00 2001 From: sgfost Date: Thu, 9 Jan 2025 14:26:45 -0700 Subject: [PATCH 02/66] feat: add codebase git repository fs api this API is responsible for managing a local git repository mirror for a comses codebase. PUBLIC release archives are commits/tags in the history. Release branches are created for each release and only added to if there is an update to metadata `build()` and `append_releases()` are the two main API methods which construct (or rebuild) a git repo and add new releases to the repo, respectively `update_release_branch()` will add a new commit containing changes to a release branch (and update main if they point to the same thing). This will mainly be used for updating metadata --- django/core/models.py | 8 + django/core/tests/base.py | 8 + django/curator/tests/test_dump_restore.py | 4 +- django/library/fs.py | 342 +++++++++++++++++- django/library/tests/base.py | 23 +- .../tests/{ => samples}/archives/.gitignore | 0 .../tests/{ => samples}/archives/invalid.zip | 0 .../archives/nestedcode/.DS_store} | 0 .../archives/nestedcode/.svn/svn_files_here | 0 .../archives/nestedcode/README.md | 0 .../archives/nestedcode/src/ex.py | 0 .../animals-model/1.0.0/code/model.py | 1 + .../animals-model/1.0.0/data/input.csv | 2 + .../animals-model/1.0.0/docs/README.txt | 3 + .../animals-model/1.0.0/results/analysis.txt | 1 + .../animals-model/2.0.0/code/animals/cow.py | 3 + .../animals-model/2.0.0/code/animals/horse.py | 3 + .../animals-model/2.0.0/code/animals/sheep.py | 3 + .../animals-model/2.0.0/code/model.py | 1 + .../animals-model/2.0.0/data/input.csv | 2 + .../animals-model/2.0.0/docs/README.md | 5 + django/library/tests/test_fs.py | 174 ++++++++- django/requirements.txt | 2 + 23 files changed, 568 insertions(+), 17 deletions(-) rename django/library/tests/{ => samples}/archives/.gitignore (100%) rename django/library/tests/{ => samples}/archives/invalid.zip (100%) rename 
django/library/tests/{archives/nestedcode/.DS_Store => samples/archives/nestedcode/.DS_store} (100%) rename django/library/tests/{ => samples}/archives/nestedcode/.svn/svn_files_here (100%) rename django/library/tests/{ => samples}/archives/nestedcode/README.md (100%) rename django/library/tests/{ => samples}/archives/nestedcode/src/ex.py (100%) create mode 100644 django/library/tests/samples/releases/animals-model/1.0.0/code/model.py create mode 100644 django/library/tests/samples/releases/animals-model/1.0.0/data/input.csv create mode 100644 django/library/tests/samples/releases/animals-model/1.0.0/docs/README.txt create mode 100644 django/library/tests/samples/releases/animals-model/1.0.0/results/analysis.txt create mode 100644 django/library/tests/samples/releases/animals-model/2.0.0/code/animals/cow.py create mode 100644 django/library/tests/samples/releases/animals-model/2.0.0/code/animals/horse.py create mode 100644 django/library/tests/samples/releases/animals-model/2.0.0/code/animals/sheep.py create mode 100644 django/library/tests/samples/releases/animals-model/2.0.0/code/model.py create mode 100644 django/library/tests/samples/releases/animals-model/2.0.0/data/input.csv create mode 100644 django/library/tests/samples/releases/animals-model/2.0.0/docs/README.md diff --git a/django/core/models.py b/django/core/models.py index 2c394d0fe..be945edc2 100644 --- a/django/core/models.py +++ b/django/core/models.py @@ -466,6 +466,14 @@ def github_url(self): """ return self.get_social_account_profile_url("github") + @property + def github_username(self): + github_account = self.get_social_account("github") + if github_account: + return github_account.extra_data.get("login") + else: + return None + def get_social_account_profile_url(self, provider_name): social_acct = self.get_social_account(provider_name) if social_acct: diff --git a/django/core/tests/base.py b/django/core/tests/base.py index 7d8852a21..1eb58bf09 100644 --- a/django/core/tests/base.py +++ 
b/django/core/tests/base.py @@ -187,5 +187,13 @@ def initialize_test_shared_folders(): ) +def clear_test_shared_folder(dir=settings.REPOSITORY_ROOT): + for fs in os.scandir(dir): + if fs.is_dir(): + shutil.rmtree(os.path.join(dir, fs.name), ignore_errors=True) + elif fs.is_file(): + os.remove(os.path.join(dir, fs.name)) + + def destroy_test_shared_folders(): shutil.rmtree(settings.SHARE_DIR, ignore_errors=True) diff --git a/django/curator/tests/test_dump_restore.py b/django/curator/tests/test_dump_restore.py index 0d232ebb9..b7b71a39b 100644 --- a/django/curator/tests/test_dump_restore.py +++ b/django/curator/tests/test_dump_restore.py @@ -19,7 +19,7 @@ from core.tests.base import EventFactory, JobFactory from library.fs import import_archive from library.models import Codebase -from library.tests.base import CodebaseFactory +from library.tests.base import CodebaseFactory, TEST_SAMPLES_DIR logger = logging.getLogger(__name__) @@ -51,7 +51,7 @@ def setUp(self): fs_api = self.release.get_fs_api() import_archive( codebase_release=self.release, - nested_code_folder_name="library/tests/archives/nestedcode", + nested_code_folder_name=TEST_SAMPLES_DIR / "archives" / "nestedcode", fs_api=fs_api, ) diff --git a/django/library/fs.py b/django/library/fs.py index bbd13f6b5..9eee5a731 100644 --- a/django/library/fs.py +++ b/django/library/fs.py @@ -1,3 +1,5 @@ +import json +import yaml import logging import mimetypes import os @@ -5,11 +7,15 @@ import shutil import tarfile import zipfile +import filecmp +from contextlib import contextmanager +from packaging.version import Version from enum import Enum from functools import total_ordering from pathlib import Path from tempfile import TemporaryDirectory -from typing import Optional +from typing import Callable, Optional +from git import Actor, GitCommandError, InvalidGitRepositoryError, Repo import bagit import rarfile @@ -17,6 +23,7 @@ from django.core.files.storage import FileSystemStorage from django.core.files.uploadedfile 
import File from django.urls import reverse +from django.utils import timezone from rest_framework.exceptions import ValidationError from core import fs @@ -794,6 +801,339 @@ def rebuild(self, metadata_only=False) -> MessageGroup: return msgs +class CodebaseGitRepositoryApi: + """ + Manage a (local) git repository mirror of a codebase + """ + + FILE_SIZE_LIMIT = settings.GITHUB_INDIVIDUAL_FILE_SIZE_LIMIT + MEGABYTE = 1024 * 1024 + FILE_SIZE_LIMIT_MB = FILE_SIZE_LIMIT / MEGABYTE + DEFAULT_BRANCH_NAME = "main" + RELEASE_BRANCH_PREFIX = "release/" + + def __init__(self, codebase): + self.codebase = codebase + self.mirror = codebase.git_mirror + if not self.mirror: + raise ValueError("Codebase must have a git_mirror") + self.repo_dir = Path(self.codebase.base_git_dir, str(self.repo_name)).absolute() + + @property + def repo_name(self): + return self.mirror.repository_name + + @property + def committer(self): + return Actor("CoMSES Net", settings.EDITOR_EMAIL) + + @property + def author(self): + profile = self.codebase.submitter.member_profile + author_email = ( + f"{profile.github_username}@users.noreply.github.com" + if profile.github_username + else profile.email + ) + return Actor(profile.name, author_email) + + def get_release_branch_name(self, release): + return f"{self.RELEASE_BRANCH_PREFIX}{release.version_number}" + + @classmethod + def check_file_sizes(cls, codebase): + releases = codebase.ordered_releases_list() + for release in releases: + release_fs_api = release.get_fs_api() + sip_storage = release_fs_api.get_sip_storage() + for file in sip_storage.list(absolute=True): + if file.stat().st_size > cls.FILE_SIZE_LIMIT: + file_size_mb = file.stat().st_size / cls.MEGABYTE + raise ValidationError( + f"File {file} is too large ({file_size_mb}MB), individual files must be under {cls.FILE_SIZE_LIMIT_MB}MB" + ) + + @contextmanager + def use_temporary_repo(self, from_existing=False): + """ + context manager that allows for 'atomic' operations on the git repository + 
by creating a temporary copy and copying it back after the block is executed + """ + original_repo_dir = self.repo_dir + with TemporaryDirectory() as tmpdir: + self.repo_dir = Path(tmpdir) + if from_existing: + shutil.copytree(original_repo_dir, self.repo_dir, dirs_exist_ok=True) + self.initialize(should_exist=True) + yield + if original_repo_dir.exists(): + shutil.rmtree(original_repo_dir) + shutil.copytree(self.repo_dir, original_repo_dir, dirs_exist_ok=True) + self.repo_dir = original_repo_dir + + def initialize(self, should_exist=False): + """ + initialize the git repository or connect to an existing one + + :param should_exist: if True, raise an error if the repository does not exist + """ + if not self.repo_dir.exists(): + if should_exist: + raise RuntimeError(f"Repository {self.repo_dir} does not exist") + self.repo_dir.mkdir(parents=True) + try: + self.repo = Repo(self.repo_dir) + except InvalidGitRepositoryError: + if should_exist: + raise RuntimeError(f"Repository {self.repo_dir} does not exist") + self.repo = Repo.init( + self.repo_dir, initial_branch=self.DEFAULT_BRANCH_NAME + ) + except Exception as e: + logger.exception(e) + raise RuntimeError(f"Failed to initialize git repository") + + def checkout_main(self): + self.repo.git.checkout(self.DEFAULT_BRANCH_NAME) + + def clear_existing_files(self): + """ + clear any existing files in the working tree (tracked or untracked) besides .git + """ + for item in self.repo_dir.iterdir(): + if item.name != ".git": + if item.is_dir(): + shutil.rmtree(item) + else: + item.unlink() + self.repo.index.remove( + [str(item.relative_to(self.repo_dir))], + working_tree=True, + r=True, + ) + + def add_release_files(self, release): + """ + copy over submission package files for a release to the working tree of the git repo + starting from a clean directory by removing all files except .git/ + """ + release_fs_api: CodebaseReleaseFsApi = release.get_fs_api() + sip_storage = release_fs_api.get_sip_storage() + 
self.clear_existing_files() + # copy over files from the sip storage and add to the index + # FIXME: consider moving this copy all operation to the CodebaseReleaseStorage class + for file in sip_storage.list(absolute=True): + rel_path = file.relative_to(sip_storage.location) + dest_path = self.repo_dir / rel_path + dest_path.parent.mkdir(parents=True, exist_ok=True) + shutil.copy(file, dest_path) + self.repo.index.add([str(rel_path)]) + + def add_readme(self, release): + """ + add a readme file to the repository root. If one already exists somewhere, move it. + Otherwise, generate one from a template + """ + release_fs_api: CodebaseReleaseFsApi = release.get_fs_api() + sip_storage = release_fs_api.get_sip_storage() + readme_pattern = re.compile( + r"(?i)^readme(?:\.(?:markdown|mdown|mkdn|md|textile|rdoc|org|creole|mediawiki|wiki|rst|asciidoc|adoc|asc|pod|txt))?$" + ) + for file in sip_storage.list(absolute=True): + # check for an existing readme and duplicate it to the repo root + # for github to recognize. 
Otherwise, we'll generate one later + if readme_pattern.match(file.name): + shutil.copy(file, self.repo_dir / file.name) + self.repo.index.add([file.name]) + return + readme_content = f"# {self.codebase.title}\n\n{self.codebase.description.raw}\n" + self._add_single_file("README.md", readme_content) + + def _add_single_file(self, filename, content: str, overwrite=False): + dest_path = self.repo_dir / filename + if not dest_path.exists() or overwrite: + with dest_path.open("w") as f: + f.write(content) + self.repo.index.add([filename]) + + def commit_release(self, release, tag=True): + """ + commit the the release and tag it, should only be called after adding all necessary files + """ + # make sure the commit goes to main, then create the release branch later + # unless this is the first commit + if self.DEFAULT_BRANCH_NAME in self.repo.heads: + self.checkout_main() + commit_msg = ( + f"Release {release.version_number}\n\n{release.release_notes.raw}\n" + ) + for rc in release.coauthor_release_contributors: + contributor = rc.contributor + email = "" + # try to use the co-author's github account email, otherwise just leave it blank + if contributor.user and contributor.user.member_profile.github_username: + email = f"{contributor.user.member_profile.github_username}@users.noreply.github.com" + commit_msg += f"\nCo-authored-by: {contributor.name} <{email}>" + commit = self.repo.index.commit( + message=commit_msg, + committer=self.committer, + author=self.author, + author_date=release.last_published_on, + ) + if tag: + self.repo.create_tag(f"{release.version_number}") + return commit + + def create_release_branch(self, release, commit): + """ + create a new branch for the release + """ + release_branch_name = self.get_release_branch_name(release) + self.repo.create_head(release_branch_name, commit) + return release_branch_name + + def update_release_branch(self, release) -> Repo | None: + """ + update a release branch with new metadata, merging back into main 
(fast-forward) + if it is the latest release + + this ONLY updates metadata files and does not add + changes to the code, docs, etc. as it is assumed that any synced releases are published + and frozen + + returns None if no changes were made, otherwise returns the updated repo + """ + with self.use_temporary_repo(from_existing=True): + self.initialize(should_exist=True) + release_branch_name = self.get_release_branch_name(release) + # determine whether this is the latest release (i.e. points to the + # same thing as main) and should merge back into main + release_branch = self.repo.heads[release_branch_name] + main_branch = self.repo.heads[self.DEFAULT_BRANCH_NAME] + merge_into_main = (main_branch.commit == release_branch.commit) and ( + main_branch.commit == self.repo.head.commit + ) + + self.repo.git.checkout(release_branch_name) + self.add_release_files(release) + self.add_readme(release) + + # check for changes before committing + if not self.repo.is_dirty(): + return None + + commit_msg = f"Update metadata for release {release.version_number}" + self.repo.index.commit( + message=commit_msg, + committer=self.committer, + author=self.author, + author_date=timezone.now(), + ) + if merge_into_main: + self.checkout_main() + try: + self.repo.git.merge("--ff-only", release_branch_name) + except Exception as e: + logger.error( + f"Unexpected divergence when trying to merge {release_branch_name} into {self.DEFAULT_BRANCH_NAME}: {e}" + ) + self.checkout_main() + + return Repo(self.repo_dir) + + def append_releases(self, releases=None) -> Repo: + """ + add new releases to the git repository. 
+ releases must be newer/higher than the latest mirrored release so that they can be added on top + + this should only be used if no releases have been removed or otherwise modified since these require + rewriting history and this method strictly appends new releases + + :param releases: list of releases to append, if None, all unmirrored releases will be appended + """ + self.check_file_sizes(self.codebase) + if not releases: + releases = self.mirror.unmirrored_local_releases + if not releases: + # nothing to do, return the existing repo + return Repo(self.repo_dir) + with self.use_temporary_repo(from_existing=True): + # make sure the releases are higher than the latest mirrored release + if not all( + Version(release.version_number) + > Version(self.mirror.latest_local_release.version_number) + for release in releases + ): + raise ValueError( + "Releases must be higher than the latest mirrored release to append" + ) + # make sure the releases are ordered by version number + releases = sorted(releases, key=lambda r: Version(r.version_number)) + # append releases to the git repo by adding files, committing, and creating a branch + for release in releases: + self.add_release_files(release) + self.add_readme(release) + commit = self.commit_release(release) + self.create_release_branch(release, commit) + self.checkout_main() + # record newly mirrored releases and update timestamp + self.mirror.update_local_releases(releases) + return Repo(self.repo_dir) + + def build(self) -> Repo: + """ + builds or rebuilds the git repository from codebase releases + + this will create an entirely new repository and should only be used if we are creating the + mirror for the first time or need to rebuild the entire history + """ + self.check_file_sizes(self.codebase) + releases = self.codebase.ordered_releases_list() + if not releases: + raise ValidationError("Must have at least one public release to build from") + with self.use_temporary_repo(): + self.initialize() + for release in 
releases: + self.add_release_files(release) + self.add_readme(release) + commit = self.commit_release(release) + self.create_release_branch(release, commit) + self.checkout_main() + # record mirrored releases and update timestamp + self.mirror.update_local_releases(releases) + return Repo(self.repo_dir) + + def update_or_build(self) -> Repo: + if self.repo_dir.exists() and self.repo_dir.joinpath(".git").exists(): + return self.append_releases() + else: + return self.build() + + def dirs_equal(self, dir1: Path, dir2: Path, ignore=[".git"]): + """ + check if two directories are equal by recursively comparing their contents + excluding the files in the ignore list (default is just .git) + + this will likely go unused in favor of a more efficient method for checking if a + release mirror (commit) is up to date + """ + dir1 = Path(dir1) + dir2 = Path(dir2) + comparison = filecmp.dircmp(dir1, dir2, ignore=ignore) + if ( + comparison.left_only + or comparison.right_only + or comparison.diff_files + or comparison.funny_files + ): + return False + else: + for subdir in comparison.common_dirs: + if not self.dirs_equal(dir1 / subdir, dir2 / subdir): + return False + return True + + class ArchiveExtractor: def __init__(self, sip_storage: CodebaseReleaseSipStorage): self.sip_storage = sip_storage diff --git a/django/library/tests/base.py b/django/library/tests/base.py index 001478b69..2105fc4cd 100644 --- a/django/library/tests/base.py +++ b/django/library/tests/base.py @@ -1,5 +1,6 @@ import io import logging +from pathlib import Path import random from uuid import UUID @@ -19,6 +20,9 @@ ) from library.serializers import CodebaseSerializer +TEST_SAMPLES_DIR = Path("library/tests/samples") + + logger = logging.getLogger(__name__) @@ -179,7 +183,7 @@ class ReleaseSetup: PROGRAMMING_LANGUAGES = ["Python", "TypeScript"] @classmethod - def setUpPublishableDraftRelease(cls, codebase): + def setUpPublishableDraftRelease(cls, codebase, with_files=True): draft_release = 
codebase.create_release( status=CodebaseRelease.Status.DRAFT, initialize=True, @@ -196,15 +200,14 @@ def setUpPublishableDraftRelease(cls, codebase): release_contributor_factory = ReleaseContributorFactory(draft_release) contributor = contributor_factory.create() release_contributor_factory.create(contributor) - - code_file = io.BytesIO(b"print('hello world')") - code_file.name = "some_code_file.py" - docs_file = io.BytesIO(b"# Documentation") - docs_file.name = "some_doc_file.md" - fs_api = draft_release.get_fs_api() - fs_api.add(content=code_file, category=FileCategoryDirectories.code) - fs_api.add(content=docs_file, category=FileCategoryDirectories.docs) + if with_files: + code_file = io.BytesIO(b"print('hello world')") + code_file.name = "some_code_file.py" + docs_file = io.BytesIO(b"# Documentation") + docs_file.name = "some_doc_file.md" + fs_api = draft_release.get_fs_api() + fs_api.add(content=code_file, category=FileCategoryDirectories.code) + fs_api.add(content=docs_file, category=FileCategoryDirectories.docs) draft_release.save() - return draft_release diff --git a/django/library/tests/archives/.gitignore b/django/library/tests/samples/archives/.gitignore similarity index 100% rename from django/library/tests/archives/.gitignore rename to django/library/tests/samples/archives/.gitignore diff --git a/django/library/tests/archives/invalid.zip b/django/library/tests/samples/archives/invalid.zip similarity index 100% rename from django/library/tests/archives/invalid.zip rename to django/library/tests/samples/archives/invalid.zip diff --git a/django/library/tests/archives/nestedcode/.DS_Store b/django/library/tests/samples/archives/nestedcode/.DS_store similarity index 100% rename from django/library/tests/archives/nestedcode/.DS_Store rename to django/library/tests/samples/archives/nestedcode/.DS_store diff --git a/django/library/tests/archives/nestedcode/.svn/svn_files_here b/django/library/tests/samples/archives/nestedcode/.svn/svn_files_here similarity 
index 100% rename from django/library/tests/archives/nestedcode/.svn/svn_files_here rename to django/library/tests/samples/archives/nestedcode/.svn/svn_files_here diff --git a/django/library/tests/archives/nestedcode/README.md b/django/library/tests/samples/archives/nestedcode/README.md similarity index 100% rename from django/library/tests/archives/nestedcode/README.md rename to django/library/tests/samples/archives/nestedcode/README.md diff --git a/django/library/tests/archives/nestedcode/src/ex.py b/django/library/tests/samples/archives/nestedcode/src/ex.py similarity index 100% rename from django/library/tests/archives/nestedcode/src/ex.py rename to django/library/tests/samples/archives/nestedcode/src/ex.py diff --git a/django/library/tests/samples/releases/animals-model/1.0.0/code/model.py b/django/library/tests/samples/releases/animals-model/1.0.0/code/model.py new file mode 100644 index 000000000..8cde7829c --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/1.0.0/code/model.py @@ -0,0 +1 @@ +print("hello world") diff --git a/django/library/tests/samples/releases/animals-model/1.0.0/data/input.csv b/django/library/tests/samples/releases/animals-model/1.0.0/data/input.csv new file mode 100644 index 000000000..29aa17e68 --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/1.0.0/data/input.csv @@ -0,0 +1,2 @@ +horses,sheep +10,15 diff --git a/django/library/tests/samples/releases/animals-model/1.0.0/docs/README.txt b/django/library/tests/samples/releases/animals-model/1.0.0/docs/README.txt new file mode 100644 index 000000000..c9be3ff53 --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/1.0.0/docs/README.txt @@ -0,0 +1,3 @@ +instructions: + +python model.py diff --git a/django/library/tests/samples/releases/animals-model/1.0.0/results/analysis.txt b/django/library/tests/samples/releases/animals-model/1.0.0/results/analysis.txt new file mode 100644 index 000000000..29fee6381 --- /dev/null +++ 
b/django/library/tests/samples/releases/animals-model/1.0.0/results/analysis.txt @@ -0,0 +1 @@ +result tells us there are more sheep diff --git a/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/cow.py b/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/cow.py new file mode 100644 index 000000000..f4963717d --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/cow.py @@ -0,0 +1,3 @@ +class Cow: + def __init__(self, name): + self.name = name diff --git a/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/horse.py b/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/horse.py new file mode 100644 index 000000000..84790eaf9 --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/horse.py @@ -0,0 +1,3 @@ +class Horse: + def __init__(self, name): + self.name = name diff --git a/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/sheep.py b/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/sheep.py new file mode 100644 index 000000000..3c575356d --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/2.0.0/code/animals/sheep.py @@ -0,0 +1,3 @@ +class Sheep: + def __init__(self, name): + self.name = name diff --git a/django/library/tests/samples/releases/animals-model/2.0.0/code/model.py b/django/library/tests/samples/releases/animals-model/2.0.0/code/model.py new file mode 100644 index 000000000..8cde7829c --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/2.0.0/code/model.py @@ -0,0 +1 @@ +print("hello world") diff --git a/django/library/tests/samples/releases/animals-model/2.0.0/data/input.csv b/django/library/tests/samples/releases/animals-model/2.0.0/data/input.csv new file mode 100644 index 000000000..603cf0ed3 --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/2.0.0/data/input.csv @@ -0,0 +1,2 @@ +horses,sheep,cows 
+10,15,5 diff --git a/django/library/tests/samples/releases/animals-model/2.0.0/docs/README.md b/django/library/tests/samples/releases/animals-model/2.0.0/docs/README.md new file mode 100644 index 000000000..64258c3b0 --- /dev/null +++ b/django/library/tests/samples/releases/animals-model/2.0.0/docs/README.md @@ -0,0 +1,5 @@ +# instructions: + +``` +python model.py +``` \ No newline at end of file diff --git a/django/library/tests/test_fs.py b/django/library/tests/test_fs.py index 5bcd21f9a..d7923db8e 100644 --- a/django/library/tests/test_fs.py +++ b/django/library/tests/test_fs.py @@ -1,19 +1,24 @@ from pathlib import Path - +import os +from git import Repo from django.test import TestCase +from django.conf import settings from core.tests.base import ( UserFactory, destroy_test_shared_folders, initialize_test_shared_folders, + clear_test_shared_folder, ) from library.fs import ( FileCategoryDirectories, StagingDirectories, MessageLevels, import_archive, + CodebaseGitRepositoryApi, ) -from library.tests.base import CodebaseFactory +from library.tests.base import CodebaseFactory, TEST_SAMPLES_DIR +from library.models import License import logging @@ -26,7 +31,7 @@ def setUpModule(): class ArchiveExtractorTestCase(TestCase): - nested_code_folder = Path("library/tests/archives/nestedcode") + nested_code_folder = TEST_SAMPLES_DIR / "archives" / "nestedcode" def setUp(self): self.user_factory = UserFactory() @@ -70,7 +75,7 @@ def test_zipfile_saving(self): ) def test_invalid_zipfile_saving(self): - archive_name = "library/tests/archives/invalid.zip" + archive_name = str(TEST_SAMPLES_DIR / "archives" / "invalid.zip") fs_api = self.codebase_release.get_fs_api() with open(archive_name, "rb") as f: msgs = fs_api.add( @@ -86,5 +91,166 @@ def tearDownClass(cls): cls.nested_code_folder.with_suffix(".zip").unlink(missing_ok=True) +class GitRepoApiTestCase(TestCase): + model_dir = TEST_SAMPLES_DIR / "releases" / "animals-model" + release_1_dir = model_dir / "1.0.0" + 
release_2_dir = model_dir / "2.0.0" + + def setUp(self): + self.user_factory = UserFactory() + self.submitter = self.user_factory.create() + self.codebase_factory = CodebaseFactory(submitter=self.submitter) + self.codebase = self.codebase_factory.create() + self.release_1 = self.codebase.create_release() + self.git_mirror = self.codebase.create_git_mirror("animals-model") + + def tearDown(self): + clear_test_shared_folder(settings.REPOSITORY_ROOT) + + def test_repo_build(self): + update_release_from_sample( + self.release_1, self.release_1_dir, version_number="1.0.0" + ) + self.release_1.publish() + public_release_count = self.codebase.public_releases().count() + self.assertEqual(public_release_count, 1) + api = CodebaseGitRepositoryApi(self.codebase) + api.build() + # check that the mirror model is updated and the repo is built + self.assertIsNotNone(self.git_mirror.last_local_update) + self.assertEqual(self.git_mirror.local_releases.count(), 1) + self.assertTrue(os.path.exists(api.repo_dir)) + # check git stuff + repo = Repo(api.repo_dir) + self.assertFalse(repo.is_dirty()) + self.assertEqual(sum(1 for _ in repo.iter_commits()), public_release_count) + self.assertEqual(len(repo.tags), public_release_count) + # check contents + self.assertTrue(os.path.exists(api.repo_dir / "codemeta.json")) + self.assertTrue(os.path.exists(api.repo_dir / "CITATION.cff")) + self.assertTrue(os.path.exists(api.repo_dir / "LICENSE")) + fs_api = self.release_1.get_fs_api() + fs_api.list(StagingDirectories.sip, FileCategoryDirectories.code) + for category in ["code", "data", "docs"]: + self.assertTrue( + api.dirs_equal( + fs_api.sip_contents_dir / category, + api.repo_dir / category, + ) + ) + + def test_repo_append_releases(self): + update_release_from_sample( + self.release_1, self.release_1_dir, version_number="1.0.0" + ) + self.release_1.publish() + api = CodebaseGitRepositoryApi(self.codebase) + api.build() + self.release_2 = self.codebase.create_release() + 
update_release_from_sample( + self.release_2, self.release_2_dir, version_number="2.0.0" + ) + self.release_2.publish() + api.append_releases() + + self.assertEqual(self.git_mirror.local_releases.count(), 2) + # check git stuff + repo = Repo(api.repo_dir) + self.assertFalse(repo.is_dirty()) + public_release_count = self.codebase.public_releases().count() + self.assertEqual(sum(1 for _ in repo.iter_commits()), public_release_count) + self.assertEqual(len(repo.tags), public_release_count) + # check contents + self.assertTrue(os.path.exists(api.repo_dir / "codemeta.json")) + self.assertTrue(os.path.exists(api.repo_dir / "CITATION.cff")) + self.assertTrue(os.path.exists(api.repo_dir / "LICENSE")) + fs_api = self.release_2.get_fs_api() + fs_api.list(StagingDirectories.sip, FileCategoryDirectories.code) + for category in ["code", "data", "docs"]: + self.assertTrue( + api.dirs_equal( + fs_api.sip_contents_dir / category, + api.repo_dir / category, + ) + ) + + def test_will_not_append_lower_version(self): + # publish 2.0.0 first and build + update_release_from_sample( + self.release_1, self.release_1_dir, version_number="1.0.0" + ) + self.release_2 = self.codebase.create_release() + update_release_from_sample( + self.release_2, self.release_2_dir, version_number="2.0.0" + ) + self.release_2.publish() + api = CodebaseGitRepositoryApi(self.codebase) + api.build() + # now publish release 1.0.0 + self.release_1.publish() + self.assertRaises(ValueError, api.append_releases) + + def test_repo_rebuild(self): + update_release_from_sample( + self.release_1, self.release_1_dir, version_number="1.0.0" + ) + self.release_1.publish() + api = CodebaseGitRepositoryApi(self.codebase) + api.build() + self.release_2 = self.codebase.create_release() + update_release_from_sample( + self.release_2, self.release_2_dir, version_number="2.0.0" + ) + self.release_2.publish() + api.build() + + self.assertEqual(self.git_mirror.local_releases.count(), 2) + # check git stuff + repo = 
Repo(api.repo_dir) + self.assertFalse(repo.is_dirty()) + public_release_count = self.codebase.public_releases().count() + self.assertEqual(sum(1 for _ in repo.iter_commits()), public_release_count) + self.assertEqual(len(repo.tags), public_release_count) + # check contents + self.assertTrue(os.path.exists(api.repo_dir / "codemeta.json")) + self.assertTrue(os.path.exists(api.repo_dir / "CITATION.cff")) + self.assertTrue(os.path.exists(api.repo_dir / "LICENSE")) + fs_api = self.release_2.get_fs_api() + fs_api.list(StagingDirectories.sip, FileCategoryDirectories.code) + for category in ["code", "data", "docs"]: + self.assertTrue( + api.dirs_equal( + fs_api.sip_contents_dir / category, + api.repo_dir / category, + ) + ) + + def tearDownModule(): destroy_test_shared_folders() + + +# helpers ================================================ + + +def upload_category(fs_api, release_dir: Path, category: str): + category_path = release_dir / category + for filepath in category_path.rglob("*"): + if filepath.is_file(): + with filepath.open("rb") as f: + relpath = filepath.relative_to(category_path) + file_name = str(relpath) + fs_api.add(FileCategoryDirectories[category], content=f, name=file_name) + + +def update_release_from_sample(release, sample_dir, version_number): + release.os = "Linux" + release.programming_languages.add("Python") + release.license = License.objects.create(name="MIT") + release.release_notes = "Initial release" + release.version_number = version_number + release.save() + fs_api = release.get_fs_api() + for category in ["code", "data", "docs"]: + upload_category(fs_api, sample_dir, category) + return release diff --git a/django/requirements.txt b/django/requirements.txt index 09d4f6690..634a23812 100644 --- a/django/requirements.txt +++ b/django/requirements.txt @@ -27,6 +27,7 @@ Django==5.2.11 drf-spectacular==0.29.0 elasticsearch-dsl>=7.0.0,<8.0.0 elasticsearch>=7.0.0,<8.0.0 +gitpython==3.1.43 html2text>=2016.9.19 huey==2.6.0 jinja2==3.1.6 @@ -37,6 
+38,7 @@ numpy==1.26.4 pandas==2.2.2 psycopg2-binary==2.9.11 pyjwt[crypto]>=2.4.0,<3.0.0 +PyGithub==2.5.0 pytz==2025.2 pyyaml>=6.0.1 # used for institution -> affiliation data migration From 679a1ab56a268b01106cc3f07835382c4361ce5d Mon Sep 17 00:00:00 2001 From: sgfost Date: Thu, 9 Jan 2025 14:33:50 -0700 Subject: [PATCH 03/66] feat: add github integration api and mirroring tasks the GithubApi provides access to auth and repository actions adds 3 huey (async) tasks for creating a mirror, updating a mirror, and updating metadata for a single release of a mirror --- Makefile | 2 +- base.yml | 6 + deploy/conf/.env.template | 6 + django/core/settings/defaults.py | 17 ++ django/library/github_integration.py | 253 +++++++++++++++++++++++++++ django/library/tasks.py | 94 ++++++++++ 6 files changed, 377 insertions(+), 1 deletion(-) create mode 100644 django/library/github_integration.py diff --git a/Makefile b/Makefile index 0aec8dc5f..131fce276 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ SECRETS_DIR=${BUILD_DIR}/secrets DB_PASSWORD_PATH=${SECRETS_DIR}/db_password PGPASS_PATH=${SECRETS_DIR}/.pgpass SECRET_KEY_PATH=${SECRETS_DIR}/django_secret_key -EXT_SECRETS=hcaptcha_secret github_client_secret orcid_client_secret discourse_api_key discourse_sso_secret mail_api_key datacite_api_password youtube_api_key +EXT_SECRETS=hcaptcha_secret github_client_secret orcid_client_secret discourse_api_key discourse_sso_secret mail_api_key datacite_api_password youtube_api_key github_integration_app_private_key github_integration_app_client_secret GENERATED_SECRETS=$(DB_PASSWORD_PATH) $(PGPASS_PATH) $(SECRET_KEY_PATH) ENVREPLACE := deploy/scripts/envreplace diff --git a/base.yml b/base.yml index 616c2828f..3d5a769e0 100644 --- a/base.yml +++ b/base.yml @@ -67,6 +67,8 @@ services: - django_secret_key - github_client_secret - orcid_client_secret + - github_integration_app_private_key + - github_integration_app_client_secret - hcaptcha_secret - mail_api_key - youtube_api_key @@ -99,6 
+101,10 @@ secrets: file: ./build/secrets/django_secret_key github_client_secret: file: ./build/secrets/github_client_secret + github_integration_app_private_key: + file: ./build/secrets/github_integration_app_private_key + github_integration_app_client_secret: + file: ./build/secrets/github_integration_app_client_secret hcaptcha_secret: file: ./build/secrets/hcaptcha_secret mail_api_key: diff --git a/deploy/conf/.env.template b/deploy/conf/.env.template index 8f65f1272..93fffdb85 100644 --- a/deploy/conf/.env.template +++ b/deploy/conf/.env.template @@ -45,6 +45,12 @@ DATACITE_DRY_RUN="true" # allowed values: "true" or "false" # youtube api settings YOUTUBE_CHANNEL_ID= +# github integration app +GITHUB_INTEGRATION_APP_ID= +GITHUB_INTEGRATION_APP_INSTALLATION_ID= +GITHUB_INTEGRATION_APP_CLIENT_ID= +GITHUB_MODEL_LIBRARY_ORG_NAME= + # test TEST_USER_ID=10000000 TEST_USERNAME=__test_user__ diff --git a/django/core/settings/defaults.py b/django/core/settings/defaults.py index b69678a34..ce4e252df 100644 --- a/django/core/settings/defaults.py +++ b/django/core/settings/defaults.py @@ -553,6 +553,23 @@ def set_environment(env: Environment): GITHUB_CLIENT_ID = os.getenv("GITHUB_CLIENT_ID", "") GITHUB_CLIENT_SECRET = read_secret("github_client_secret") +GITHUB_INTEGRATION_APP_ID = int(os.getenv("GITHUB_INTEGRATION_APP_ID") or 0) +GITHUB_INTEGRATION_APP_PRIVATE_KEY = read_secret("github_integration_app_private_key") +GITHUB_INTEGRATION_APP_INSTALLATION_ID = int( + os.getenv("GITHUB_INTEGRATION_APP_INSTALLATION_ID") or 0 +) +# client id and secret are only used for getting user access tokens to be able to push +# to the user's repositories. 
We are not re-using the regular oauth app in order to +# keep minimal permissions +GITHUB_INTEGRATION_APP_CLIENT_ID = os.getenv("GITHUB_INTEGRATION_APP_CLIENT_ID", "") +GITHUB_INTEGRATION_APP_CLIENT_SECRET = read_secret( + "github_integration_app_client_secret" +) +GITHUB_MODEL_LIBRARY_ORG_NAME = os.getenv("GITHUB_MODEL_LIBRARY_ORG_NAME", "") +GITHUB_INDIVIDUAL_FILE_SIZE_LIMIT = int( + os.getenv("GITHUB_INDIVIDUAL_FILE_SIZE_LIMIT", 100 * 1024 * 1024) +) + TEST_BASIC_AUTH_PASSWORD = os.getenv("TEST_BASIC_AUTH_PASSWORD", "test password") TEST_USER_ID = os.getenv("TEST_USER_ID", 1000000) TEST_USERNAME = os.getenv("TEST_USERNAME", "__test_user__") diff --git a/django/library/github_integration.py b/django/library/github_integration.py new file mode 100644 index 000000000..722d8a6b6 --- /dev/null +++ b/django/library/github_integration.py @@ -0,0 +1,253 @@ +import re +from github import GithubIntegration, Auth, Github +from github.GithubException import UnknownObjectException +from github.Repository import Repository as GithubRepo +from git import Repo as GitRepo +from django.conf import settings +from django.core.cache import cache +from django.utils import timezone + +from .models import Codebase + +INSTALLATION_ACCESS_TOKEN_REDIS_KEY = "github_installation_access_token" + + +class GithubRepoNameValidator: + @classmethod + def validate( + cls, + repo_name: str, + username: str | None = None, + user_access_token: str | None = None, + ): + cls._validate_format(repo_name) + if username and user_access_token: + cls._check_user_repo_name_unused(repo_name, username, user_access_token) + elif username: + raise ValueError("User access token required for user repository") + else: + cls._check_org_repo_name_unused(repo_name) + + @staticmethod + def _validate_format(repo_name: str): + if not re.fullmatch(r"[A-Za-z0-9_.-]+", repo_name): + raise ValueError( + "The repository name can only contain ASCII letters, digits, and the characters ., -, and _" + ) + if not (1 <= len(repo_name) <= 100): 
+ raise ValueError("Repository name is too long (maximum is 100 characters)") + if repo_name.endswith(".git"): + raise ValueError("Repository name cannot end with '.git'") + if "github" in repo_name: + raise ValueError("Repository name cannot contain 'github'") + + @staticmethod + def _check_user_repo_name_unused( + repo_name: str, username: str, user_access_token: str + ): + if username in repo_name: + raise ValueError( + f"Repository name cannot contain your username: '{username}'" + ) + github = Github(user_access_token) + try: + github.get_user(username).get_repo(repo_name) + raise ValueError( + f"Repository name already exists at https://github.com/{username}/{repo_name}" + ) + except UnknownObjectException: + return True + + @staticmethod + def _check_org_repo_name_unused(repo_name: str): + if settings.GITHUB_MODEL_LIBRARY_ORG_NAME in repo_name: + raise ValueError( + f"Repository name cannot contain the organization name: '{settings.GITHUB_MODEL_LIBRARY_ORG_NAME}'" + ) + github = Github(GithubApi.get_installation_access_token()) + try: + github.get_organization(settings.GITHUB_MODEL_LIBRARY_ORG_NAME).get_repo( + repo_name + ) + raise ValueError( + f"Repository name already exists at https://github.com/{settings.GITHUB_MODEL_LIBRARY_ORG_NAME}/{repo_name}" + ) + except UnknownObjectException: + return True + + +class GithubApi: + """Functionality for interacting with a remote Github repository + and Github API + """ + + def __init__( + self, + codebase: Codebase, + local_repo: GitRepo, + repo_name: str, + is_user_repo=False, + organization_login: str | None = None, + user_access_token: str | None = None, + private_repo=False, + ): + if is_user_repo: + raise NotImplementedError("User repositories not yet supported") + self.private_repo = private_repo + self.codebase = codebase + self.local_repo = local_repo + self.repo_name = repo_name + self.is_user_repo = is_user_repo + if is_user_repo and not user_access_token: + raise ValueError("User access token required 
for user repository") + if not is_user_repo and not organization_login: + raise ValueError("Organization login required for org repository") + self.organization_login = organization_login + self.user_access_token = user_access_token + self._github_repo = None + + @property + def github_repo(self) -> GithubRepo: + if not self._github_repo: + try: + self._github_repo = self._get_existing_repo() + except Exception as e: + raise ValueError("Github repository not created yet") from e + return self._github_repo + + @property + def installation_access_token(self): + return self.get_installation_access_token() + + @classmethod + def get_installation_access_token(cls): + cached_token = cache.get(INSTALLATION_ACCESS_TOKEN_REDIS_KEY) + if cached_token: + return cached_token + return cls.refresh_installation_access_token() + + @staticmethod + def refresh_installation_access_token(): + """retrieve a new installation access token for the Github app + and cache it for future use + """ + auth = Auth.AppAuth( + settings.GITHUB_INTEGRATION_APP_ID, + settings.GITHUB_INTEGRATION_APP_PRIVATE_KEY, + ) + integration = GithubIntegration(auth=auth) + installation_auth = integration.get_access_token( + settings.GITHUB_INTEGRATION_APP_INSTALLATION_ID + ) + token = installation_auth.token + seconds_until_expiration = ( + installation_auth.expires_at - timezone.now() + ).total_seconds() + # cache the token for 1 minute less than the expiration time + cache.set( + INSTALLATION_ACCESS_TOKEN_REDIS_KEY, + token, + seconds_until_expiration - 60, + ) + return token + + @staticmethod + def get_user_access_token(code: str): + # just need to link to the app install and it will go to callback with ?code=... 
+ """return an access token for the Github user + + this token is used to authenticate requests to the Github API + to act on behalf of the user on resources they own + """ + github = Github() + app = github.get_oauth_application( + settings.GITHUB_INTEGRATION_APP_CLIENT_ID, + settings.GITHUB_INTEGRATION_APP_CLIENT_SECRET, + ) + return app.get_access_token(code).token + + def get_or_create_repo(self) -> GithubRepo: + """get or create the Github repository for a user or organization""" + try: + return self.github_repo + except ValueError: + if self.is_user_repo: + self._github_repo = self._create_user_repo() + else: + self._github_repo = self._create_org_repo() + return self._github_repo + + def push(self, local_repo: GitRepo): + """push the local git repository to the Github repository""" + if self.is_user_repo: + raise NotImplementedError("User repositories not yet supported") + else: + token = self.installation_access_token + push_url = f"https://x-access-token:{token}@github.com/{self.github_repo.full_name}.git" + self._push_to_url(local_repo, push_url) + + def create_releases(self, local_repo: GitRepo): + """create Github releases for each tag in the local repository that + does not already have a corresponding release in the remote repository""" + for tag in local_repo.tags: + try: + existing_release = self.github_repo.get_release(tag.name) + except UnknownObjectException: + existing_release = None + if not existing_release: + self.github_repo.create_git_release( + tag.name, + name=tag.name, + message=tag.commit.message, + draft=False, + prerelease=False, + ) + + def _get_existing_repo(self): + """attempt to get an existing repository for the authenticated user or organization""" + if self.is_user_repo: + github = Github(self.user_access_token) + name = github.get_user().login + return github.get_repo(f"{name}/{self.repo_name}") + else: + github = Github(self.installation_access_token) + return github.get_repo(f"{self.organization_login}/{self.repo_name}") + + def _create_user_repo(self): + 
"""create a new repository in the user's account + + this function requires the `repo` scope for the user access token + """ + token = self.user_access_token + if not token: + raise ValueError("User access token required for creating user repository") + github = Github(token) + repo = github.get_user().create_repo( + name=self.repo_name, + description=self.codebase.description, + private=self.private_repo, + ) + return repo + + def _create_org_repo(self): + """create a new repository in the CoMSES model library organization + + this function requires the `repo` scope for the installation access token + """ + token = self.installation_access_token + github = Github(token) + org = github.get_organization(settings.GITHUB_MODEL_LIBRARY_ORG_NAME) + repo = org.create_repo( + name=self.repo_name, + description=f"Mirror of {self.codebase.permanent_url}", + private=self.private_repo, + ) + return repo + + def _push_to_url(self, local_repo: GitRepo, push_url: str): + if "origin" not in local_repo.remotes: + local_repo.create_remote("origin", push_url) + else: + local_repo.remotes["origin"].set_url(push_url) + local_repo.git.push("--all") + local_repo.git.push("--tags") diff --git a/django/library/tasks.py b/django/library/tasks.py index 0cbc93804..31e7073d5 100644 --- a/django/library/tasks.py +++ b/django/library/tasks.py @@ -1,17 +1,111 @@ from huey.contrib.djhuey import db_task, on_commit_task +from django.conf import settings + +from .models import Codebase, CodebaseRelease +from .github_integration import GithubApi +from .fs import CodebaseGitRepositoryApi import logging logger = logging.getLogger(__name__) +@db_task(retries=3, retry_delay=30) +def mirror_codebase(codebase_id: int, private_repo=False): + """asynchronous task that mirrors a codebase to a remote Github repository""" + codebase = Codebase.objects.get(id=codebase_id) + mirror = codebase.git_mirror + if not mirror: + raise ValueError("Codebase does not have a git mirror") + mirror.organization_login = 
settings.GITHUB_MODEL_LIBRARY_ORG_NAME + mirror.save() + + git_fs_api = CodebaseGitRepositoryApi(codebase) + local_repo = git_fs_api.update_or_build() + + gh_api = GithubApi( + codebase=codebase, + local_repo=local_repo, + repo_name=mirror.repository_name, + is_user_repo=False, + organization_login=mirror.organization_login, + user_access_token=mirror.user_access_token, + private_repo=private_repo, + ) + repo = gh_api.get_or_create_repo() + mirror.remote_url = repo.html_url + gh_api.push(local_repo) + gh_api.create_releases(local_repo) + mirror.update_remote_releases() + + +@db_task(retries=3, retry_delay=30) +def update_mirrored_codebase(codebase_id: int): + """asynchronous task that updates a mirrored codebase by pushing new releases to Github""" + codebase = Codebase.objects.get(id=codebase_id) + mirror = codebase.git_mirror + if not mirror: + raise ValueError("Codebase does not have a git mirror") + if not mirror.remote_url: + raise ValueError("Codebase git mirror does not have a remote url") + + git_fs_api = CodebaseGitRepositoryApi(codebase) + local_repo = git_fs_api.append_releases() + gh_api = GithubApi( + codebase=codebase, + local_repo=local_repo, + repo_name=mirror.repository_name, + is_user_repo=bool(mirror.user_access_token), + organization_login=mirror.organization_login, + user_access_token=mirror.user_access_token, + ) + gh_api.push(local_repo) + gh_api.create_releases(local_repo) + mirror.update_remote_releases() + + +@db_task(retries=3, retry_delay=30) +def update_mirrored_release_metadata(release_id: int): + """asynchronous task that updates a SINGLE RELEASE BRANCH with any metadata changes + that may have occurred. 
+ + This should be called when release metadata has been changed + """ + release = CodebaseRelease.objects.get(id=release_id) + codebase = release.codebase + mirror = codebase.git_mirror + if not mirror: + raise ValueError("Codebase does not have a git mirror") + if not mirror.remote_url: + raise ValueError("Codebase git mirror does not have a remote url") + + git_fs_api = CodebaseGitRepositoryApi(codebase) + local_repo = git_fs_api.update_release_branch(release) + if local_repo: + gh_api = GithubApi( + codebase=codebase, + local_repo=local_repo, + repo_name=mirror.repository_name, + is_user_repo=bool(mirror.user_access_token), + organization_login=mirror.organization_login, + user_access_token=mirror.user_access_token, + ) + gh_api.push(local_repo) + mirror.update_remote_releases() + + @db_task(retries=1, retry_delay=30) def update_fs_release_metadata(release_id: int): from .models import CodebaseRelease release = CodebaseRelease.objects.get(id=release_id) + codebase = release.codebase fs_api = release.get_fs_api() fs_api.rebuild(metadata_only=True) + # if the release is published and the codebase has a git mirror, + # update the metadata in the git repository + if release.is_published and codebase.git_mirror and codebase.git_mirror.remote_url: + update_mirrored_release_metadata(release_id) @on_commit_task() From 7d053f7c1dfbadd45e9b3e80e21ad87bcf49b15d Mon Sep 17 00:00:00 2001 From: sgfost Date: Thu, 9 Jan 2025 14:39:27 -0700 Subject: [PATCH 04/66] feat: github integration ui and mirroring feature * /github page to describe the integration features * sidebar element on release detail page will show information about integration status for that codebase, and allow users with edit permissions to create a new mirror --- .../library/codebases/releases/retrieve.jinja | 56 +++++ .../library/github-integration-overview.jinja | 234 ++++++++++++++++++ django/library/metadata.py | 7 +- django/library/urls.py | 10 + django/library/views.py | 45 +++- 
frontend/src/apps/github_mirror.ts | 8 + frontend/src/components/GithubMirrorModal.vue | 93 +++++++ .../src/components/form/HoneypotField.vue | 2 +- frontend/src/composables/api/codebase.ts | 5 + frontend/src/scss/_global.scss | 32 +++ 10 files changed, 489 insertions(+), 3 deletions(-) create mode 100644 django/library/jinja2/library/github-integration-overview.jinja create mode 100644 frontend/src/apps/github_mirror.ts create mode 100644 frontend/src/components/GithubMirrorModal.vue diff --git a/django/library/jinja2/library/codebases/releases/retrieve.jinja b/django/library/jinja2/library/codebases/releases/retrieve.jinja index 5922f78e5..d50e518ea 100644 --- a/django/library/jinja2/library/codebases/releases/retrieve.jinja +++ b/django/library/jinja2/library/codebases/releases/retrieve.jinja @@ -289,6 +289,61 @@ {% endif %} {% endwith %} +{% if codebase.git_mirror or has_change_perm %} + +{% endif %} +