diff --git a/README.md b/README.md
index ad60a54..292b474 100644
--- a/README.md
+++ b/README.md
@@ -64,6 +64,99 @@
 cd envs/dev
 terraform apply
 ```
+## GCP Deployment (dev / staging / prod)
+
+This repository contains Terraform configurations for deploying Mega to GCP under:
+
+- `deployment/envs/gcp/dev`
+- `deployment/envs/gcp/staging`
+- `deployment/envs/gcp/prod`
+
+Each environment directory contains:
+
+- `main.tf`
+- `variables.tf`
+- `providers.tf`
+- `versions.tf`
+- `terraform.tfvars.example`
+
+### Prerequisites
+
+- Install `gcloud` and authenticate
+- Ensure you have permissions to create: VPC, GKE, Cloud SQL, Memorystore, Filestore, Artifact Registry, Cloud Logging/Monitoring
+
+Recommended API enablement:
+
+```bash
+gcloud services enable \
+  container.googleapis.com \
+  artifactregistry.googleapis.com \
+  sqladmin.googleapis.com \
+  servicenetworking.googleapis.com \
+  redis.googleapis.com \
+  file.googleapis.com \
+  logging.googleapis.com \
+  monitoring.googleapis.com
+```
+
+### Configure variables
+
+Copy the example file and edit values.
+
+```bash
+cd deployment/envs/gcp/dev
+cp terraform.tfvars.example terraform.tfvars
+```
+
+Sensitive values should be provided via environment variables when using CI/CD:
+
+```bash
+export TF_VAR_db_username="mega_user"
+export TF_VAR_db_password="your-db-password"
+export TF_VAR_rails_master_key="your-rails-master-key"
+```
+
+### Apply
+
+```bash
+terraform init
+terraform plan
+terraform apply
+```
+
+### Get GKE credentials
+
+After the cluster is created:
+
+```bash
+gcloud container clusters get-credentials mega-gke --region us-central1 --project YOUR_PROJECT_ID
+```
+
+### Verify logging & monitoring
+
+- GKE cluster is configured with Cloud Logging and Cloud Monitoring.
+- You can validate by checking Cloud Console:
+  - Logging: Logs Explorer (resource type `k8s_container`)
+  - Monitoring: Kubernetes Engine dashboards
+
+### E2E validation (GKE / Orion Worker)
+
+See `deployment/gcp/e2e/README.e2e.md`.
+
+Example:
+
+```bash
+kubectl apply -f deployment/gcp/e2e/connectivity-check-job.yaml
+kubectl -n orion-worker wait --for=condition=complete job/orion-worker-connectivity-check --timeout=120s
+kubectl -n orion-worker logs job/orion-worker-connectivity-check
+```
+
+### Destroy / rollback
+
+```bash
+terraform destroy
+```
+
 ## Inspect state
 
 When you applied your configuration, Terraform wrote data about your infrastructure into a file called `terraform.tfstate`. Terraform stores data about your infrastructure in its state file, which it uses to manage resources over their lifecycle.
diff --git a/envs/gcp/README.md b/envs/gcp/README.md
new file mode 100755
index 0000000..f5425db
--- /dev/null
+++ b/envs/gcp/README.md
@@ -0,0 +1,322 @@
+# Mega GCP Deployment
+
+This directory contains Terraform configurations for deploying Mega on Google Cloud Platform (GCP).
+It mirrors the structure of the AWS deployment while adapting to GCP-native resources and conventions.
+
+## Directory Structure
+
+```
+deployment/envs/gcp/
+├── dev/ # Development environment
+├── staging/ # Staging environment
+├── prod/ # Production environment
+└── README.md # This file
+```
+
+Each environment directory contains:
+
+- `main.tf` – Main Terraform configuration
+- `variables.tf` – Variable definitions
+- `terraform.tfvars.example` – Example variable values
+- `providers.tf` – GCP provider configuration
+- `versions.tf` – Terraform and provider versions
+
+## Prerequisites
+
+### Required Tools
+- Terraform (>= 1.0)
+- gcloud CLI
+- kubectl (for post-deployment validation)
+
+### Required Permissions
+Your GCP account or service account must be able to create:
+- VPC / Subnets / Firewall rules
+- GKE clusters and node pools
+- Cloud SQL (PostgreSQL / MySQL)
+- Memorystore (Redis)
+- Filestore (NFS)
+- Artifact Registry
+- Cloud Logging / Monitoring
+- IAM service accounts and bindings
+
+### Required APIs
+Enable the following APIs in your project:
+
+```bash
+gcloud services enable \
+  container.googleapis.com \
+  artifactregistry.googleapis.com \
+  sqladmin.googleapis.com \
+  servicenetworking.googleapis.com \
+  redis.googleapis.com \
+  file.googleapis.com \
+  logging.googleapis.com \
+  monitoring.googleapis.com \
+  cloudresourcemanager.googleapis.com \
+  serviceusage.googleapis.com \
+  iam.googleapis.com
+```
+
+## Quick Start
+
+### 1) Clone and Prepare
+
+```bash
+git clone <repository-url>
+cd <repository>/deployment/envs/gcp/dev
+cp terraform.tfvars.example terraform.tfvars
+```
+
+Edit `terraform.tfvars` with your values:
+
+```hcl
+project_id = "your-gcp-project-id"
+base_domain = "dev.gitmono.com"
+
+# Optional: override defaults
+# region = "us-central1"
+# zone = "us-central1-b"
+
+# Storage (mapped from AWS s3_*)
+storage_bucket = "mega-dev-storage"
+
+# Database
+db_username = "mega_user"
+db_password = "your-db-password"
+db_schema = "mega_dev"
+
+# Rails/UI
+rails_master_key = "your-rails-master-key"
+rails_env = "development"
+ui_env = "dev"
+
+# Application
+app_suffix = "dev" +app_service_name = "mega-app" +app_image = "us-central1-docker.pkg.dev/your-gcp-project-id/orion-worker/mega:latest" +app_container_port = 80 +app_replicas = 1 + +# Ingress +ingress_name = "mega-ingress" +ingress_static_ip_name = "mega-dev-ip" +ingress_managed_certificate_domains = ["dev.gitmono.com"] +ingress_rules = [ + { + host = "dev.gitmono.com" + service_name = "mega-app" + service_port = 80 + } +] + +# Service Accounts (optional) +iam_service_accounts = { + mega-app = { + display_name = "Mega App Service Account" + roles = ["roles/cloudsql.client", "roles/storage.objectViewer"] + wi_bindings = [ + { + namespace = "default" + k8s_service_account_name = "mega-app-sa" + } + ] + } +} + +# Feature flags +enable_build_env = true +enable_gcs = false +enable_cloud_sql = false +enable_redis = false +enable_filestore = false +enable_apps = false +enable_ingress = false +enable_logging = true +enable_monitoring = true +enable_alerts = false + +# Orion Worker (optional) +enable_orion_worker = false +# orion_worker_image = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" +# orion_worker_server_ws = "wss://orion.gitmono.com/ws" +# orion_worker_nodepool_name = "build-default" +``` + +### 2) Initialize and Deploy + +```bash +terraform init +terraform plan +terraform apply +``` + +### 3) Get GKE Credentials + +```bash +gcloud container clusters get-credentials mega-gke --region us-central1 --project YOUR_PROJECT_ID +``` + +### 4) Verify Deployment + +#### Basic Resource Validation + +```bash +bash ../../gcp/e2e/minimal-validation.sh dev +``` + +#### Orion Worker E2E (if enabled) + +```bash +kubectl -n orion-worker get ds/orion-worker +kubectl -n orion-worker get pods -l app=orion-worker -o wide + +# Connectivity check +kubectl apply -f ../../gcp/e2e/connectivity-check-job.yaml +kubectl -n orion-worker wait --for=condition=complete job/orion-worker-connectivity-check --timeout=120s +kubectl -n orion-worker logs 
job/orion-worker-connectivity-check + +# Task execution test +kubectl apply -f ../../gcp/e2e/task-e2e-trigger-job.yaml +kubectl -n orion-worker logs -f job/orion-task-e2e-trigger +``` + +## Architecture Overview + +### Core Components + +| Component | GCP Resource | AWS Equivalent | +|-----------|--------------|----------------| +| VPC / Subnets | `google_compute_network` / `google_compute_subnetwork` | VPC / Subnets | +| NAT / Router | `google_compute_router_nat` / `google_compute_router` | NAT Gateway | +| Firewall | `google_compute_firewall` | Security Groups | +| Container Runtime | GKE (`google_container_cluster`) | ECS / Fargate | +| Load Balancer | GKE Ingress / GCLB | ALB | +| Object Storage | GCS (`google_storage_bucket`) | S3 | +| File Storage | Filestore (`google_filestore_instance`) | EFS | +| Relational DB | Cloud SQL (`google_sql_database_instance`) | RDS | +| Cache | Memorystore (`google_redis_instance`) | ElastiCache | +| Container Registry | Artifact Registry (`google_artifact_registry_repository`) | ECR | +| Logging / Monitoring | Cloud Logging / Monitoring | CloudWatch | +| IAM | Service Accounts / Workload Identity | IAM Roles / Policies | + +### Build Execution Environment (#1841) + +- **Orion Worker**: Deployed as a DaemonSet on a dedicated node pool (`taint: dedicated=orion-build`) +- **Node Pool**: `n2-standard-8` with `dedicated=orion-build:NoSchedule` +- **Storage**: HostPath volumes for `/data` and `/workspace` +- **Connectivity**: Outbound internet via Cloud NAT (public nodes) or Private Service Connect (private nodes) + +## Variables Reference + +### Required Variables + +| Name | Description | Example | +|------|-------------|---------| +| `project_id` | GCP project ID | `infra-20250121-20260121-0235` | +| `base_domain` | Base domain for services | `dev.gitmono.com` | + +### Optional Variables + +| Name | Description | Default | +|------|-------------|---------| +| `region` | GCP region | `us-central1` | +| `zone` | GCP zone for zonal 
resources | `""` | +| `name_prefix` | Prefix for resource names | `mega` | +| `enable_build_env` | Enable GKE and build environment | `true` | +| `enable_gcs` | Enable GCS bucket | `false` | +| `enable_cloud_sql` | Enable Cloud SQL | `false` | +| `enable_redis` | Enable Memorystore Redis | `false` | +| `enable_filestore` | Enable Filestore | `false` | +| `enable_apps` | Enable application services | `false` | +| `enable_ingress` | Enable Ingress controller | `false` | +| `enable_orion_worker` | Enable Orion Worker DaemonSet | `false` | + +## Outputs + +| Name | Description | +|------|-------------| +| `gke_cluster_name` | GKE cluster name | +| `gke_cluster_location` | GKE cluster location | +| `artifact_registry_repo` | Artifact Registry repository | +| `pg_endpoint` | Cloud SQL database endpoint (if enabled) | +| `valkey_endpoint` | Redis endpoint (if enabled) | +| `alb_dns_name` | Ingress IP/hostname (if enabled) | +| `project_id` | GCP project ID | + +## Environment Differences + +| Variable | Dev | Staging | Prod | +|----------|-----|---------|------| +| `name_prefix` | `mega` | `mega-staging` | `mega-prod` | +| `subnet_cidr` | `10.20.0.0/16` | `10.30.0.0/16` | `10.40.0.0/16` | +| `pods_secondary_range` | `10.21.0.0/16` | `10.31.0.0/16` | `10.41.0.0/16` | +| `services_secondary_range` | `10.22.0.0/16` | `10.32.0.0/16` | `10.42.0.0/16` | +| `cluster_name` | `mega-gke` | `mega-staging` | `mega-prod` | +| `node_machine_type` | `n2-standard-8` | `e2-standard-4` | `e2-standard-8` | +| `node_min_count` | `0` | `1` | `2` | +| `node_max_count` | `10` | `5` | `20` | +| `cloud_sql_availability_type` | `ZONAL` | `ZONAL` | `REGIONAL` | +| `cloud_sql_deletion_protection` | `false` | `false` | `true` | +| `redis_memory_size_gb` | `1` | `2` | `4` | +| `app_replicas` | `1` | `2` | `3` | +| `enable_alerts` | `false` | `false` | `true` | + +## Best Practices + +### Security +- Use Workload Identity instead of service account keys +- Enable private endpoints for databases in 
production +- Apply least privilege IAM roles +- Do not commit `terraform.tfvars` with real credentials + +### Cost Management +- Use smaller instance types in dev/staging +- Enable deletion protection only in production +- Set appropriate autoscaling limits +- Clean up resources when not in use + +### State Management +- Use remote state storage (GCS backend) +- Lock state to prevent concurrent modifications +- Consider state isolation per environment + +## Troubleshooting + +### Common Issues + +1. **API not enabled**: Ensure all required APIs are enabled in your project +2. **Permission denied**: Check IAM permissions for the service account +3. **Quota exceeded**: Request quota increases for resources like CPUs or IP addresses +4. **Resource conflicts**: Check for naming conflicts with existing resources +5. **Terraform state issues**: Use `terraform state list` and `terraform state rm` if needed + +### Cleanup + +```bash +terraform destroy +``` + +### Validation Scripts + +- `../../gcp/e2e/minimal-validation.sh` – Basic resource validation +- `../../gcp/e2e/connectivity-check-job.yaml` – Network connectivity test +- `../../gcp/e2e/task-e2e-trigger-job.yaml` – End-to-end task execution test + +## Contributing + +When contributing to this deployment: + +1. Follow the existing module structure +2. Use consistent naming conventions +3. Update documentation for new features +4. Test changes in a non-production environment first +5. Ensure all Terraform code is formatted (`terraform fmt`) +6. 
Validate configuration (`terraform validate`) + +## References + +- [GCP Terraform Provider Documentation](https://registry.terraform.io/providers/hashicorp/google/latest/docs) +- [GKE Documentation](https://cloud.google.com/kubernetes-engine/docs) +- [Cloud SQL Documentation](https://cloud.google.com/sql/docs) +- [Memorystore Documentation](https://cloud.google.com/memorystore/docs) +- [Filestore Documentation](https://cloud.google.com/filestore/docs) diff --git a/envs/gcp/dev/main.tf b/envs/gcp/dev/main.tf new file mode 100755 index 0000000..298ea33 --- /dev/null +++ b/envs/gcp/dev/main.tf @@ -0,0 +1,291 @@ +locals { + enable_build_env = var.enable_build_env + enable_gcs = var.enable_gcs + enable_cloud_sql = var.enable_cloud_sql + enable_redis = var.enable_redis + enable_filestore = var.enable_filestore + enable_apps = var.enable_apps + enable_ingress = var.enable_ingress +} + +module "network" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/network" + + name_prefix = var.name_prefix + region = var.region + network_name = var.network_name + subnet_name = var.subnet_name + subnet_cidr = var.subnet_cidr + pods_secondary_range = var.pods_secondary_range + services_secondary_range = var.services_secondary_range +} + +module "artifact_registry" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/artifact_registry" + + location = var.artifact_registry_location + repo_name = var.artifact_registry_repo +} + +module "gke" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/gke" + + project_id = var.project_id + region = var.region + cluster_name = var.cluster_name + + network_self_link = module.network[0].network_self_link + subnetwork_self_link = module.network[0].subnetwork_self_link + + ip_range_pods_name = module.network[0].pods_secondary_range_name + ip_range_services_name = module.network[0].services_secondary_range_name + + logging_service = var.enable_logging ? 
"logging.googleapis.com/kubernetes" : "none" + monitoring_service = var.enable_monitoring ? "monitoring.googleapis.com/kubernetes" : "none" +} + +module "iam" { + source = "../../../modules/gcp/iam" + + project_id = var.project_id + prefix = coalesce(var.app_suffix, var.name_prefix) + service_accounts = var.iam_service_accounts +} + +module "monitoring" { + source = "../../../modules/gcp/monitoring" + + project_id = var.project_id + enable_logging = var.enable_logging + enable_monitoring = var.enable_monitoring + enable_alerts = var.enable_alerts + alert_notification_channels = var.alert_notification_channels + log_sink_name = var.log_sink_name + log_sink_destination = var.log_sink_destination +} + +module "nodepool" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/gke/nodepool" + + project_id = var.project_id + region = var.region + cluster_name = module.gke[0].cluster_name + + name = var.nodepool_name + machine_type = var.node_machine_type + disk_size_gb = var.node_disk_size_gb + + min_count = var.node_min_count + max_count = var.node_max_count + + labels = { + nodepool = var.nodepool_name + } + + taints = [ + { + key = "dedicated" + value = "orion-build" + effect = "NO_SCHEDULE" + } + ] +} + +module "gcs" { + count = local.enable_gcs ? 1 : 0 + source = "../../../modules/gcp/gcs" + + name = var.gcs_bucket + location = var.region + force_destroy = var.gcs_force_destroy + uniform_bucket_level_access = var.gcs_uniform_bucket_level_access +} + +module "cloud_sql" { + count = local.enable_cloud_sql ? 1 : 0 + source = "../../../modules/gcp/cloud_sql" + + name = var.cloud_sql_instance_name + database_version = var.cloud_sql_database_version + region = var.region + tier = var.cloud_sql_tier + disk_size = var.cloud_sql_disk_size + disk_type = var.cloud_sql_disk_type + availability_type = var.cloud_sql_availability_type + private_network = local.enable_build_env ? 
module.network[0].network_self_link : "" + private_ip_prefix_length = var.cloud_sql_private_ip_prefix_length + enable_private_service_connection = var.cloud_sql_enable_private_service_connection + enable_public_ip = var.cloud_sql_enable_public_ip + db_name = var.cloud_sql_db_name + db_username = var.db_username + db_password = var.db_password + backup_enabled = var.cloud_sql_backup_enabled + deletion_protection = var.cloud_sql_deletion_protection +} + +module "redis" { + count = local.enable_redis ? 1 : 0 + source = "../../../modules/gcp/redis" + + name = var.redis_instance_name + region = var.region + tier = var.redis_tier + memory_size_gb = var.redis_memory_size_gb + network = local.enable_build_env ? module.network[0].network_self_link : "" + transit_encryption_mode = var.redis_transit_encryption_mode +} + +module "filestore" { + count = local.enable_filestore ? 1 : 0 + source = "../../../modules/gcp/filestore" + + name = var.filestore_instance_name + location = var.zone != "" ? var.zone : "${var.region}-b" + network = local.enable_build_env ? module.network[0].network_self_link : "" + tier = var.filestore_tier + capacity_gb = var.filestore_capacity_gb + file_share_name = var.filestore_file_share_name + reserved_ip_range = var.filestore_reserved_ip_range +} + +module "gke_service" { + count = local.enable_apps ? 
1 : 0 + source = "../../../modules/gcp/gke_service" + + name = var.app_service_name + namespace = var.app_namespace + image = var.app_image + container_port = var.app_container_port + env = var.app_env + volumes = var.app_volumes + volume_mounts = var.app_volume_mounts + replicas = var.app_replicas + service_type = var.app_service_type + cpu_request = var.app_cpu_request + memory_request = var.app_memory_request + cpu_limit = var.app_cpu_limit + memory_limit = var.app_memory_limit + enable_hpa = var.app_enable_hpa + hpa_min_replicas = var.app_hpa_min_replicas + hpa_max_replicas = var.app_hpa_max_replicas + hpa_cpu_utilization = var.app_hpa_cpu_utilization +} + +module "ingress" { + count = local.enable_ingress ? 1 : 0 + source = "../../../modules/gcp/ingress" + + name = var.ingress_name + namespace = var.ingress_namespace + static_ip_name = var.ingress_static_ip_name + ingress_class_name = var.ingress_class_name + managed_certificate_domains = var.ingress_managed_certificate_domains + rules = var.ingress_rules +} + +output "gke_cluster_name" { + value = local.enable_build_env ? module.gke[0].cluster_name : null +} + +output "gke_cluster_location" { + value = local.enable_build_env ? module.gke[0].location : null +} + +output "artifact_registry_repo" { + value = local.enable_build_env ? module.artifact_registry[0].repository : null +} + +output "gcs_bucket_name" { + value = local.enable_gcs ? module.gcs[0].bucket_name : null +} + +output "cloud_sql_db_endpoint" { + value = local.enable_cloud_sql ? module.cloud_sql[0].db_endpoint : null +} + +output "cloud_sql_connection_name" { + value = local.enable_cloud_sql ? module.cloud_sql[0].connection_name : null +} + +output "redis_host" { + value = local.enable_redis ? module.redis[0].host : null +} + +output "redis_port" { + value = local.enable_redis ? module.redis[0].port : null +} + +output "pg_endpoint" { + value = local.enable_cloud_sql ? 
module.cloud_sql[0].db_endpoint : null +} + +output "valkey_endpoint" { + value = local.enable_redis ? [{ address = module.redis[0].host, port = module.redis[0].port }] : null +} + +output "alb_dns_name" { + value = local.enable_ingress ? coalesce(module.ingress[0].ip_address, module.ingress[0].hostname) : null +} + +output "filestore_instance_name" { + value = local.enable_filestore ? module.filestore[0].instance_name : null +} + +output "filestore_file_share_name" { + value = local.enable_filestore ? module.filestore[0].file_share_name : null +} + +output "filestore_ip_address" { + value = local.enable_filestore ? module.filestore[0].ip_address : null +} + +output "iam_service_accounts" { + description = "Created service accounts with emails and names" + value = module.iam.service_accounts +} + +output "iam_workload_identity_bindings" { + description = "Workload Identity bindings (K8s SA -> GCP SA)" + value = module.iam.workload_identity_bindings +} + +output "project_id" { + description = "GCP project ID" + value = var.project_id +} + +module "orion_worker" { + count = var.enable_orion_worker ? 
1 : 0 + source = "../../../modules/gcp/orion_worker" + + namespace = "orion-worker" + image = var.orion_worker_image + server_ws = var.orion_worker_server_ws + + scorpio_base_url = var.orion_worker_scorpio_base_url + scorpio_lfs_url = var.orion_worker_scorpio_lfs_url + rust_log = var.orion_worker_rust_log + + tolerations = [ + { + key = "dedicated" + operator = "Equal" + value = "orion-build" + effect = "NoSchedule" + } + ] + + node_selector = { + nodepool = var.orion_worker_nodepool_name + } + + cpu_request = var.orion_worker_cpu_request + memory_request = var.orion_worker_memory_request + cpu_limit = var.orion_worker_cpu_limit + memory_limit = var.orion_worker_memory_limit +} + diff --git a/envs/gcp/dev/providers.tf b/envs/gcp/dev/providers.tf new file mode 100755 index 0000000..96762f4 --- /dev/null +++ b/envs/gcp/dev/providers.tf @@ -0,0 +1,10 @@ +provider "google" { + project = var.project_id + region = var.region +} + +provider "google-beta" { + project = var.project_id + region = var.region +} + diff --git a/envs/gcp/dev/terraform.tfvars.example b/envs/gcp/dev/terraform.tfvars.example new file mode 100755 index 0000000..932e425 --- /dev/null +++ b/envs/gcp/dev/terraform.tfvars.example @@ -0,0 +1,79 @@ +# Required +project_id = "your-gcp-project-id" +base_domain = "git.gitmono.com" + +# Optional: Override defaults +# region = "us-central1" +# zone = "us-central1-b" + +# Storage (mapped from AWS s3_*) +storage_bucket = "mega-dev-storage" +# storage_key = "your-access-key" +# storage_secret_key = "your-secret-key" + +# Database +db_username = "mega_user" +db_password = "your-db-password" +db_schema = "mega_dev" + +# Rails/UI +rails_master_key = "your-rails-master-key" +rails_env = "development" +ui_env = "dev" + +# Application +app_suffix = "dev" +app_service_name = "mega-app" +app_image = "us-central1-docker.pkg.dev/your-gcp-project-id/orion-worker/mega:latest" +app_container_port = 80 +app_replicas = 1 + +# Ingress +ingress_name = "mega-ingress" 
+ingress_static_ip_name = "mega-dev-ip" +ingress_managed_certificate_domains = ["dev.mega.example.com"] +ingress_rules = [ + { + host = "dev.mega.example.com" + service_name = "mega-app" + service_port = 80 + } +] + +# Service Accounts (optional) +iam_service_accounts = { + mega_app = { + display_name = "Mega App Service Account" + roles = ["roles/cloudsql.client", "roles/storage.objectViewer"] + wi_bindings = [ + { + namespace = "default" + k8s_service_account_name = "mega-app-sa" + } + ] + } +} + +# Feature flags (set to false to disable components) +enable_build_env = true +enable_gcs = false +enable_cloud_sql = false +enable_redis = false +enable_filestore = false +enable_apps = false +enable_ingress = false +enable_logging = true +enable_monitoring = true +enable_alerts = false + +# Orion Worker (optional) +enable_orion_worker = true +# orion_worker_image = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" +# orion_worker_server_ws = "wss://orion.gitmono.com/ws" +# orion_worker_nodepool_name = "build-default" + +# Resource names (optional) +gcs_bucket = "mega-dev-storage" +cloud_sql_instance_name = "mega-dev-db" +redis_instance_name = "mega-dev-redis" +filestore_instance_name = "mega-dev-fs" diff --git a/envs/gcp/dev/variables.tf b/envs/gcp/dev/variables.tf new file mode 100755 index 0000000..1ddacef --- /dev/null +++ b/envs/gcp/dev/variables.tf @@ -0,0 +1,607 @@ +variable "project_id" { + type = string +} + +variable "region" { + type = string + default = "us-central1" +} + +variable "zone" { + type = string + description = "GCP zone for zonal resources (e.g. Filestore)." + default = "" +} + +variable "zones" { + type = list(string) + description = "Zones for the node pool. If empty, node_locations will not be set and GKE will choose." 
+ default = [] +} + +variable "name_prefix" { + type = string + default = "mega" +} + +variable "base_domain" { + type = string + default = "" +} + +variable "enable_build_env" { + type = bool + default = true +} + +variable "enable_gcs" { + type = bool + default = false +} + +variable "enable_cloud_sql" { + type = bool + default = false +} + +variable "enable_redis" { + type = bool + default = false +} + +variable "enable_filestore" { + type = bool + default = false +} + +variable "enable_apps" { + type = bool + default = false +} + +variable "enable_ingress" { + type = bool + default = false +} + +variable "network_name" { + type = string + default = "mega-gke-net" +} + +variable "subnet_name" { + type = string + default = "mega-gke-subnet" +} + +variable "subnet_cidr" { + type = string + default = "10.20.0.0/16" +} + +variable "pods_secondary_range" { + type = string + default = "10.21.0.0/16" +} + +variable "services_secondary_range" { + type = string + default = "10.22.0.0/16" +} + +variable "cluster_name" { + type = string + default = "mega-gke" +} + +variable "artifact_registry_location" { + type = string + default = "us-central1" +} + +variable "artifact_registry_repo" { + type = string + default = "orion-worker" +} + +variable "nodepool_name" { + type = string + default = "build-default" +} + +variable "node_machine_type" { + type = string + default = "n2-standard-8" +} + +variable "node_disk_size_gb" { + type = number + default = 200 +} + +variable "node_min_count" { + type = number + default = 0 +} + +variable "node_max_count" { + type = number + default = 10 +} + +variable "gcs_bucket" { + type = string + description = "GCS bucket name" + default = "" +} + +variable "gcs_force_destroy" { + type = bool + description = "Allow force deletion of bucket objects" + default = false +} + +variable "gcs_uniform_bucket_level_access" { + type = bool + description = "Enable uniform bucket-level access" + default = true +} + +variable "cloud_sql_instance_name" { + 
type = string + description = "Cloud SQL instance name" + default = "" +} + +variable "cloud_sql_database_version" { + type = string + description = "Cloud SQL database version (e.g. POSTGRES_17, MYSQL_8_0)" + default = "POSTGRES_17" +} + +variable "cloud_sql_tier" { + type = string + description = "Cloud SQL instance tier" + default = "db-g1-small" +} + +variable "cloud_sql_disk_size" { + type = number + description = "Cloud SQL disk size in GB" + default = 20 +} + +variable "cloud_sql_disk_type" { + type = string + description = "Cloud SQL disk type" + default = "PD_SSD" +} + +variable "cloud_sql_availability_type" { + type = string + description = "Cloud SQL availability type (ZONAL or REGIONAL)" + default = "ZONAL" +} + +variable "cloud_sql_private_ip_prefix_length" { + type = number + description = "Prefix length for private services range" + default = 16 +} + +variable "cloud_sql_enable_private_service_connection" { + type = bool + description = "Create private service networking connection" + default = true +} + +variable "cloud_sql_enable_public_ip" { + type = bool + description = "Enable public IPv4 for Cloud SQL" + default = false +} + +variable "cloud_sql_db_name" { + type = string + description = "Default database name" + default = "" +} + +variable "cloud_sql_backup_enabled" { + type = bool + description = "Enable automated backups" + default = true +} + +variable "cloud_sql_deletion_protection" { + type = bool + description = "Enable deletion protection" + default = false +} + +variable "redis_instance_name" { + type = string + description = "Memorystore instance name" + default = "" +} + +variable "redis_tier" { + type = string + description = "Memorystore tier" + default = "STANDARD_HA" +} + +variable "redis_memory_size_gb" { + type = number + description = "Memory size in GB" + default = 1 +} + +variable "redis_transit_encryption_mode" { + type = string + description = "Transit encryption mode" + default = "DISABLED" +} + +variable 
"filestore_instance_name" { + type = string + description = "Filestore instance name" + default = "" +} + +variable "filestore_tier" { + type = string + description = "Filestore tier" + default = "STANDARD" +} + +variable "filestore_capacity_gb" { + type = number + description = "Capacity in GB" + default = 1024 +} + +variable "filestore_file_share_name" { + type = string + description = "File share name" + default = "share1" +} + +variable "filestore_reserved_ip_range" { + type = string + description = "Optional reserved IP range (e.g. 10.0.20.0/29)" + default = null +} + +variable "app_service_name" { + type = string + description = "Kubernetes service name" + default = "" +} + +variable "app_namespace" { + type = string + description = "Kubernetes namespace" + default = "default" +} + +variable "app_image" { + type = string + description = "Container image" + default = "" +} + +variable "app_container_port" { + type = number + description = "Container port" + default = 80 +} + +variable "app_env" { + type = list(map(string)) + description = "Environment variables" + default = [] +} + +variable "app_volumes" { + type = list(object({ + name = string + nfs_server = string + nfs_path = string + })) + description = "Pod volumes (NFS only)" + default = [] +} + +variable "app_volume_mounts" { + type = list(object({ + name = string + mount_path = string + read_only = bool + })) + description = "Container volume mounts" + default = [] +} + +variable "app_replicas" { + type = number + description = "Number of replicas" + default = 1 +} + +variable "app_service_type" { + type = string + description = "Kubernetes service type" + default = "ClusterIP" +} + +variable "app_cpu_request" { + type = string + description = "CPU request" + default = null +} + +variable "app_memory_request" { + type = string + description = "Memory request" + default = null +} + +variable "app_cpu_limit" { + type = string + description = "CPU limit" + default = null +} + +variable "app_memory_limit" 
{ + type = string + description = "Memory limit" + default = null +} + +variable "app_enable_hpa" { + type = bool + description = "Enable HorizontalPodAutoscaler" + default = false +} + +variable "app_hpa_min_replicas" { + type = number + description = "HPA minimum replicas" + default = 1 +} + +variable "app_hpa_max_replicas" { + type = number + description = "HPA maximum replicas" + default = 5 +} + +variable "app_hpa_cpu_utilization" { + type = number + description = "Target CPU utilization percentage" + default = 80 +} + +variable "ingress_name" { + type = string + description = "Ingress name" + default = "" +} + +variable "ingress_namespace" { + type = string + description = "Kubernetes namespace" + default = "default" +} + +variable "ingress_static_ip_name" { + type = string + description = "Global static IP name for GCE ingress" + default = null +} + +variable "ingress_class_name" { + type = string + description = "Ingress class name" + default = "gce" +} + +variable "ingress_managed_certificate_domains" { + type = list(string) + description = "Domains for GKE ManagedCertificate" + default = [] +} + +variable "ingress_rules" { + type = list(object({ + host = string + service_name = string + service_port = number + })) + description = "Ingress host rules" + default = [] +} + +variable "storage_key" { + type = string + description = "Storage access key (mapped from AWS s3_key)" + default = "" + sensitive = true +} + +variable "storage_secret_key" { + type = string + description = "Storage secret key (mapped from AWS s3_secret_key)" + default = "" + sensitive = true +} + +variable "storage_bucket" { + type = string + description = "Storage bucket name (mapped from AWS s3_bucket)" + default = "" +} + +variable "db_username" { + type = string + description = "Database username" + default = "" + sensitive = true +} + +variable "db_password" { + type = string + description = "Database password" + default = "" + sensitive = true +} + +variable "db_schema" { + type = 
string + description = "Database schema name (compat with AWS envs/dev)" + default = "" +} + +variable "rails_master_key" { + type = string + description = "Rails master key (compat with AWS envs/dev)" + default = "" + sensitive = true +} + +variable "rails_env" { + type = string + description = "Rails env (compat with AWS envs/dev)" + default = "" +} + +variable "ui_env" { + type = string + description = "UI env (compat with AWS envs/dev)" + default = "" +} + +variable "app_suffix" { + type = string + description = "Application suffix (compat with AWS envs/dev)" + default = "" +} + +variable "iam_service_accounts" { + type = map(object({ + display_name = optional(string) + description = optional(string) + roles = optional(list(string), []) + wi_bindings = optional(list(object({ + namespace = string + k8s_service_account_name = string + })), []) + })) + description = "Service accounts to create and their IAM roles / Workload Identity bindings" + default = {} +} + +variable "enable_logging" { + type = bool + default = true + description = "Enable Cloud Logging for GKE" +} + +variable "enable_monitoring" { + type = bool + default = true + description = "Enable Cloud Monitoring for GKE" +} + +variable "enable_alerts" { + type = bool + default = false + description = "Enable example alert policies" +} + +variable "alert_notification_channels" { + type = list(string) + default = [] + description = "List of notification channel IDs for alerts" +} + +variable "log_sink_name" { + type = string + default = "" + description = "Optional log sink name for exporting logs" +} + +variable "log_sink_destination" { + type = string + default = "" + description = "Optional log sink destination" +} + +variable "enable_orion_worker" { + type = bool + default = false + description = "Enable Orion Worker deployment" +} + +variable "orion_worker_image" { + type = string + default = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" + description = "Orion Worker container 
image" +} + +variable "orion_worker_server_ws" { + type = string + default = "wss://orion.gitmono.com/ws" + description = "Orion server WebSocket URL" +} + +variable "orion_worker_scorpio_base_url" { + type = string + default = "https://git.gitmono.com" + description = "Scorpio base URL" +} + +variable "orion_worker_scorpio_lfs_url" { + type = string + default = "https://git.gitmono.com" + description = "Scorpio LFS URL" +} + +variable "orion_worker_rust_log" { + type = string + default = "info" + description = "Rust log level" +} + +variable "orion_worker_nodepool_name" { + type = string + default = "build-default" + description = "Node pool name for Orion Worker scheduling" +} + +variable "orion_worker_cpu_request" { + type = string + default = "6" + description = "CPU request for Orion Worker" +} + +variable "orion_worker_memory_request" { + type = string + default = "24Gi" + description = "Memory request for Orion Worker" +} + +variable "orion_worker_cpu_limit" { + type = string + default = "8" + description = "CPU limit for Orion Worker" +} + +variable "orion_worker_memory_limit" { + type = string + default = "30Gi" + description = "Memory limit for Orion Worker" +} + + diff --git a/envs/gcp/dev/versions.tf b/envs/gcp/dev/versions.tf new file mode 100755 index 0000000..f6554e0 --- /dev/null +++ b/envs/gcp/dev/versions.tf @@ -0,0 +1,15 @@ +terraform { + required_version = ">= 1.5.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = ">= 5.0" + } + } +} + diff --git a/envs/gcp/prod/main.tf b/envs/gcp/prod/main.tf new file mode 100755 index 0000000..63340c6 --- /dev/null +++ b/envs/gcp/prod/main.tf @@ -0,0 +1,295 @@ +locals { + enable_build_env = var.enable_build_env + enable_gcs = var.enable_gcs + enable_cloud_sql = var.enable_cloud_sql + enable_redis = var.enable_redis + enable_filestore = var.enable_filestore + enable_apps = var.enable_apps + 
enable_ingress = var.enable_ingress +} + +module "network" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/network" + + name_prefix = var.name_prefix + region = var.region + network_name = var.network_name + subnet_name = var.subnet_name + subnet_cidr = var.subnet_cidr + pods_secondary_range = var.pods_secondary_range + services_secondary_range = var.services_secondary_range +} + +module "artifact_registry" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/artifact_registry" + + location = var.artifact_registry_location + repo_name = var.artifact_registry_repo +} + +module "gke" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/gke" + + project_id = var.project_id + region = var.region + cluster_name = var.cluster_name + + network_self_link = module.network[0].network_self_link + subnetwork_self_link = module.network[0].subnetwork_self_link + + ip_range_pods_name = module.network[0].pods_secondary_range_name + ip_range_services_name = module.network[0].services_secondary_range_name + + logging_service = var.enable_logging ? "logging.googleapis.com/kubernetes" : "none" + monitoring_service = var.enable_monitoring ? "monitoring.googleapis.com/kubernetes" : "none" +} + +module "nodepool" { + count = local.enable_build_env ? 
1 : 0 + source = "../../../modules/gcp/gke/nodepool" + + project_id = var.project_id + region = var.region + cluster_name = module.gke[0].cluster_name + + name = var.nodepool_name + machine_type = var.node_machine_type + disk_size_gb = var.node_disk_size_gb + + min_count = var.node_min_count + max_count = var.node_max_count + + labels = { + nodepool = var.nodepool_name + } + + taints = [ + { + key = "dedicated" + value = "orion-build" + effect = "NO_SCHEDULE" + } + ] +} + +module "iam" { + source = "../../../modules/gcp/iam" + + project_id = var.project_id + prefix = coalesce(var.app_suffix, var.name_prefix) + service_accounts = var.iam_service_accounts +} + +module "monitoring" { + source = "../../../modules/gcp/monitoring" + + project_id = var.project_id + enable_logging = var.enable_logging + enable_monitoring = var.enable_monitoring + enable_alerts = var.enable_alerts + alert_notification_channels = var.alert_notification_channels + log_sink_name = var.log_sink_name + log_sink_destination = var.log_sink_destination +} + +module "gcs" { + count = local.enable_gcs ? 1 : 0 + source = "../../../modules/gcp/gcs" + + name = var.gcs_bucket + location = var.region + force_destroy = var.gcs_force_destroy + uniform_bucket_level_access = var.gcs_uniform_bucket_level_access +} + +module "cloud_sql" { + count = local.enable_cloud_sql ? 1 : 0 + source = "../../../modules/gcp/cloud_sql" + + name = var.cloud_sql_instance_name + database_version = var.cloud_sql_database_version + region = var.region + tier = var.cloud_sql_tier + disk_size = var.cloud_sql_disk_size + disk_type = var.cloud_sql_disk_type + availability_type = var.cloud_sql_availability_type + private_network = local.enable_build_env ? 
module.network[0].network_self_link : "" + private_ip_prefix_length = var.cloud_sql_private_ip_prefix_length + enable_private_service_connection = var.cloud_sql_enable_private_service_connection + enable_public_ip = var.cloud_sql_enable_public_ip + db_name = var.cloud_sql_db_name + db_username = var.db_username + db_password = var.db_password + backup_enabled = var.cloud_sql_backup_enabled + deletion_protection = var.cloud_sql_deletion_protection +} + +module "redis" { + count = local.enable_redis ? 1 : 0 + source = "../../../modules/gcp/redis" + + name = var.redis_instance_name + region = var.region + tier = var.redis_tier + memory_size_gb = var.redis_memory_size_gb + network = local.enable_build_env ? module.network[0].network_self_link : "" + transit_encryption_mode = var.redis_transit_encryption_mode +} + +module "filestore" { + count = local.enable_filestore ? 1 : 0 + source = "../../../modules/gcp/filestore" + + name = var.filestore_instance_name + location = var.zone != "" ? var.zone : "${var.region}-b" + network = local.enable_build_env ? module.network[0].network_self_link : "" + tier = var.filestore_tier + capacity_gb = var.filestore_capacity_gb + file_share_name = var.filestore_file_share_name + reserved_ip_range = var.filestore_reserved_ip_range +} + +module "gke_service" { + count = local.enable_apps ? 
1 : 0 + source = "../../../modules/gcp/gke_service" + + name = var.app_service_name + namespace = var.app_namespace + image = var.app_image + container_port = var.app_container_port + env = var.app_env + volumes = var.app_volumes + volume_mounts = var.app_volume_mounts + replicas = var.app_replicas + service_type = var.app_service_type + cpu_request = var.app_cpu_request + memory_request = var.app_memory_request + cpu_limit = var.app_cpu_limit + memory_limit = var.app_memory_limit + enable_hpa = var.app_enable_hpa + hpa_min_replicas = var.app_hpa_min_replicas + hpa_max_replicas = var.app_hpa_max_replicas + hpa_cpu_utilization = var.app_hpa_cpu_utilization +} + +module "ingress" { + count = local.enable_ingress ? 1 : 0 + source = "../../../modules/gcp/ingress" + + name = var.ingress_name + namespace = var.ingress_namespace + static_ip_name = var.ingress_static_ip_name + ingress_class_name = var.ingress_class_name + managed_certificate_domains = var.ingress_managed_certificate_domains + rules = var.ingress_rules +} + +output "gke_cluster_name" { + value = local.enable_build_env ? module.gke[0].cluster_name : null +} + +output "gke_cluster_location" { + value = local.enable_build_env ? module.gke[0].location : null +} + +output "artifact_registry_repo" { + value = local.enable_build_env ? module.artifact_registry[0].repository : null +} + +output "gcs_bucket_name" { + value = local.enable_gcs ? module.gcs[0].bucket_name : null +} + +output "cloud_sql_db_endpoint" { + value = local.enable_cloud_sql ? module.cloud_sql[0].db_endpoint : null +} + +output "cloud_sql_connection_name" { + value = local.enable_cloud_sql ? module.cloud_sql[0].connection_name : null +} + +output "redis_host" { + value = local.enable_redis ? module.redis[0].host : null +} + +output "redis_port" { + value = local.enable_redis ? module.redis[0].port : null +} + +output "pg_endpoint" { + value = local.enable_cloud_sql ? 
module.cloud_sql[0].db_endpoint : null +} + +output "valkey_endpoint" { + value = local.enable_redis ? [{ address = module.redis[0].host, port = module.redis[0].port }] : null +} + +output "alb_dns_name" { + value = local.enable_ingress ? coalesce(module.ingress[0].ip_address, module.ingress[0].hostname) : null +} + +output "filestore_instance_name" { + value = local.enable_filestore ? module.filestore[0].instance_name : null +} + +output "filestore_file_share_name" { + value = local.enable_filestore ? module.filestore[0].file_share_name : null +} + +output "filestore_ip_address" { + value = local.enable_filestore ? module.filestore[0].ip_address : null +} + +output "iam_service_accounts" { + description = "Created service accounts with emails and names" + value = module.iam.service_accounts +} + +output "iam_workload_identity_bindings" { + description = "Workload Identity bindings (K8s SA -> GCP SA)" + value = module.iam.workload_identity_bindings +} + +output "project_id" { + description = "GCP project ID" + value = var.project_id +} + +output "monitoring_logging_api_enabled" { + description = "Whether Logging/Monitoring APIs are enabled" + value = module.monitoring.logging_api_enabled && module.monitoring.monitoring_api_enabled +} + +module "orion_worker" { + count = var.enable_orion_worker ? 
1 : 0 + source = "../../../modules/gcp/orion_worker" + + namespace = "orion-worker" + image = var.orion_worker_image + server_ws = var.orion_worker_server_ws + + scorpio_base_url = var.orion_worker_scorpio_base_url + scorpio_lfs_url = var.orion_worker_scorpio_lfs_url + rust_log = var.orion_worker_rust_log + + tolerations = [ + { + key = "dedicated" + operator = "Equal" + value = "orion-build" + effect = "NoSchedule" + } + ] + + node_selector = { + nodepool = var.orion_worker_nodepool_name + } + + cpu_request = var.orion_worker_cpu_request + memory_request = var.orion_worker_memory_request + cpu_limit = var.orion_worker_cpu_limit + memory_limit = var.orion_worker_memory_limit +} diff --git a/envs/gcp/prod/providers.tf b/envs/gcp/prod/providers.tf new file mode 100755 index 0000000..65f1f89 --- /dev/null +++ b/envs/gcp/prod/providers.tf @@ -0,0 +1,9 @@ +provider "google" { + project = var.project_id + region = var.region +} + +provider "google-beta" { + project = var.project_id + region = var.region +} diff --git a/envs/gcp/prod/terraform.tfvars.example b/envs/gcp/prod/terraform.tfvars.example new file mode 100755 index 0000000..02396a5 --- /dev/null +++ b/envs/gcp/prod/terraform.tfvars.example @@ -0,0 +1,82 @@ +# Required +project_id = "your-gcp-project-id" +base_domain = "mega.example.com" + +# Optional: Override defaults +# region = "us-central1" +# zone = "us-central1-b" + +# Storage (mapped from AWS s3_*) +storage_bucket = "mega-prod-storage" +# storage_key = "your-access-key" +# storage_secret_key = "your-secret-key" + +# Database +db_username = "mega_user" +db_password = "your-prod-db-password" +db_schema = "mega_prod" + +# Rails/UI +rails_master_key = "your-prod-rails-master-key" +rails_env = "production" +ui_env = "production" + +# Application +app_suffix = "prod" +app_service_name = "mega-app" +app_image = "us-central1-docker.pkg.dev/your-gcp-project-id/orion-worker-prod/mega:latest" +app_container_port = 80 +app_replicas = 3 + +# Ingress +ingress_name = 
"mega-ingress" +ingress_static_ip_name = "mega-prod-ip" +ingress_managed_certificate_domains = ["mega.example.com"] +ingress_rules = [ + { + host = "mega.example.com" + service_name = "mega-app" + service_port = 80 + } +] + +# Service Accounts (optional) +iam_service_accounts = { + mega_app = { + display_name = "Mega App Service Account" + roles = ["roles/cloudsql.client", "roles/storage.objectViewer"] + wi_bindings = [ + { + namespace = "default" + k8s_service_account_name = "mega-app-sa" + } + ] + } +} + +# Feature flags (set to false to disable components) +enable_build_env = true +enable_gcs = true +enable_cloud_sql = true +enable_redis = true +enable_filestore = true +enable_apps = true +enable_ingress = true +enable_logging = true +enable_monitoring = true +enable_alerts = true + +# Resource names (optional) +gcs_bucket = "mega-prod-storage" +cloud_sql_instance_name = "mega-prod-db" +redis_instance_name = "mega-prod-redis" +filestore_instance_name = "mega-prod-fs" + +# Orion Worker (optional) +enable_orion_worker = false +# orion_worker_image = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" +# orion_worker_server_ws = "wss://orion.gitmono.com/ws" +# orion_worker_nodepool_name = "prod-default" + +# Alert notification channels (optional) +# alert_notification_channels = ["projects/your-project/notificationChannels/1234567890"] diff --git a/envs/gcp/prod/variables.tf b/envs/gcp/prod/variables.tf new file mode 100755 index 0000000..1a72ca0 --- /dev/null +++ b/envs/gcp/prod/variables.tf @@ -0,0 +1,602 @@ +variable "project_id" { + type = string +} + +variable "region" { + type = string + default = "us-central1" +} + +variable "zone" { + type = string + description = "GCP zone for zonal resources (e.g. Filestore)." + default = "" +} + +variable "zones" { + type = list(string) + description = "Zones for the node pool. If empty, node_locations will not be set and GKE will choose." 
+ default = [] +} + +variable "name_prefix" { + type = string + default = "mega-prod" +} + +variable "base_domain" { + type = string + default = "" +} + +variable "enable_build_env" { + type = bool + default = true +} + +variable "enable_gcs" { + type = bool + default = true +} + +variable "enable_cloud_sql" { + type = bool + default = true +} + +variable "enable_redis" { + type = bool + default = true +} + +variable "enable_filestore" { + type = bool + default = true +} + +variable "enable_apps" { + type = bool + default = true +} + +variable "enable_ingress" { + type = bool + default = true +} + +variable "enable_logging" { + type = bool + default = true +} + +variable "enable_monitoring" { + type = bool + default = true +} + +variable "enable_alerts" { + type = bool + default = true +} + +variable "alert_notification_channels" { + type = list(string) + default = [] + description = "List of notification channel IDs for alerts" +} + +variable "log_sink_name" { + type = string + default = "" + description = "Optional log sink name for exporting logs" +} + +variable "log_sink_destination" { + type = string + default = "" + description = "Optional log sink destination" +} + +variable "network_name" { + type = string + default = "mega-prod-net" +} + +variable "subnet_name" { + type = string + default = "mega-prod-subnet" +} + +variable "subnet_cidr" { + type = string + default = "10.40.0.0/16" +} + +variable "pods_secondary_range" { + type = string + default = "10.41.0.0/16" +} + +variable "services_secondary_range" { + type = string + default = "10.42.0.0/16" +} + +variable "cluster_name" { + type = string + default = "mega-prod" +} + +variable "artifact_registry_location" { + type = string + default = "us-central1" +} + +variable "artifact_registry_repo" { + type = string + default = "orion-worker-prod" +} + +variable "nodepool_name" { + type = string + default = "prod-default" +} + +variable "node_machine_type" { + type = string + default = "e2-standard-8" +} + 
+variable "node_disk_size_gb" { + type = number + default = 200 +} + +variable "node_min_count" { + type = number + default = 2 +} + +variable "node_max_count" { + type = number + default = 20 +} + +variable "gcs_bucket" { + type = string + description = "GCS bucket name" + default = "" +} + +variable "gcs_force_destroy" { + type = bool + description = "Allow force deletion of bucket objects" + default = false +} + +variable "gcs_uniform_bucket_level_access" { + type = bool + description = "Enable uniform bucket-level access" + default = true +} + +variable "cloud_sql_instance_name" { + type = string + description = "Cloud SQL instance name" + default = "" +} + +variable "cloud_sql_database_version" { + type = string + description = "Cloud SQL database version (e.g. POSTGRES_17, MYSQL_8_0)" + default = "POSTGRES_17" +} + +variable "cloud_sql_tier" { + type = string + description = "Cloud SQL instance tier" + default = "db-g1-small" +} + +variable "cloud_sql_disk_size" { + type = number + description = "Cloud SQL disk size in GB" + default = 100 +} + +variable "cloud_sql_disk_type" { + type = string + description = "Cloud SQL disk type" + default = "PD_SSD" +} + +variable "cloud_sql_availability_type" { + type = string + description = "Cloud SQL availability type (ZONAL or REGIONAL)" + default = "REGIONAL" +} + +variable "cloud_sql_private_ip_prefix_length" { + type = number + description = "Prefix length for private services range" + default = 16 +} + +variable "cloud_sql_enable_private_service_connection" { + type = bool + description = "Create private service networking connection" + default = true +} + +variable "cloud_sql_enable_public_ip" { + type = bool + description = "Enable public IPv4 for Cloud SQL" + default = false +} + +variable "cloud_sql_db_name" { + type = string + description = "Default database name" + default = "" +} + +variable "cloud_sql_backup_enabled" { + type = bool + description = "Enable automated backups" + default = true +} + +variable 
"cloud_sql_deletion_protection" { + type = bool + description = "Enable deletion protection" + default = true +} + +variable "redis_instance_name" { + type = string + description = "Memorystore instance name" + default = "" +} + +variable "redis_tier" { + type = string + description = "Memorystore tier" + default = "STANDARD_HA" +} + +variable "redis_memory_size_gb" { + type = number + description = "Memory size in GB" + default = 4 +} + +variable "redis_transit_encryption_mode" { + type = string + description = "Transit encryption mode" + default = "DISABLED" +} + +variable "filestore_instance_name" { + type = string + description = "Filestore instance name" + default = "" +} + +variable "filestore_tier" { + type = string + description = "Filestore tier" + default = "STANDARD" +} + +variable "filestore_capacity_gb" { + type = number + description = "Capacity in GB" + default = 1024 +} + +variable "filestore_file_share_name" { + type = string + description = "File share name" + default = "share1" +} + +variable "filestore_reserved_ip_range" { + type = string + description = "Optional reserved IP range (e.g. 
10.0.20.0/29)" + default = null +} + +variable "app_service_name" { + type = string + description = "Kubernetes service name" + default = "" +} + +variable "app_namespace" { + type = string + description = "Kubernetes namespace" + default = "default" +} + +variable "app_image" { + type = string + description = "Container image" + default = "" +} + +variable "app_container_port" { + type = number + description = "Container port" + default = 80 +} + +variable "app_env" { + type = list(map(string)) + description = "Environment variables" + default = [] +} + +variable "app_volumes" { + type = list(object({ + name = string + nfs_server = string + nfs_path = string + })) + description = "Pod volumes (NFS only)" + default = [] +} + +variable "app_volume_mounts" { + type = list(object({ + name = string + mount_path = string + read_only = bool + })) + description = "Container volume mounts" + default = [] +} + +variable "app_replicas" { + type = number + description = "Number of replicas" + default = 3 +} + +variable "app_service_type" { + type = string + description = "Kubernetes service type" + default = "ClusterIP" +} + +variable "app_cpu_request" { + type = string + description = "CPU request" + default = null +} + +variable "app_memory_request" { + type = string + description = "Memory request" + default = null +} + +variable "app_cpu_limit" { + type = string + description = "CPU limit" + default = null +} + +variable "app_memory_limit" { + type = string + description = "Memory limit" + default = null +} + +variable "app_enable_hpa" { + type = bool + description = "Enable HorizontalPodAutoscaler" + default = true +} + +variable "app_hpa_min_replicas" { + type = number + description = "HPA minimum replicas" + default = 3 +} + +variable "app_hpa_max_replicas" { + type = number + description = "HPA maximum replicas" + default = 20 +} + +variable "app_hpa_cpu_utilization" { + type = number + description = "Target CPU utilization percentage" + default = 70 +} + +variable 
"ingress_name" { + type = string + description = "Ingress name" + default = "" +} + +variable "ingress_namespace" { + type = string + description = "Kubernetes namespace" + default = "default" +} + +variable "ingress_static_ip_name" { + type = string + description = "Global static IP name for GCE ingress" + default = null +} + +variable "ingress_class_name" { + type = string + description = "Ingress class name" + default = "gce" +} + +variable "ingress_managed_certificate_domains" { + type = list(string) + description = "Domains for GKE ManagedCertificate" + default = [] +} + +variable "ingress_rules" { + type = list(object({ + host = string + service_name = string + service_port = number + })) + description = "Ingress host rules" + default = [] +} + +variable "storage_key" { + type = string + description = "Storage access key (mapped from AWS s3_key)" + default = "" + sensitive = true +} + +variable "storage_secret_key" { + type = string + description = "Storage secret key (mapped from AWS s3_secret_key)" + default = "" + sensitive = true +} + +variable "storage_bucket" { + type = string + description = "Storage bucket name (mapped from AWS s3_bucket)" + default = "" +} + +variable "db_username" { + type = string + description = "Database username" + default = "" + sensitive = true +} + +variable "db_password" { + type = string + description = "Database password" + default = "" + sensitive = true +} + +variable "db_schema" { + type = string + description = "Database schema name (compat with AWS envs/dev)" + default = "" +} + +variable "rails_master_key" { + type = string + description = "Rails master key (compat with AWS envs/dev)" + default = "" + sensitive = true +} + +variable "rails_env" { + type = string + description = "Rails env (compat with AWS envs/dev)" + default = "" +} + +variable "ui_env" { + type = string + description = "UI env (compat with AWS envs/dev)" + default = "" +} + +variable "app_suffix" { + type = string + description = "Application 
suffix (compat with AWS envs/dev)" + default = "" +} + +variable "iam_service_accounts" { + type = map(object({ + display_name = optional(string) + description = optional(string) + roles = optional(list(string), []) + wi_bindings = optional(list(object({ + namespace = string + k8s_service_account_name = string + })), []) + })) + description = "Service accounts to create and their IAM roles / Workload Identity bindings" + default = {} +} + +variable "enable_orion_worker" { + type = bool + default = false + description = "Enable Orion Worker deployment" +} + +variable "orion_worker_image" { + type = string + default = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" + description = "Orion Worker container image" +} + +variable "orion_worker_server_ws" { + type = string + default = "wss://orion.gitmono.com/ws" + description = "Orion server WebSocket URL" +} + +variable "orion_worker_scorpio_base_url" { + type = string + default = "https://git.gitmono.com" + description = "Scorpio base URL" +} + +variable "orion_worker_scorpio_lfs_url" { + type = string + default = "https://git.gitmono.com" + description = "Scorpio LFS URL" +} + +variable "orion_worker_rust_log" { + type = string + default = "info" + description = "Rust log level" +} + +variable "orion_worker_nodepool_name" { + type = string + default = "prod-default" + description = "Node pool name for Orion Worker scheduling" +} + +variable "orion_worker_cpu_request" { + type = string + default = "6" + description = "CPU request for Orion Worker" +} + +variable "orion_worker_memory_request" { + type = string + default = "24Gi" + description = "Memory request for Orion Worker" +} + +variable "orion_worker_cpu_limit" { + type = string + default = "8" + description = "CPU limit for Orion Worker" +} + +variable "orion_worker_memory_limit" { + type = string + default = "30Gi" + description = "Memory limit for Orion Worker" +} diff --git a/envs/gcp/prod/versions.tf b/envs/gcp/prod/versions.tf new file 
mode 100755 index 0000000..7352bd2 --- /dev/null +++ b/envs/gcp/prod/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.5.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = ">= 5.0" + } + } +} diff --git a/envs/gcp/staging/main.tf b/envs/gcp/staging/main.tf new file mode 100755 index 0000000..63340c6 --- /dev/null +++ b/envs/gcp/staging/main.tf @@ -0,0 +1,295 @@ +locals { + enable_build_env = var.enable_build_env + enable_gcs = var.enable_gcs + enable_cloud_sql = var.enable_cloud_sql + enable_redis = var.enable_redis + enable_filestore = var.enable_filestore + enable_apps = var.enable_apps + enable_ingress = var.enable_ingress +} + +module "network" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/network" + + name_prefix = var.name_prefix + region = var.region + network_name = var.network_name + subnet_name = var.subnet_name + subnet_cidr = var.subnet_cidr + pods_secondary_range = var.pods_secondary_range + services_secondary_range = var.services_secondary_range +} + +module "artifact_registry" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/artifact_registry" + + location = var.artifact_registry_location + repo_name = var.artifact_registry_repo +} + +module "gke" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/gke" + + project_id = var.project_id + region = var.region + cluster_name = var.cluster_name + + network_self_link = module.network[0].network_self_link + subnetwork_self_link = module.network[0].subnetwork_self_link + + ip_range_pods_name = module.network[0].pods_secondary_range_name + ip_range_services_name = module.network[0].services_secondary_range_name + + logging_service = var.enable_logging ? "logging.googleapis.com/kubernetes" : "none" + monitoring_service = var.enable_monitoring ? 
"monitoring.googleapis.com/kubernetes" : "none" +} + +module "nodepool" { + count = local.enable_build_env ? 1 : 0 + source = "../../../modules/gcp/gke/nodepool" + + project_id = var.project_id + region = var.region + cluster_name = module.gke[0].cluster_name + + name = var.nodepool_name + machine_type = var.node_machine_type + disk_size_gb = var.node_disk_size_gb + + min_count = var.node_min_count + max_count = var.node_max_count + + labels = { + nodepool = var.nodepool_name + } + + taints = [ + { + key = "dedicated" + value = "orion-build" + effect = "NO_SCHEDULE" + } + ] +} + +module "iam" { + source = "../../../modules/gcp/iam" + + project_id = var.project_id + prefix = coalesce(var.app_suffix, var.name_prefix) + service_accounts = var.iam_service_accounts +} + +module "monitoring" { + source = "../../../modules/gcp/monitoring" + + project_id = var.project_id + enable_logging = var.enable_logging + enable_monitoring = var.enable_monitoring + enable_alerts = var.enable_alerts + alert_notification_channels = var.alert_notification_channels + log_sink_name = var.log_sink_name + log_sink_destination = var.log_sink_destination +} + +module "gcs" { + count = local.enable_gcs ? 1 : 0 + source = "../../../modules/gcp/gcs" + + name = var.gcs_bucket + location = var.region + force_destroy = var.gcs_force_destroy + uniform_bucket_level_access = var.gcs_uniform_bucket_level_access +} + +module "cloud_sql" { + count = local.enable_cloud_sql ? 1 : 0 + source = "../../../modules/gcp/cloud_sql" + + name = var.cloud_sql_instance_name + database_version = var.cloud_sql_database_version + region = var.region + tier = var.cloud_sql_tier + disk_size = var.cloud_sql_disk_size + disk_type = var.cloud_sql_disk_type + availability_type = var.cloud_sql_availability_type + private_network = local.enable_build_env ? 
module.network[0].network_self_link : "" + private_ip_prefix_length = var.cloud_sql_private_ip_prefix_length + enable_private_service_connection = var.cloud_sql_enable_private_service_connection + enable_public_ip = var.cloud_sql_enable_public_ip + db_name = var.cloud_sql_db_name + db_username = var.db_username + db_password = var.db_password + backup_enabled = var.cloud_sql_backup_enabled + deletion_protection = var.cloud_sql_deletion_protection +} + +module "redis" { + count = local.enable_redis ? 1 : 0 + source = "../../../modules/gcp/redis" + + name = var.redis_instance_name + region = var.region + tier = var.redis_tier + memory_size_gb = var.redis_memory_size_gb + network = local.enable_build_env ? module.network[0].network_self_link : "" + transit_encryption_mode = var.redis_transit_encryption_mode +} + +module "filestore" { + count = local.enable_filestore ? 1 : 0 + source = "../../../modules/gcp/filestore" + + name = var.filestore_instance_name + location = var.zone != "" ? var.zone : "${var.region}-b" + network = local.enable_build_env ? module.network[0].network_self_link : "" + tier = var.filestore_tier + capacity_gb = var.filestore_capacity_gb + file_share_name = var.filestore_file_share_name + reserved_ip_range = var.filestore_reserved_ip_range +} + +module "gke_service" { + count = local.enable_apps ? 
1 : 0 + source = "../../../modules/gcp/gke_service" + + name = var.app_service_name + namespace = var.app_namespace + image = var.app_image + container_port = var.app_container_port + env = var.app_env + volumes = var.app_volumes + volume_mounts = var.app_volume_mounts + replicas = var.app_replicas + service_type = var.app_service_type + cpu_request = var.app_cpu_request + memory_request = var.app_memory_request + cpu_limit = var.app_cpu_limit + memory_limit = var.app_memory_limit + enable_hpa = var.app_enable_hpa + hpa_min_replicas = var.app_hpa_min_replicas + hpa_max_replicas = var.app_hpa_max_replicas + hpa_cpu_utilization = var.app_hpa_cpu_utilization +} + +module "ingress" { + count = local.enable_ingress ? 1 : 0 + source = "../../../modules/gcp/ingress" + + name = var.ingress_name + namespace = var.ingress_namespace + static_ip_name = var.ingress_static_ip_name + ingress_class_name = var.ingress_class_name + managed_certificate_domains = var.ingress_managed_certificate_domains + rules = var.ingress_rules +} + +output "gke_cluster_name" { + value = local.enable_build_env ? module.gke[0].cluster_name : null +} + +output "gke_cluster_location" { + value = local.enable_build_env ? module.gke[0].location : null +} + +output "artifact_registry_repo" { + value = local.enable_build_env ? module.artifact_registry[0].repository : null +} + +output "gcs_bucket_name" { + value = local.enable_gcs ? module.gcs[0].bucket_name : null +} + +output "cloud_sql_db_endpoint" { + value = local.enable_cloud_sql ? module.cloud_sql[0].db_endpoint : null +} + +output "cloud_sql_connection_name" { + value = local.enable_cloud_sql ? module.cloud_sql[0].connection_name : null +} + +output "redis_host" { + value = local.enable_redis ? module.redis[0].host : null +} + +output "redis_port" { + value = local.enable_redis ? module.redis[0].port : null +} + +output "pg_endpoint" { + value = local.enable_cloud_sql ? 
module.cloud_sql[0].db_endpoint : null +} + +output "valkey_endpoint" { + value = local.enable_redis ? [{ address = module.redis[0].host, port = module.redis[0].port }] : null +} + +output "alb_dns_name" { + value = local.enable_ingress ? coalesce(module.ingress[0].ip_address, module.ingress[0].hostname) : null +} + +output "filestore_instance_name" { + value = local.enable_filestore ? module.filestore[0].instance_name : null +} + +output "filestore_file_share_name" { + value = local.enable_filestore ? module.filestore[0].file_share_name : null +} + +output "filestore_ip_address" { + value = local.enable_filestore ? module.filestore[0].ip_address : null +} + +output "iam_service_accounts" { + description = "Created service accounts with emails and names" + value = module.iam.service_accounts +} + +output "iam_workload_identity_bindings" { + description = "Workload Identity bindings (K8s SA -> GCP SA)" + value = module.iam.workload_identity_bindings +} + +output "project_id" { + description = "GCP project ID" + value = var.project_id +} + +output "monitoring_logging_api_enabled" { + description = "Whether Logging/Monitoring APIs are enabled" + value = module.monitoring.logging_api_enabled && module.monitoring.monitoring_api_enabled +} + +module "orion_worker" { + count = var.enable_orion_worker ? 
1 : 0 + source = "../../../modules/gcp/orion_worker" + + namespace = "orion-worker" + image = var.orion_worker_image + server_ws = var.orion_worker_server_ws + + scorpio_base_url = var.orion_worker_scorpio_base_url + scorpio_lfs_url = var.orion_worker_scorpio_lfs_url + rust_log = var.orion_worker_rust_log + + tolerations = [ + { + key = "dedicated" + operator = "Equal" + value = "orion-build" + effect = "NoSchedule" + } + ] + + node_selector = { + nodepool = var.orion_worker_nodepool_name + } + + cpu_request = var.orion_worker_cpu_request + memory_request = var.orion_worker_memory_request + cpu_limit = var.orion_worker_cpu_limit + memory_limit = var.orion_worker_memory_limit +} diff --git a/envs/gcp/staging/providers.tf b/envs/gcp/staging/providers.tf new file mode 100755 index 0000000..65f1f89 --- /dev/null +++ b/envs/gcp/staging/providers.tf @@ -0,0 +1,9 @@ +provider "google" { + project = var.project_id + region = var.region +} + +provider "google-beta" { + project = var.project_id + region = var.region +} diff --git a/envs/gcp/staging/terraform.tfvars.example b/envs/gcp/staging/terraform.tfvars.example new file mode 100755 index 0000000..dc7a87c --- /dev/null +++ b/envs/gcp/staging/terraform.tfvars.example @@ -0,0 +1,79 @@ +# Required +project_id = "your-gcp-project-id" +base_domain = "staging.mega.example.com" + +# Optional: Override defaults +# region = "us-central1" +# zone = "us-central1-b" + +# Storage (mapped from AWS s3_*) +storage_bucket = "mega-staging-storage" +# storage_key = "your-access-key" +# storage_secret_key = "your-secret-key" + +# Database +db_username = "mega_user" +db_password = "your-db-password" +db_schema = "mega_staging" + +# Rails/UI +rails_master_key = "your-rails-master-key" +rails_env = "staging" +ui_env = "staging" + +# Application +app_suffix = "staging" +app_service_name = "mega-app" +app_image = "us-central1-docker.pkg.dev/your-gcp-project-id/orion-worker-staging/mega:latest" +app_container_port = 80 +app_replicas = 2 + +# 
Ingress +ingress_name = "mega-ingress" +ingress_static_ip_name = "mega-staging-ip" +ingress_managed_certificate_domains = ["staging.mega.example.com"] +ingress_rules = [ + { + host = "staging.mega.example.com" + service_name = "mega-app" + service_port = 80 + } +] + +# Service Accounts (optional) +iam_service_accounts = { + mega_app = { + display_name = "Mega App Service Account" + roles = ["roles/cloudsql.client", "roles/storage.objectViewer"] + wi_bindings = [ + { + namespace = "default" + k8s_service_account_name = "mega-app-sa" + } + ] + } +} + +# Feature flags (set to false to disable components) +enable_build_env = true +enable_gcs = true +enable_cloud_sql = true +enable_redis = true +enable_filestore = true +enable_apps = true +enable_ingress = true +enable_logging = true +enable_monitoring = true +enable_alerts = false + +# Resource names (optional) +gcs_bucket = "mega-staging-storage" +cloud_sql_instance_name = "mega-staging-db" +redis_instance_name = "mega-staging-redis" +filestore_instance_name = "mega-staging-fs" + +# Orion Worker (optional) +enable_orion_worker = false +# orion_worker_image = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" +# orion_worker_server_ws = "wss://orion.gitmono.com/ws" +# orion_worker_nodepool_name = "staging-default" diff --git a/envs/gcp/staging/variables.tf b/envs/gcp/staging/variables.tf new file mode 100755 index 0000000..0ed569e --- /dev/null +++ b/envs/gcp/staging/variables.tf @@ -0,0 +1,602 @@ +variable "project_id" { + type = string +} + +variable "region" { + type = string + default = "us-central1" +} + +variable "zone" { + type = string + description = "GCP zone for zonal resources (e.g. Filestore)." + default = "" +} + +variable "zones" { + type = list(string) + description = "Zones for the node pool. If empty, node_locations will not be set and GKE will choose." 
+ default = [] +} + +variable "name_prefix" { + type = string + default = "mega-staging" +} + +variable "base_domain" { + type = string + default = "" +} + +variable "enable_build_env" { + type = bool + default = true +} + +variable "enable_gcs" { + type = bool + default = true +} + +variable "enable_cloud_sql" { + type = bool + default = true +} + +variable "enable_redis" { + type = bool + default = true +} + +variable "enable_filestore" { + type = bool + default = true +} + +variable "enable_apps" { + type = bool + default = true +} + +variable "enable_ingress" { + type = bool + default = true +} + +variable "enable_logging" { + type = bool + default = true +} + +variable "enable_monitoring" { + type = bool + default = true +} + +variable "enable_alerts" { + type = bool + default = false +} + +variable "alert_notification_channels" { + type = list(string) + default = [] + description = "List of notification channel IDs for alerts" +} + +variable "log_sink_name" { + type = string + default = "" + description = "Optional log sink name for exporting logs" +} + +variable "log_sink_destination" { + type = string + default = "" + description = "Optional log sink destination" +} + +variable "network_name" { + type = string + default = "mega-staging-net" +} + +variable "subnet_name" { + type = string + default = "mega-staging-subnet" +} + +variable "subnet_cidr" { + type = string + default = "10.30.0.0/16" +} + +variable "pods_secondary_range" { + type = string + default = "10.31.0.0/16" +} + +variable "services_secondary_range" { + type = string + default = "10.32.0.0/16" +} + +variable "cluster_name" { + type = string + default = "mega-staging" +} + +variable "artifact_registry_location" { + type = string + default = "us-central1" +} + +variable "artifact_registry_repo" { + type = string + default = "orion-worker-staging" +} + +variable "nodepool_name" { + type = string + default = "staging-default" +} + +variable "node_machine_type" { + type = string + default = 
"e2-standard-4" +} + +variable "node_disk_size_gb" { + type = number + default = 100 +} + +variable "node_min_count" { + type = number + default = 1 +} + +variable "node_max_count" { + type = number + default = 5 +} + +variable "gcs_bucket" { + type = string + description = "GCS bucket name" + default = "" +} + +variable "gcs_force_destroy" { + type = bool + description = "Allow force deletion of bucket objects" + default = false +} + +variable "gcs_uniform_bucket_level_access" { + type = bool + description = "Enable uniform bucket-level access" + default = true +} + +variable "cloud_sql_instance_name" { + type = string + description = "Cloud SQL instance name" + default = "" +} + +variable "cloud_sql_database_version" { + type = string + description = "Cloud SQL database version (e.g. POSTGRES_17, MYSQL_8_0)" + default = "POSTGRES_17" +} + +variable "cloud_sql_tier" { + type = string + description = "Cloud SQL instance tier" + default = "db-g1-small" +} + +variable "cloud_sql_disk_size" { + type = number + description = "Cloud SQL disk size in GB" + default = 20 +} + +variable "cloud_sql_disk_type" { + type = string + description = "Cloud SQL disk type" + default = "PD_SSD" +} + +variable "cloud_sql_availability_type" { + type = string + description = "Cloud SQL availability type (ZONAL or REGIONAL)" + default = "ZONAL" +} + +variable "cloud_sql_private_ip_prefix_length" { + type = number + description = "Prefix length for private services range" + default = 16 +} + +variable "cloud_sql_enable_private_service_connection" { + type = bool + description = "Create private service networking connection" + default = true +} + +variable "cloud_sql_enable_public_ip" { + type = bool + description = "Enable public IPv4 for Cloud SQL" + default = false +} + +variable "cloud_sql_db_name" { + type = string + description = "Default database name" + default = "" +} + +variable "cloud_sql_backup_enabled" { + type = bool + description = "Enable automated backups" + default = true 
+} + +variable "cloud_sql_deletion_protection" { + type = bool + description = "Enable deletion protection" + default = false +} + +variable "redis_instance_name" { + type = string + description = "Memorystore instance name" + default = "" +} + +variable "redis_tier" { + type = string + description = "Memorystore tier" + default = "STANDARD_HA" +} + +variable "redis_memory_size_gb" { + type = number + description = "Memory size in GB" + default = 2 +} + +variable "redis_transit_encryption_mode" { + type = string + description = "Transit encryption mode" + default = "DISABLED" +} + +variable "filestore_instance_name" { + type = string + description = "Filestore instance name" + default = "" +} + +variable "filestore_tier" { + type = string + description = "Filestore tier" + default = "STANDARD" +} + +variable "filestore_capacity_gb" { + type = number + description = "Capacity in GB" + default = 1024 +} + +variable "filestore_file_share_name" { + type = string + description = "File share name" + default = "share1" +} + +variable "filestore_reserved_ip_range" { + type = string + description = "Optional reserved IP range (e.g. 
10.0.20.0/29)" + default = null +} + +variable "app_service_name" { + type = string + description = "Kubernetes service name" + default = "" +} + +variable "app_namespace" { + type = string + description = "Kubernetes namespace" + default = "default" +} + +variable "app_image" { + type = string + description = "Container image" + default = "" +} + +variable "app_container_port" { + type = number + description = "Container port" + default = 80 +} + +variable "app_env" { + type = list(map(string)) + description = "Environment variables" + default = [] +} + +variable "app_volumes" { + type = list(object({ + name = string + nfs_server = string + nfs_path = string + })) + description = "Pod volumes (NFS only)" + default = [] +} + +variable "app_volume_mounts" { + type = list(object({ + name = string + mount_path = string + read_only = bool + })) + description = "Container volume mounts" + default = [] +} + +variable "app_replicas" { + type = number + description = "Number of replicas" + default = 1 +} + +variable "app_service_type" { + type = string + description = "Kubernetes service type" + default = "ClusterIP" +} + +variable "app_cpu_request" { + type = string + description = "CPU request" + default = null +} + +variable "app_memory_request" { + type = string + description = "Memory request" + default = null +} + +variable "app_cpu_limit" { + type = string + description = "CPU limit" + default = null +} + +variable "app_memory_limit" { + type = string + description = "Memory limit" + default = null +} + +variable "app_enable_hpa" { + type = bool + description = "Enable HorizontalPodAutoscaler" + default = false +} + +variable "app_hpa_min_replicas" { + type = number + description = "HPA minimum replicas" + default = 1 +} + +variable "app_hpa_max_replicas" { + type = number + description = "HPA maximum replicas" + default = 5 +} + +variable "app_hpa_cpu_utilization" { + type = number + description = "Target CPU utilization percentage" + default = 80 +} + +variable 
"ingress_name" { + type = string + description = "Ingress name" + default = "" +} + +variable "ingress_namespace" { + type = string + description = "Kubernetes namespace" + default = "default" +} + +variable "ingress_static_ip_name" { + type = string + description = "Global static IP name for GCE ingress" + default = null +} + +variable "ingress_class_name" { + type = string + description = "Ingress class name" + default = "gce" +} + +variable "ingress_managed_certificate_domains" { + type = list(string) + description = "Domains for GKE ManagedCertificate" + default = [] +} + +variable "ingress_rules" { + type = list(object({ + host = string + service_name = string + service_port = number + })) + description = "Ingress host rules" + default = [] +} + +variable "storage_key" { + type = string + description = "Storage access key (mapped from AWS s3_key)" + default = "" + sensitive = true +} + +variable "storage_secret_key" { + type = string + description = "Storage secret key (mapped from AWS s3_secret_key)" + default = "" + sensitive = true +} + +variable "storage_bucket" { + type = string + description = "Storage bucket name (mapped from AWS s3_bucket)" + default = "" +} + +variable "db_username" { + type = string + description = "Database username" + default = "" + sensitive = true +} + +variable "db_password" { + type = string + description = "Database password" + default = "" + sensitive = true +} + +variable "db_schema" { + type = string + description = "Database schema name (compat with AWS envs/dev)" + default = "" +} + +variable "rails_master_key" { + type = string + description = "Rails master key (compat with AWS envs/dev)" + default = "" + sensitive = true +} + +variable "rails_env" { + type = string + description = "Rails env (compat with AWS envs/dev)" + default = "" +} + +variable "ui_env" { + type = string + description = "UI env (compat with AWS envs/dev)" + default = "" +} + +variable "app_suffix" { + type = string + description = "Application 
suffix (compat with AWS envs/dev)" + default = "" +} + +variable "iam_service_accounts" { + type = map(object({ + display_name = optional(string) + description = optional(string) + roles = optional(list(string), []) + wi_bindings = optional(list(object({ + namespace = string + k8s_service_account_name = string + })), []) + })) + description = "Service accounts to create and their IAM roles / Workload Identity bindings" + default = {} +} + +variable "enable_orion_worker" { + type = bool + default = false + description = "Enable Orion Worker deployment" +} + +variable "orion_worker_image" { + type = string + default = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" + description = "Orion Worker container image" +} + +variable "orion_worker_server_ws" { + type = string + default = "wss://orion.gitmono.com/ws" + description = "Orion server WebSocket URL" +} + +variable "orion_worker_scorpio_base_url" { + type = string + default = "https://git.gitmono.com" + description = "Scorpio base URL" +} + +variable "orion_worker_scorpio_lfs_url" { + type = string + default = "https://git.gitmono.com" + description = "Scorpio LFS URL" +} + +variable "orion_worker_rust_log" { + type = string + default = "info" + description = "Rust log level" +} + +variable "orion_worker_nodepool_name" { + type = string + default = "staging-default" + description = "Node pool name for Orion Worker scheduling" +} + +variable "orion_worker_cpu_request" { + type = string + default = "6" + description = "CPU request for Orion Worker" +} + +variable "orion_worker_memory_request" { + type = string + default = "24Gi" + description = "Memory request for Orion Worker" +} + +variable "orion_worker_cpu_limit" { + type = string + default = "8" + description = "CPU limit for Orion Worker" +} + +variable "orion_worker_memory_limit" { + type = string + default = "30Gi" + description = "Memory limit for Orion Worker" +} diff --git a/envs/gcp/staging/versions.tf b/envs/gcp/staging/versions.tf 
new file mode 100755 index 0000000..7352bd2 --- /dev/null +++ b/envs/gcp/staging/versions.tf @@ -0,0 +1,14 @@ +terraform { + required_version = ">= 1.5.0" + + required_providers { + google = { + source = "hashicorp/google" + version = ">= 5.0" + } + google-beta = { + source = "hashicorp/google-beta" + version = ">= 5.0" + } + } +} diff --git a/gcp/e2e/README.e2e.md b/gcp/e2e/README.e2e.md new file mode 100755 index 0000000..61c12e5 --- /dev/null +++ b/gcp/e2e/README.e2e.md @@ -0,0 +1,84 @@ +# Milestone D: E2E Validation (GKE / Orion Worker) + +This directory provides E2E validation jobs that run inside the cluster. + +Prerequisites: +- Orion Worker is deployed. + - Recommended: deploy via Terraform with `enable_orion_worker = true` in `deployment/envs/gcp/`. +- Worker configuration uses: + - `SERVER_WS=wss://orion.gitmono.com/ws` + - `SCORPIO_BASE_URL=https://git.gitmono.com` + - `SCORPIO_LFS_URL=https://git.gitmono.com` +- Cluster nodes have outbound internet access (NAT) and DNS works. + +## 0. Quick check: Orion Worker DaemonSet + +```bash +kubectl -n orion-worker get ds/orion-worker +kubectl -n orion-worker get pods -l app=orion-worker -o wide +``` + +Successful criteria: +- DaemonSet `DESIRED` equals `READY`. +- Pods are scheduled onto the expected build nodepool nodes. + +## 1. Connectivity validation (DNS / HTTPS / WS / Mono) + +File: `connectivity-check-job.yaml` + +Run: +```bash +kubectl apply -f deployment/gcp/e2e/connectivity-check-job.yaml +kubectl -n orion-worker wait --for=condition=complete job/orion-worker-connectivity-check --timeout=120s +kubectl -n orion-worker logs job/orion-worker-connectivity-check +``` + +Successful criteria: +- Logs contain `ALL_CHECKS_PASSED`. + +Failure criteria (common): +- DNS failure: `getent hosts` has no output / non-zero exit +- HTTPS failure: `curl` timeout or cert errors +- WS failure: TLS handshake failure / `/ws` endpoint unreachable + +## 2. 
Task execution validation (submit task -> worker executes -> status readback) + +File: `task-e2e-trigger-job.yaml` + +Notes: +- This job creates a task via Orion-server HTTP API `POST /task`. +- Then it polls task status via `GET /tasks/{cl}` until it returns `"status":"Completed"`. +- The repo/target values are environment-specific. You MUST update envs before applying: + - `ORION_API_BASE`: Orion-server base URL (default `https://orion.gitmono.com`) + - `ORION_TASK_CL`: CL number for query + - `ORION_TASK_CL_LINK`: any traceable link + - `ORION_REPO`: repo name as understood by the server + - `ORION_TARGET`: a small, known-existing target + - `ORION_POLL_SECONDS`: timeout seconds (default 300s) + +Run: +```bash +kubectl apply -f deployment/gcp/e2e/task-e2e-trigger-job.yaml +kubectl -n orion-worker logs -f job/orion-task-e2e-trigger +``` + +Successful criteria: +- Job logs contain `TASK_E2E_PASSED`. +- Worker logs show: + - connected to `/ws` + - received tasks + - build finished successfully + +Failure criteria: +- `TASK_E2E_FAILED`: build status becomes Failed/Interrupted +- timeout: worker did not pick up the task + +## 3. Suggested troubleshooting commands + +```bash +kubectl -n orion-worker get pods -o wide +kubectl -n orion-worker logs -l app=orion-worker --tail=200 +kubectl -n orion-worker describe ds/orion-worker +``` + +If Orion-server `/task` requires authentication (Authorization header), update `task-e2e-trigger-job.yaml` accordingly. 
diff --git a/gcp/e2e/connectivity-check-job.yaml b/gcp/e2e/connectivity-check-job.yaml new file mode 100755 index 0000000..ef53107 --- /dev/null +++ b/gcp/e2e/connectivity-check-job.yaml @@ -0,0 +1,59 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: orion-worker-connectivity-check + namespace: orion-worker +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: orion-worker-connectivity-check + spec: + restartPolicy: Never + + tolerations: + - key: "dedicated" + operator: "Equal" + value: "orion-build" + effect: "NoSchedule" + + + containers: + - name: check + image: curlimages/curl:8.6.0 + imagePullPolicy: IfNotPresent + command: + - /bin/sh + - -lc + - | + set -euo pipefail + echo "== DNS check ==" + getent hosts orion.gitmono.com + getent hosts git.gitmono.com + + echo "== HTTPS check (Mono) ==" + curl -fsSIL --max-time 10 https://git.gitmono.com + + echo "== HTTPS check (Orion host, via WS endpoint URL but over HTTPS) ==" + curl -fsSIL --max-time 10 https://orion.gitmono.com + + echo "== WebSocket endpoint reachability (TLS handshake + HTTP upgrade attempt) ==" + # curl can't fully validate WS app logic, but can at least reach the endpoint and attempt upgrade. + curl -vk --http1.1 --max-time 10 \ + -H "Connection: Upgrade" \ + -H "Upgrade: websocket" \ + -H "Sec-WebSocket-Key: dGhlIHNhbXBsZSBub25jZQ==" \ + -H "Sec-WebSocket-Version: 13" \ + https://orion.gitmono.com/ws \ + -o /dev/null + + echo "ALL_CHECKS_PASSED" + resources: + requests: + cpu: "100m" + memory: "128Mi" + limits: + cpu: "500m" + memory: "256Mi" + diff --git a/gcp/e2e/full-test-guide.md b/gcp/e2e/full-test-guide.md new file mode 100755 index 0000000..3cf8e7c --- /dev/null +++ b/gcp/e2e/full-test-guide.md @@ -0,0 +1,100 @@ +# Orion Worker (GCP) – End-to-End 验收流程 + +> 适用目录:`deployment/k8s/gcp/*` +> 目标:验证 GKE 上的 Orion Worker 已正确接入现有 AWS Orion-server/Mono(`gitmono.com`) 并能成功执行一次构建任务。 + +--- + +## Phase 0:前提检查 + +1. `kubectl config current-context` 指向目标 GKE 集群。 +2. 
已创建 NodePool `build-default`,节点具备出网能力(Cloud NAT 或 Public IP)。 +3. 如需私有镜像仓库,请先确保节点可拉取 `public.ecr.aws/.../mega:*` 镜像。 + +--- + +## Phase 1:连通性快速自检 + +```bash +# 1) 创建 namespace(如未创建) +kubectl apply -f deployment/k8s/gcp/orion-worker-namespace.yaml + +# 2) 运行连通性检查 Job +kubectl apply -f deployment/k8s/gcp/e2e/connectivity-check-job.yaml + +# 3) 查看日志 +kubectl logs -n orion-worker job/orion-worker-connectivity-check --tail=200 | cat +``` + +**期望输出** +- `getent hosts orion.gitmono.com` / `git.gitmono.com` 均返回解析结果。 +- `curl -I https://git.gitmono.com` / `https://orion.gitmono.com` 返回 2xx/3xx/4xx 均视为“可达”。 +- TLS handshake 与 `/ws` HTTP Upgrade 没有超时/证书错误。 + +--- + +## Phase 2:部署 Orion Worker DaemonSet + +```bash +kubectl apply -f deployment/k8s/gcp/orion-worker-serviceaccount.yaml +kubectl apply -f deployment/k8s/gcp/orion-worker-configmap.yaml +kubectl apply -f deployment/k8s/gcp/orion-worker-secret.yaml # 如有敏感信息可跳过 +kubectl apply -f deployment/k8s/gcp/orion-worker-daemonset.yaml + +# 查看 DaemonSet / Pod 状态 +kubectl get ds -n orion-worker -o wide +kubectl get pods -n orion-worker -o wide +``` + +**期望** +- `DESIRED/CURRENT/READY` 数量与构建节点数一致。 +- Pod 状态为 `Running`。若 `Pending/CrashLoopBackOff`,请 `kubectl describe pod` 调试。 + +### 2.1 关键依赖验收 + +```bash +POD=$(kubectl get pods -n orion-worker -o jsonpath='{.items[0].metadata.name}') + +# WebSocket 连接是否成功 +kubectl logs -n orion-worker $POD --tail=200 | cat + +# FUSE 设备是否存在 +kubectl exec -n orion-worker $POD -- ls -l /dev/fuse +``` + +日志应出现 `connected to wss://orion.gitmono.com/ws` 及心跳/idle 信息。 + +--- + +## Phase 3:触发一次 E2E 构建任务 + +仓库已提供触发 Job,并默认指向 `orion.gitmono.com`。 + +```bash +kubectl apply -f deployment/k8s/gcp/e2e/task-e2e-trigger-job.yaml +kubectl logs -n orion-worker job/orion-task-e2e-trigger --tail=200 | cat +``` + +随后在 Worker Pod 内实时查看: + +```bash +kubectl logs -n orion-worker $POD -f | cat +``` + +**期望** +- Worker 收到任务(`starting job …`)。 +- `scorpio mount` 成功,`buck2` 执行无错误。 +- 任务结果 `success` / exit 0,Orion-server 侧状态变更为 
`success`。 + +--- + +## Phase 4:收集验收证据 + +```bash +kubectl get nodes +kubectl get ds,pods -n orion-worker -o wide +kubectl logs -n orion-worker job/orion-worker-connectivity-check | tail -20 +kubectl logs -n orion-worker $POD --since=30m | grep -E "connected|success" +``` + +将以上输出保存,作为本次 GCP Worker 部署 E2E 成功的佐证。 diff --git a/gcp/e2e/minimal-validation.sh b/gcp/e2e/minimal-validation.sh new file mode 100755 index 0000000..9907d3c --- /dev/null +++ b/gcp/e2e/minimal-validation.sh @@ -0,0 +1,121 @@ +#!/bin/bash +set -euo pipefail + +# Minimal validation script for GCP deployment +# Usage: ./minimal-validation.sh +# Example: ./minimal-validation.sh dev + +ENV=${1:-dev} +PROJECT_ID=$(terraform output -raw project_id 2>/dev/null || echo "") +REGION="us-central1" + +echo "=== Minimal validation for environment: $ENV ===" + +if [[ -z "$PROJECT_ID" ]]; then + echo "ERROR: Cannot read project_id from terraform output. Run from envs/gcp/$ENV directory." + exit 1 +fi + +# 1) Check GKE cluster exists and is running +echo "1. Checking GKE cluster..." +CLUSTER_NAME="mega-gke" +if [[ "$ENV" == "staging" ]]; then + CLUSTER_NAME="mega-staging" +elif [[ "$ENV" == "prod" ]]; then + CLUSTER_NAME="mega-prod" +fi + +gcloud container clusters describe "$CLUSTER_NAME" --region "$REGION" --project "$PROJECT_ID" --format="value(status)" > /dev/null +echo "✅ GKE cluster $CLUSTER_NAME exists" + +# 2) Get credentials and check node pools +echo "2. Getting credentials and checking node pools..." +gcloud container clusters get-credentials "$CLUSTER_NAME" --region "$REGION" --project "$PROJECT_ID" + +NODE_COUNT=$(kubectl get nodes --no-headers | wc -l) +if [[ "$NODE_COUNT" -eq 0 ]]; then + echo "❌ No nodes found in cluster" + exit 1 +fi +echo "✅ Found $NODE_COUNT nodes" + +# 3) Check Cloud SQL (if enabled) +echo "3. Checking Cloud SQL (if enabled)..." 
+SQL_INSTANCE_NAME="mega-gke-db" +if [[ "$ENV" == "staging" ]]; then + SQL_INSTANCE_NAME="mega-staging-db" +elif [[ "$ENV" == "prod" ]]; then + SQL_INSTANCE_NAME="mega-prod-db" +fi + +if gcloud sql instances describe "$SQL_INSTANCE_NAME" --project "$PROJECT_ID" --format="value(state)" > /dev/null 2>&1; then + SQL_STATE=$(gcloud sql instances describe "$SQL_INSTANCE_NAME" --project "$PROJECT_ID" --format="value(state)") + echo "✅ Cloud SQL instance $SQL_INSTANCE_NAME state: $SQL_STATE" +else + echo "ℹ️ Cloud SQL instance $SQL_INSTANCE_NAME not found (may be disabled)" +fi + +# 4) Check Redis (if enabled) +echo "4. Checking Redis (if enabled)..." +REDIS_INSTANCE_NAME="mega-gke-redis" +if [[ "$ENV" == "staging" ]]; then + REDIS_INSTANCE_NAME="mega-staging-redis" +elif [[ "$ENV" == "prod" ]]; then + REDIS_INSTANCE_NAME="mega-prod-redis" +fi + +if gcloud redis instances describe "$REDIS_INSTANCE_NAME" --region "$REGION" --project "$PROJECT_ID" --format="value(state)" > /dev/null 2>&1; then + REDIS_STATE=$(gcloud redis instances describe "$REDIS_INSTANCE_NAME" --region "$REGION" --project "$PROJECT_ID" --format="value(state)") + echo "✅ Redis instance $REDIS_INSTANCE_NAME state: $REDIS_STATE" +else + echo "ℹ️ Redis instance $REDIS_INSTANCE_NAME not found (may be disabled)" +fi + +# 5) Check GCS bucket (if enabled) +echo "5. Checking GCS bucket (if enabled)..." +BUCKET_NAME="mega-gke-storage" +if [[ "$ENV" == "staging" ]]; then + BUCKET_NAME="mega-staging-storage" +elif [[ "$ENV" == "prod" ]]; then + BUCKET_NAME="mega-prod-storage" +fi + +if gsutil ls "gs://$BUCKET_NAME" > /dev/null 2>&1; then + echo "✅ GCS bucket $BUCKET_NAME exists" +else + echo "ℹ️ GCS bucket $BUCKET_NAME not found (may be disabled)" +fi + +# 6) Check Artifact Registry (if enabled) +echo "6. Checking Artifact Registry (if enabled)..." 
+REPO_NAME="orion-worker" +if [[ "$ENV" == "staging" ]]; then + REPO_NAME="orion-worker-staging" +elif [[ "$ENV" == "prod" ]]; then + REPO_NAME="orion-worker-prod" +fi + +if gcloud artifacts repositories describe "$REPO_NAME" --location "$REGION" --project "$PROJECT_ID" > /dev/null 2>&1; then + echo "✅ Artifact Registry repository $REPO_NAME exists" +else + echo "ℹ️ Artifact Registry repository $REPO_NAME not found (may be disabled)" +fi + +# 7) Check Logging/Monitoring APIs enabled +echo "7. Checking Logging/Monitoring APIs..." +LOGGING_ENABLED=$(gcloud services list --enabled --project "$PROJECT_ID" | grep "logging.googleapis.com" || echo "") +MONITORING_ENABLED=$(gcloud services list --enabled --project "$PROJECT_ID" | grep "monitoring.googleapis.com" || echo "") + +if [[ -n "$LOGGING_ENABLED" ]]; then + echo "✅ Cloud Logging API enabled" +else + echo "❌ Cloud Logging API not enabled" +fi + +if [[ -n "$MONITORING_ENABLED" ]]; then + echo "✅ Cloud Monitoring API enabled" +else + echo "❌ Cloud Monitoring API not enabled" +fi + +echo "=== Minimal validation completed for $ENV ===" diff --git a/gcp/e2e/task-e2e-trigger-job.yaml b/gcp/e2e/task-e2e-trigger-job.yaml new file mode 100755 index 0000000..63f4849 --- /dev/null +++ b/gcp/e2e/task-e2e-trigger-job.yaml @@ -0,0 +1,101 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: orion-task-e2e-trigger + namespace: orion-worker +spec: + backoffLimit: 0 + template: + metadata: + labels: + app: orion-task-e2e-trigger + spec: + restartPolicy: Never + tolerations: + - key: dedicated + operator: Equal + value: orion-build + effect: NoSchedule + containers: + - name: trigger + image: curlimages/curl:8.6.0 + env: + - name: ORION_API_BASE + value: "https://orion.gitmono.com" + - name: ORION_TASK_CL + value: "18410001" + - name: ORION_TASK_CL_LINK + value: "https://example.invalid/cl/18410001" + - name: ORION_REPO + value: "mega" + - name: ORION_TARGET + value: "//:noop" + - name: ORION_POLL_SECONDS + value: "300" + command: + 
- /bin/sh + - -c + - |- + set -ex + echo "== Create task ==" + payload=$(cat <<'JSON' + { + "repo": "REPO_PLACEHOLDER", + "cl_link": "CL_LINK_PLACEHOLDER", + "cl": CL_PLACEHOLDER, + "builds": [ + { "target": "TARGET_PLACEHOLDER", "args": null, "changes": [] } + ] + } + JSON + ) + payload=$(echo "$payload" | sed -e "s/REPO_PLACEHOLDER/${ORION_REPO}/g" \ + -e "s|CL_LINK_PLACEHOLDER|${ORION_TASK_CL_LINK}|g" \ + -e "s/CL_PLACEHOLDER/${ORION_TASK_CL}/g" \ + -e "s|TARGET_PLACEHOLDER|${ORION_TARGET}|g") + + resp=$(curl -fsS -X POST "${ORION_API_BASE}/task" -H 'content-type: application/json' --data "$payload") + echo "$resp" + + task_id=$(echo "$resp" | sed -n 's/.*"task_id"[[:space:]]*:[[:space:]]*"\([^\"]*\)".*/\1/p') + [ -z "$task_id" ] && { echo "parse task_id fail"; exit 2; } + echo "task_id=$task_id" + + deadline=$(( $(date +%s) + ORION_POLL_SECONDS )) + while :; do + now=$(date +%s) + [ "$now" -ge "$deadline" ] && { echo "timeout" >&2; exit 3; } + + all_tasks=$(curl -fsS "${ORION_API_BASE}/tasks/${ORION_TASK_CL}") + + # Use sed + grep to reliably extract the JSON object for the current task_id + current_task=$(echo "$all_tasks" | sed 's/},{/}\n{/g' | grep "\"task_id\":\"$task_id\"" || true) + + if [ -z "$current_task" ]; then + echo "poll: task_not_visible_yet" + sleep 3 + continue + fi + + completed=$(echo "$current_task" | grep -c '"status":"Completed"' || true) + building=$(echo "$current_task" | grep -c '"status":"Building"' || true) + failed=$(echo "$current_task" | grep -c '"status":"Failed"' || true) + interrupted=$(echo "$current_task" | grep -c '"status":"Interrupted"' || true) + + echo "poll(current_task): completed=$completed building=$building failed=$failed interrupted=$interrupted" + [ "$completed" -gt 0 ] && { echo "TASK_E2E_PASSED"; exit 0; } + { [ "$failed" -gt 0 ] || [ "$interrupted" -gt 0 ]; } && { + echo "TASK_E2E_FAILED" >&2 + echo "$current_task" | cat >&2 + exit 4 + } + + sleep 5 + done + resources: + requests: + cpu: "100m" + memory: "128Mi" 
+ limits: + cpu: "500m" + memory: "256Mi" diff --git a/gcp/orion-worker-configmap.yaml b/gcp/orion-worker-configmap.yaml new file mode 100755 index 0000000..a71cd05 --- /dev/null +++ b/gcp/orion-worker-configmap.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: orion-worker-config + namespace: orion-worker +data: + SERVER_WS: "wss://orion.gitmono.com/ws" + SCORPIO_BASE_URL: "https://git.gitmono.com" + SCORPIO_LFS_URL: "https://git.gitmono.com" + RUST_LOG: "info" + diff --git a/gcp/orion-worker-daemonset.yaml b/gcp/orion-worker-daemonset.yaml new file mode 100755 index 0000000..ec3dc3b --- /dev/null +++ b/gcp/orion-worker-daemonset.yaml @@ -0,0 +1,148 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: orion-worker + namespace: orion-worker + labels: + app: orion-worker +spec: + selector: + matchLabels: + app: orion-worker + template: + metadata: + labels: + app: orion-worker + spec: + serviceAccountName: orion-worker-sa + terminationGracePeriodSeconds: 300 + + tolerations: + - key: "dedicated" + operator: "Equal" + value: "orion-build" + effect: "NoSchedule" + + nodeSelector: + nodepool: build-default + + containers: + - name: orion-worker + image: public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64 + # Start scorpio first, print its startup logs, then exec orion + command: ["/bin/sh", "-c"] + args: + - | + set -e + # Force localhost to resolve to IPv4 to avoid IPv6 connection issues + echo "127.0.0.1 localhost" >> /etc/hosts + + echo "Generating scorpio config via envsubst..." + envsubst < /etc/scorpio/scorpio.toml.template > /tmp/scorpio.toml + + echo "Attempting to start scorpio in background..." 
+ # Run scorpio with config, listening on default 0.0.0.0:2725 + /app/bin/scorpio --config-path /tmp/scorpio.toml > /tmp/scorpio.log 2>&1 & + sleep 2 + echo "--- Scorpio Log Start ---" + cat /tmp/scorpio.log || true + echo "--- Scorpio Log End ---" + exec orion + + envFrom: + - configMapRef: + name: orion-worker-config + - secretRef: + name: orion-worker-secret + optional: true + + env: + - name: ORION_WORKER_START_SCORPIO + value: "true" + - name: SCORPIO_STORE_PATH + value: "/data/scorpio/store" + - name: SCORPIO_WORKSPACE + value: "/workspace/mount" + - name: BUCK_PROJECT_ROOT + value: "/workspace" + - name: BUILD_TMP + value: "/tmp/orion-builds" + + - name: SCORPIO_GIT_AUTHOR + value: "orion" + - name: SCORPIO_GIT_EMAIL + value: "orion@local" + + - name: SCORPIO_DICFUSE_READABLE + value: "true" + - name: SCORPIO_LOAD_DIR_DEPTH + value: "2" + - name: SCORPIO_FETCH_FILE_THREAD + value: "8" + - name: SCORPIO_DICFUSE_IMPORT_CONCURRENCY + value: "8" + - name: SCORPIO_DICFUSE_DIR_SYNC_TTL_SECS + value: "60" + - name: SCORPIO_DICFUSE_STAT_MODE + value: "fast" + - name: SCORPIO_DICFUSE_OPEN_BUFF_MAX_BYTES + value: "134217728" + - name: SCORPIO_DICFUSE_OPEN_BUFF_MAX_FILES + value: "2048" + + - name: ANTARES_LOAD_DIR_DEPTH + value: "2" + - name: ANTARES_DICFUSE_STAT_MODE + value: "fast" + - name: ANTARES_DICFUSE_OPEN_BUFF_MAX_BYTES + value: "134217728" + - name: ANTARES_DICFUSE_OPEN_BUFF_MAX_FILES + value: "2048" + - name: ANTARES_DICFUSE_DIR_SYNC_TTL_SECS + value: "60" + - name: ANTARES_UPPER_ROOT + value: "/data/scorpio/antares/upper" + - name: ANTARES_CL_ROOT + value: "/data/scorpio/antares/cl" + - name: ANTARES_MOUNT_ROOT + value: "/workspace/mount" + - name: ANTARES_STATE_FILE + value: "/data/scorpio/antares/state.json" + + securityContext: + privileged: true + + resources: + requests: + cpu: "6" + memory: "24Gi" + limits: + cpu: "8" + memory: "30Gi" + + lifecycle: + preStop: + exec: + command: ["/bin/sh", "-lc", "sleep 10"] + + volumeMounts: + - name: 
orion-data-cache + mountPath: /data + - name: orion-workspace-cache + mountPath: /workspace + - name: scorpio-config + mountPath: /etc/scorpio/scorpio.toml.template + subPath: scorpio.toml.template + + volumes: + - name: orion-data-cache + hostPath: + path: /var/lib/orion/data + type: DirectoryOrCreate + - name: orion-workspace-cache + hostPath: + path: /var/lib/orion/workspace + type: DirectoryOrCreate + - name: scorpio-config + configMap: + name: scorpio-config diff --git a/gcp/orion-worker-namespace.yaml b/gcp/orion-worker-namespace.yaml new file mode 100755 index 0000000..3107744 --- /dev/null +++ b/gcp/orion-worker-namespace.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: orion-worker + diff --git a/gcp/orion-worker-secret.yaml b/gcp/orion-worker-secret.yaml new file mode 100755 index 0000000..2713152 --- /dev/null +++ b/gcp/orion-worker-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +kind: Secret +metadata: + name: orion-worker-secret + namespace: orion-worker +type: Opaque +data: {} + diff --git a/gcp/orion-worker-serviceaccount.yaml b/gcp/orion-worker-serviceaccount.yaml new file mode 100755 index 0000000..12bd3e7 --- /dev/null +++ b/gcp/orion-worker-serviceaccount.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: orion-worker-sa + namespace: orion-worker + diff --git a/gcp/scorpio-configmap.yaml b/gcp/scorpio-configmap.yaml new file mode 100755 index 0000000..2640379 --- /dev/null +++ b/gcp/scorpio-configmap.yaml @@ -0,0 +1,48 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: scorpio-config + namespace: orion-worker +data: + scorpio.toml.template: | + # ============================================================================= + # Scorpio Configuration Template + # ============================================================================= + # This template uses simple environment variable placeholders like `${VAR_NAME}`. 
+ # The entrypoint script sets defaults and then substitutes the variables via `envsubst`. + # ============================================================================= + + # Mega/Mono service URLs + base_url = "${SCORPIO_BASE_URL}" + lfs_url = "${SCORPIO_LFS_URL}" + + # Storage paths + store_path = "${SCORPIO_STORE_PATH}" + workspace = "${SCORPIO_WORKSPACE}" + config_file = "config.toml" + + # Git author configuration + git_author = "${SCORPIO_GIT_AUTHOR}" + git_email = "${SCORPIO_GIT_EMAIL}" + + # DicFuse (dictionary-based FUSE) settings + dicfuse_readable = "${SCORPIO_DICFUSE_READABLE}" + load_dir_depth = "${SCORPIO_LOAD_DIR_DEPTH}" + fetch_file_thread = "${SCORPIO_FETCH_FILE_THREAD}" + dicfuse_import_concurrency = "${SCORPIO_DICFUSE_IMPORT_CONCURRENCY}" + dicfuse_dir_sync_ttl_secs = "${SCORPIO_DICFUSE_DIR_SYNC_TTL_SECS}" + dicfuse_stat_mode = "${SCORPIO_DICFUSE_STAT_MODE}" + dicfuse_open_buff_max_bytes = "${SCORPIO_DICFUSE_OPEN_BUFF_MAX_BYTES}" + dicfuse_open_buff_max_files = "${SCORPIO_DICFUSE_OPEN_BUFF_MAX_FILES}" + + # Antares (overlay filesystem) settings + antares_load_dir_depth = "${ANTARES_LOAD_DIR_DEPTH}" + antares_dicfuse_stat_mode = "${ANTARES_DICFUSE_STAT_MODE}" + antares_dicfuse_open_buff_max_bytes = "${ANTARES_DICFUSE_OPEN_BUFF_MAX_BYTES}" + antares_dicfuse_open_buff_max_files = "${ANTARES_DICFUSE_OPEN_BUFF_MAX_FILES}" + antares_dicfuse_dir_sync_ttl_secs = "${ANTARES_DICFUSE_DIR_SYNC_TTL_SECS}" + antares_upper_root = "${ANTARES_UPPER_ROOT}" + antares_cl_root = "${ANTARES_CL_ROOT}" + antares_mount_root = "${ANTARES_MOUNT_ROOT}" + antares_state_file = "${ANTARES_STATE_FILE}" + diff --git a/modules/gcp/artifact_registry/main.tf b/modules/gcp/artifact_registry/main.tf new file mode 100755 index 0000000..2b76452 --- /dev/null +++ b/modules/gcp/artifact_registry/main.tf @@ -0,0 +1,6 @@ +resource "google_artifact_registry_repository" "this" { + location = var.location + repository_id = var.repo_name + format = "DOCKER" +} + diff --git 
a/modules/gcp/artifact_registry/outputs.tf b/modules/gcp/artifact_registry/outputs.tf new file mode 100755 index 0000000..e6fd709 --- /dev/null +++ b/modules/gcp/artifact_registry/outputs.tf @@ -0,0 +1,8 @@ +output "repository" { + value = google_artifact_registry_repository.this.id +} + +output "repository_url" { + value = "${var.location}-docker.pkg.dev/${google_artifact_registry_repository.this.project}/${google_artifact_registry_repository.this.repository_id}" +} + diff --git a/modules/gcp/artifact_registry/variables.tf b/modules/gcp/artifact_registry/variables.tf new file mode 100755 index 0000000..81a40c4 --- /dev/null +++ b/modules/gcp/artifact_registry/variables.tf @@ -0,0 +1,8 @@ +variable "location" { + type = string +} + +variable "repo_name" { + type = string +} + diff --git a/modules/gcp/cloud_sql/main.tf b/modules/gcp/cloud_sql/main.tf new file mode 100755 index 0000000..42cf81c --- /dev/null +++ b/modules/gcp/cloud_sql/main.tf @@ -0,0 +1,52 @@ +resource "google_compute_global_address" "private_service_range" { + count = var.enable_private_service_connection ? 1 : 0 + name = "${var.name}-private-range" + purpose = "VPC_PEERING" + address_type = "INTERNAL" + prefix_length = var.private_ip_prefix_length + network = var.private_network +} + +resource "google_service_networking_connection" "private_vpc_connection" { + count = var.enable_private_service_connection ? 
1 : 0 + network = var.private_network + service = "servicenetworking.googleapis.com" + reserved_peering_ranges = [google_compute_global_address.private_service_range[0].name] +} + +resource "google_sql_database_instance" "this" { + name = var.name + database_version = var.database_version + region = var.region + deletion_protection = var.deletion_protection + + settings { + tier = var.tier + availability_type = var.availability_type + disk_size = var.disk_size + disk_type = var.disk_type + + backup_configuration { + enabled = var.backup_enabled + } + + ip_configuration { + ipv4_enabled = var.enable_public_ip + private_network = var.private_network + } + } + + depends_on = [google_service_networking_connection.private_vpc_connection] +} + +resource "google_sql_database" "db" { + name = var.db_name + instance = google_sql_database_instance.this.name +} + +resource "google_sql_user" "user" { + name = var.db_username + instance = google_sql_database_instance.this.name + password = var.db_password +} + diff --git a/modules/gcp/cloud_sql/outputs.tf b/modules/gcp/cloud_sql/outputs.tf new file mode 100755 index 0000000..168c773 --- /dev/null +++ b/modules/gcp/cloud_sql/outputs.tf @@ -0,0 +1,12 @@ +output "db_endpoint" { + value = google_sql_database_instance.this.private_ip_address +} + +output "connection_name" { + value = google_sql_database_instance.this.connection_name +} + +output "db_name" { + value = google_sql_database.db.name +} + diff --git a/modules/gcp/cloud_sql/variables.tf b/modules/gcp/cloud_sql/variables.tf new file mode 100755 index 0000000..daedfd6 --- /dev/null +++ b/modules/gcp/cloud_sql/variables.tf @@ -0,0 +1,89 @@ +variable "name" { + type = string + description = "Cloud SQL instance name" +} + +variable "database_version" { + type = string + description = "Cloud SQL database version (e.g. 
POSTGRES_17, MYSQL_8_0)" +} + +variable "region" { + type = string + description = "GCP region" +} + +variable "tier" { + type = string + description = "Instance tier" +} + +variable "disk_size" { + type = number + default = 20 + description = "Disk size in GB" +} + +variable "disk_type" { + type = string + default = "PD_SSD" + description = "Disk type" +} + +variable "availability_type" { + type = string + default = "ZONAL" + description = "Availability type (ZONAL or REGIONAL)" +} + +variable "private_network" { + type = string + description = "VPC self link" +} + +variable "private_ip_prefix_length" { + type = number + default = 16 + description = "Prefix length for private services range" +} + +variable "enable_private_service_connection" { + type = bool + default = true + description = "Create private service networking connection" +} + +variable "enable_public_ip" { + type = bool + default = false + description = "Enable public IPv4" +} + +variable "db_name" { + type = string + description = "Default database name" +} + +variable "db_username" { + type = string + description = "Database username" +} + +variable "db_password" { + type = string + description = "Database password" + sensitive = true +} + +variable "backup_enabled" { + type = bool + default = true + description = "Enable automated backups" +} + +variable "deletion_protection" { + type = bool + default = false + description = "Enable deletion protection" +} + diff --git a/modules/gcp/filestore/main.tf b/modules/gcp/filestore/main.tf new file mode 100755 index 0000000..77af941 --- /dev/null +++ b/modules/gcp/filestore/main.tf @@ -0,0 +1,17 @@ +resource "google_filestore_instance" "this" { + name = var.name + location = var.location + tier = var.tier + + file_shares { + name = var.file_share_name + capacity_gb = var.capacity_gb + } + + networks { + network = var.network + modes = ["MODE_IPV4"] + reserved_ip_range = var.reserved_ip_range + } +} + diff --git a/modules/gcp/filestore/outputs.tf 
b/modules/gcp/filestore/outputs.tf new file mode 100755 index 0000000..b331021 --- /dev/null +++ b/modules/gcp/filestore/outputs.tf @@ -0,0 +1,11 @@ +output "instance_name" { + value = google_filestore_instance.this.name +} + +output "file_share_name" { + value = google_filestore_instance.this.file_shares[0].name +} + +output "ip_address" { + value = google_filestore_instance.this.networks[0].ip_addresses[0] +} diff --git a/modules/gcp/filestore/variables.tf b/modules/gcp/filestore/variables.tf new file mode 100755 index 0000000..85b0fb8 --- /dev/null +++ b/modules/gcp/filestore/variables.tf @@ -0,0 +1,38 @@ +variable "name" { + type = string + description = "Filestore instance name" +} + +variable "location" { + type = string + description = "Filestore zone (e.g. us-central1-b)" +} + +variable "network" { + type = string + description = "VPC self link" +} + +variable "tier" { + type = string + default = "STANDARD" + description = "Filestore tier" +} + +variable "capacity_gb" { + type = number + default = 1024 + description = "Capacity in GB" +} + +variable "file_share_name" { + type = string + default = "share1" + description = "File share name" +} + +variable "reserved_ip_range" { + type = string + default = null + description = "Optional reserved IP range (e.g. 
10.0.20.0/29)" +} diff --git a/modules/gcp/gcs/main.tf b/modules/gcp/gcs/main.tf new file mode 100755 index 0000000..a0293c3 --- /dev/null +++ b/modules/gcp/gcs/main.tf @@ -0,0 +1,7 @@ +resource "google_storage_bucket" "this" { + name = var.name + location = var.location + force_destroy = var.force_destroy + uniform_bucket_level_access = var.uniform_bucket_level_access +} + diff --git a/modules/gcp/gcs/outputs.tf b/modules/gcp/gcs/outputs.tf new file mode 100755 index 0000000..12be643 --- /dev/null +++ b/modules/gcp/gcs/outputs.tf @@ -0,0 +1,8 @@ +output "bucket_name" { + value = google_storage_bucket.this.name +} + +output "bucket_url" { + value = google_storage_bucket.this.url +} + diff --git a/modules/gcp/gcs/variables.tf b/modules/gcp/gcs/variables.tf new file mode 100755 index 0000000..e8bf568 --- /dev/null +++ b/modules/gcp/gcs/variables.tf @@ -0,0 +1,22 @@ +variable "name" { + type = string + description = "Bucket name" +} + +variable "location" { + type = string + description = "Bucket location" +} + +variable "force_destroy" { + type = bool + default = false + description = "Allow force deletion of bucket objects" +} + +variable "uniform_bucket_level_access" { + type = bool + default = true + description = "Enable uniform bucket-level access" +} + diff --git a/modules/gcp/gke/main.tf b/modules/gcp/gke/main.tf new file mode 100755 index 0000000..6227b88 --- /dev/null +++ b/modules/gcp/gke/main.tf @@ -0,0 +1,36 @@ +resource "google_container_cluster" "this" { + name = var.cluster_name + location = var.region + + network = var.network_self_link + subnetwork = var.subnetwork_self_link + + remove_default_node_pool = true + initial_node_count = 1 + + ip_allocation_policy { + cluster_secondary_range_name = var.ip_range_pods_name + services_secondary_range_name = var.ip_range_services_name + } + + dynamic "workload_identity_config" { + for_each = var.enable_workload_identity ? 
[1] : [] + content { + workload_pool = coalesce(var.workload_pool, "${var.project_id}.svc.id.goog") + } + } + + release_channel { + channel = var.release_channel + } + + private_cluster_config { + enable_private_nodes = var.enable_private_nodes + enable_private_endpoint = var.enable_private_endpoint + master_ipv4_cidr_block = var.master_ipv4_cidr_block + } + + logging_service = var.logging_service + monitoring_service = var.monitoring_service +} + diff --git a/modules/gcp/gke/nodepool/main.tf b/modules/gcp/gke/nodepool/main.tf new file mode 100755 index 0000000..240488a --- /dev/null +++ b/modules/gcp/gke/nodepool/main.tf @@ -0,0 +1,60 @@ +resource "google_service_account" "this" { + count = var.create_service_account ? 1 : 0 + + account_id = coalesce(var.service_account_id, "${var.name}-node-sa") + display_name = "Node pool service account for ${var.name}" +} + +resource "google_container_node_pool" "this" { + name = var.name + location = var.region + cluster = var.cluster_name + + initial_node_count = var.min_count > 0 ? var.min_count : 1 + + dynamic "autoscaling" { + for_each = var.enable_autoscaling ? [1] : [] + content { + min_node_count = var.min_count + max_node_count = var.max_count + } + } + + management { + auto_repair = true + auto_upgrade = true + } + + node_config { + machine_type = var.machine_type + disk_size_gb = var.disk_size_gb + service_account = var.service_account != null ? var.service_account : (var.create_service_account ? 
google_service_account.this[0].email : null) + tags = var.tags + + labels = var.labels + + dynamic "taint" { + for_each = var.taints + content { + key = taint.value.key + value = taint.value.value + effect = taint.value.effect + } + } + + oauth_scopes = [ + "https://www.googleapis.com/auth/cloud-platform" + ] + + metadata = { + disable-legacy-endpoints = "true" + } + } + + lifecycle { + ignore_changes = [ + initial_node_count + ] + } +} + diff --git a/modules/gcp/gke/nodepool/outputs.tf b/modules/gcp/gke/nodepool/outputs.tf new file mode 100755 index 0000000..81b90c3 --- /dev/null +++ b/modules/gcp/gke/nodepool/outputs.tf @@ -0,0 +1,8 @@ +output "name" { + value = google_container_node_pool.this.name +} + +output "service_account" { + value = var.service_account != null ? var.service_account : (var.create_service_account ? google_service_account.this[0].email : null) +} + diff --git a/modules/gcp/gke/nodepool/variables.tf b/modules/gcp/gke/nodepool/variables.tf new file mode 100755 index 0000000..27e172c --- /dev/null +++ b/modules/gcp/gke/nodepool/variables.tf @@ -0,0 +1,71 @@ +variable "project_id" { + type = string +} + +variable "region" { + type = string +} + +variable "cluster_name" { + type = string +} + +variable "name" { + type = string +} + +variable "machine_type" { + type = string +} + +variable "disk_size_gb" { + type = number +} + +variable "min_count" { + type = number +} + +variable "max_count" { + type = number +} + +variable "service_account" { + type = string + default = null +} + +variable "create_service_account" { + type = bool + default = false +} + +variable "service_account_id" { + type = string + default = null +} + +variable "tags" { + type = list(string) + default = [] +} + +variable "enable_autoscaling" { + type = bool + default = true +} + +variable "labels" { + type = map(string) + default = {} +} + +variable "taints" { + type = list(object({ + key = string + value = string + effect = string + })) + default = [] +} + diff --git 
a/modules/gcp/gke/outputs.tf b/modules/gcp/gke/outputs.tf new file mode 100755 index 0000000..8b00aa1 --- /dev/null +++ b/modules/gcp/gke/outputs.tf @@ -0,0 +1,24 @@ +output "cluster_name" { + value = google_container_cluster.this.name +} + +output "location" { + value = google_container_cluster.this.location +} + +output "endpoint" { + value = google_container_cluster.this.endpoint +} + +output "master_auth" { + value = google_container_cluster.this.master_auth +} + +output "network" { + value = google_container_cluster.this.network +} + +output "subnetwork" { + value = google_container_cluster.this.subnetwork +} + diff --git a/modules/gcp/gke/variables.tf b/modules/gcp/gke/variables.tf new file mode 100755 index 0000000..4ade1de --- /dev/null +++ b/modules/gcp/gke/variables.tf @@ -0,0 +1,70 @@ +variable "project_id" { + type = string +} + +variable "region" { + type = string +} + +variable "cluster_name" { + type = string +} + +variable "network_self_link" { + type = string +} + +variable "subnetwork_self_link" { + type = string +} + +variable "ip_range_pods_name" { + type = string +} + +variable "ip_range_services_name" { + type = string +} + +variable "enable_workload_identity" { + type = bool + default = true +} + +variable "workload_pool" { + type = string + default = null +} + +variable "release_channel" { + type = string + default = "REGULAR" +} + +variable "enable_private_nodes" { + type = bool + default = false +} + +variable "enable_private_endpoint" { + type = bool + default = false +} + +variable "master_ipv4_cidr_block" { + type = string + default = null +} + +variable "logging_service" { + type = string + default = "logging.googleapis.com/kubernetes" + description = "Logging service for GKE cluster" +} + +variable "monitoring_service" { + type = string + default = "monitoring.googleapis.com/kubernetes" + description = "Monitoring service for GKE cluster" +} + diff --git a/modules/gcp/gke_service/main.tf b/modules/gcp/gke_service/main.tf new file mode 
100755 index 0000000..7bca45c --- /dev/null +++ b/modules/gcp/gke_service/main.tf @@ -0,0 +1,127 @@ +locals { + labels = { + app = var.name + } + limits = { for k, v in { cpu = var.cpu_limit, memory = var.memory_limit } : k => v if v != null } + requests = { + for k, v in { cpu = var.cpu_request, memory = var.memory_request } : k => v if v != null + } +} + +resource "kubernetes_deployment_v1" "this" { + metadata { + name = var.name + namespace = var.namespace + labels = local.labels + } + + spec { + replicas = var.replicas + + selector { + match_labels = local.labels + } + + template { + metadata { + labels = local.labels + } + + spec { + service_account_name = var.service_account_name + + dynamic "volume" { + for_each = var.volumes + content { + name = volume.value.name + nfs { + server = volume.value.nfs_server + path = volume.value.nfs_path + } + } + } + + container { + name = var.name + image = var.image + + port { + container_port = var.container_port + } + + dynamic "env" { + for_each = var.env + content { + name = env.value.name + value = env.value.value + } + } + + dynamic "volume_mount" { + for_each = var.volume_mounts + content { + name = volume_mount.value.name + mount_path = volume_mount.value.mount_path + read_only = volume_mount.value.read_only + } + } + + resources { + limits = local.limits + requests = local.requests + } + } + } + } + } +} + +resource "kubernetes_service_v1" "this" { + metadata { + name = var.name + namespace = var.namespace + labels = local.labels + } + + spec { + selector = local.labels + type = var.service_type + + port { + port = var.container_port + target_port = var.container_port + protocol = "TCP" + } + } +} + +resource "kubernetes_horizontal_pod_autoscaler_v2" "this" { + count = var.enable_hpa ? 
1 : 0 + + metadata { + name = "${var.name}-hpa" + namespace = var.namespace + } + + spec { + min_replicas = var.hpa_min_replicas + max_replicas = var.hpa_max_replicas + + scale_target_ref { + api_version = "apps/v1" + kind = "Deployment" + name = kubernetes_deployment_v1.this.metadata[0].name + } + + metric { + type = "Resource" + resource { + name = "cpu" + target { + type = "Utilization" + average_utilization = var.hpa_cpu_utilization + } + } + } + } +} diff --git a/modules/gcp/gke_service/outputs.tf b/modules/gcp/gke_service/outputs.tf new file mode 100755 index 0000000..c8a231a --- /dev/null +++ b/modules/gcp/gke_service/outputs.tf @@ -0,0 +1,12 @@ +output "service_name" { + value = kubernetes_service_v1.this.metadata[0].name +} + +output "service_port" { + value = kubernetes_service_v1.this.spec[0].port[0].port +} + +output "deployment_name" { + value = kubernetes_deployment_v1.this.metadata[0].name +} + diff --git a/modules/gcp/gke_service/variables.tf b/modules/gcp/gke_service/variables.tf new file mode 100755 index 0000000..bcefe90 --- /dev/null +++ b/modules/gcp/gke_service/variables.tf @@ -0,0 +1,112 @@ +variable "name" { + type = string + description = "Service name" +} + +variable "namespace" { + type = string + default = "default" + description = "Kubernetes namespace" +} + +variable "image" { + type = string + description = "Container image" +} + +variable "container_port" { + type = number + description = "Container port" +} + +variable "env" { + type = list(map(string)) + default = [] + description = "Environment variables" +} + +variable "volumes" { + type = list(object({ + name = string + nfs_server = string + nfs_path = string + })) + default = [] + description = "Pod volumes (NFS only)" +} + +variable "volume_mounts" { + type = list(object({ + name = string + mount_path = string + read_only = bool + })) + default = [] + description = "Container volume mounts" +} + +variable "replicas" { + type = number + default = 1 + description = "Number of 
replicas" +} + +variable "service_type" { + type = string + default = "ClusterIP" + description = "Kubernetes service type" +} + +variable "cpu_request" { + type = string + default = null + description = "CPU request" +} + +variable "memory_request" { + type = string + default = null + description = "Memory request" +} + +variable "cpu_limit" { + type = string + default = null + description = "CPU limit" +} + +variable "memory_limit" { + type = string + default = null + description = "Memory limit" +} + +variable "enable_hpa" { + type = bool + default = false + description = "Enable HorizontalPodAutoscaler" +} + +variable "hpa_min_replicas" { + type = number + default = 1 + description = "HPA minimum replicas" +} + +variable "hpa_max_replicas" { + type = number + default = 5 + description = "HPA maximum replicas" +} + +variable "hpa_cpu_utilization" { + type = number + default = 80 + description = "Target CPU utilization percentage" +} + +variable "service_account_name" { + type = string + default = "default" + description = "Kubernetes service account name to use for the pod" +} diff --git a/modules/gcp/iam/main.tf b/modules/gcp/iam/main.tf new file mode 100755 index 0000000..ab2f3ce --- /dev/null +++ b/modules/gcp/iam/main.tf @@ -0,0 +1,49 @@ +locals { + service_accounts = var.service_accounts +} + +resource "google_service_account" "this" { + for_each = local.service_accounts + + account_id = "${var.prefix}-${each.key}" + display_name = coalesce(try(each.value.display_name, null), each.key) + description = try(each.value.description, null) +} + +resource "google_project_iam_member" "sa_roles" { + for_each = { + for pair in flatten([ + for sa_key, sa_cfg in local.service_accounts : [ + for role in try(sa_cfg.roles, []) : { + key = "${sa_key}:${role}" + sa = sa_key + role = role + } + ] + ]) : pair.key => pair + } + + project = var.project_id + role = each.value.role + member = "serviceAccount:${google_service_account.this[each.value.sa].email}" +} + +resource 
"google_service_account_iam_member" "workload_identity" { + for_each = { + for pair in flatten([ + for sa_key, sa_cfg in local.service_accounts : [ + for b in try(sa_cfg.wi_bindings, []) : { + key = "${sa_key}:${b.namespace}:${b.k8s_service_account_name}" + sa = sa_key + namespace = b.namespace + ksa = b.k8s_service_account_name + } + ] + ]) : pair.key => pair + } + + service_account_id = google_service_account.this[each.value.sa].name + role = "roles/iam.workloadIdentityUser" + member = "serviceAccount:${var.project_id}.svc.id.goog[${each.value.namespace}/${each.value.ksa}]" +} + diff --git a/modules/gcp/iam/outputs.tf b/modules/gcp/iam/outputs.tf new file mode 100755 index 0000000..3dbf0b4 --- /dev/null +++ b/modules/gcp/iam/outputs.tf @@ -0,0 +1,21 @@ +output "service_accounts" { + description = "Created service accounts with emails and names" + value = { + for k, v in google_service_account.this : k => { + email = v.email + name = v.name + } + } +} + +output "workload_identity_bindings" { + description = "Workload Identity bindings (K8s SA -> GCP SA)" + value = { + for k, v in google_service_account_iam_member.workload_identity : k => { + gcp_sa_email = google_service_account.this[split(":", k)[0]].email + k8s_ns = split(":", k)[1] + k8s_sa = split(":", k)[2] + } + } +} + diff --git a/modules/gcp/iam/variables.tf b/modules/gcp/iam/variables.tf new file mode 100755 index 0000000..54e5369 --- /dev/null +++ b/modules/gcp/iam/variables.tf @@ -0,0 +1,25 @@ +variable "project_id" { + type = string + description = "GCP project ID" +} + +variable "prefix" { + type = string + description = "Prefix for resource names" + default = "mega" +} + +variable "service_accounts" { + type = map(object({ + display_name = optional(string) + description = optional(string) + roles = optional(list(string), []) + wi_bindings = optional(list(object({ + namespace = string + k8s_service_account_name = string + })), []) + })) + description = "Service accounts to create and their IAM roles / 
Workload Identity bindings" + default = {} +} + diff --git a/modules/gcp/ingress/main.tf b/modules/gcp/ingress/main.tf new file mode 100755 index 0000000..e4fc54d --- /dev/null +++ b/modules/gcp/ingress/main.tf @@ -0,0 +1,56 @@ +locals { + annotations = merge( + var.static_ip_name != null ? { "kubernetes.io/ingress.global-static-ip-name" = var.static_ip_name } : {}, + length(var.managed_certificate_domains) > 0 ? { "networking.gke.io/managed-certificates" = "${var.name}-cert" } : {} + ) +} + +resource "kubernetes_manifest" "managed_cert" { + count = length(var.managed_certificate_domains) > 0 ? 1 : 0 + + manifest = { + apiVersion = "networking.gke.io/v1" + kind = "ManagedCertificate" + metadata = { + name = "${var.name}-cert" + namespace = var.namespace + } + spec = { + domains = var.managed_certificate_domains + } + } +} + +resource "kubernetes_ingress_v1" "this" { + metadata { + name = var.name + namespace = var.namespace + annotations = local.annotations + } + + spec { + ingress_class_name = var.ingress_class_name + + dynamic "rule" { + for_each = var.rules + content { + host = rule.value.host + http { + path { + path = "/" + path_type = "Prefix" + backend { + service { + name = rule.value.service_name + port { + number = rule.value.service_port + } + } + } + } + } + } + } + } +} + diff --git a/modules/gcp/ingress/outputs.tf b/modules/gcp/ingress/outputs.tf new file mode 100755 index 0000000..0f1da33 --- /dev/null +++ b/modules/gcp/ingress/outputs.tf @@ -0,0 +1,12 @@ +output "ingress_name" { + value = kubernetes_ingress_v1.this.metadata[0].name +} + +output "ip_address" { + value = try(kubernetes_ingress_v1.this.status[0].load_balancer[0].ingress[0].ip, null) +} + +output "hostname" { + value = try(kubernetes_ingress_v1.this.status[0].load_balancer[0].ingress[0].hostname, null) +} + diff --git a/modules/gcp/ingress/variables.tf b/modules/gcp/ingress/variables.tf new file mode 100755 index 0000000..496a7a1 --- /dev/null +++ b/modules/gcp/ingress/variables.tf @@ 
-0,0 +1,37 @@ +variable "name" { + type = string + description = "Ingress name" +} + +variable "namespace" { + type = string + default = "default" + description = "Kubernetes namespace" +} + +variable "static_ip_name" { + type = string + default = null + description = "Global static IP name for GCE ingress" +} + +variable "ingress_class_name" { + type = string + default = "gce" + description = "Ingress class name" +} + +variable "managed_certificate_domains" { + type = list(string) + default = [] + description = "Domains for GKE ManagedCertificate" +} + +variable "rules" { + type = list(object({ + host = string + service_name = string + service_port = number + })) + description = "Ingress host rules" +} diff --git a/modules/gcp/monitoring/main.tf b/modules/gcp/monitoring/main.tf new file mode 100755 index 0000000..9965bae --- /dev/null +++ b/modules/gcp/monitoring/main.tf @@ -0,0 +1,82 @@ +locals { + enable_logging = var.enable_logging + enable_monitoring = var.enable_monitoring + enable_alerts = var.enable_alerts +} + +resource "google_project_service" "logging" { + count = local.enable_logging ? 1 : 0 + project = var.project_id + service = "logging.googleapis.com" +} + +resource "google_project_service" "monitoring" { + count = local.enable_monitoring ? 1 : 0 + project = var.project_id + service = "monitoring.googleapis.com" +} + +resource "google_logging_project_sink" "this" { + count = var.log_sink_name != "" && var.log_sink_destination != "" ? 1 : 0 + name = var.log_sink_name + project = var.project_id + destination = var.log_sink_destination + filter = "resource.type=\"k8s_container\"" +} + +resource "google_monitoring_alert_policy" "pod_restart_high" { + count = local.enable_alerts ? 
1 : 0 + display_name = "GKE Pod Restart Rate High" + combiner = "OR" + enabled = true + notification_channels = var.alert_notification_channels + + conditions { + display_name = "Pod restart rate > 5/min" + condition_threshold { + filter = "metric.type=\"kubernetes.io/container/restart_count\" resource.type=\"k8s_container\"" + duration = "300s" + comparison = "COMPARISON_GT" + threshold_value = 5 + aggregations { + alignment_period = "60s" + per_series_aligner = "ALIGN_RATE" + cross_series_reducer = "REDUCE_SUM" + group_by_fields = ["resource.label.namespace_name", "resource.label.pod_name", "resource.label.container_name"] + } + } + } + + documentation { + content = "High pod restart rate detected. Check pod logs and events." + } +} + +resource "google_monitoring_alert_policy" "sql_connection_failures" { + count = local.enable_alerts ? 1 : 0 + display_name = "Cloud SQL Connection Failures" + combiner = "OR" + enabled = true + notification_channels = var.alert_notification_channels + + conditions { + display_name = "SQL connection errors > 0" + condition_threshold { + filter = "metric.type=\"cloudsql.googleapis.com/database/network/received_bytes_count\" resource.type=\"cloudsql_database\"" + duration = "300s" + comparison = "COMPARISON_GT" + threshold_value = 0 + aggregations { + alignment_period = "60s" + per_series_aligner = "ALIGN_RATE" + cross_series_reducer = "REDUCE_SUM" + group_by_fields = ["resource.label.database_id"] + } + } + } + + documentation { + content = "Cloud SQL connection issues detected. Check network connectivity and IAM permissions." 
+ } +} + diff --git a/modules/gcp/monitoring/outputs.tf b/modules/gcp/monitoring/outputs.tf new file mode 100755 index 0000000..6e197f7 --- /dev/null +++ b/modules/gcp/monitoring/outputs.tf @@ -0,0 +1,23 @@ +output "logging_api_enabled" { + description = "Whether Cloud Logging API is enabled" + value = length(google_project_service.logging) > 0 +} + +output "monitoring_api_enabled" { + description = "Whether Cloud Monitoring API is enabled" + value = length(google_project_service.monitoring) > 0 +} + +output "log_sink_writer_identity" { + description = "Writer identity for the optional log sink" + value = try(google_logging_project_sink.this[0].writer_identity, null) +} + +output "alert_policy_ids" { + description = "Created alert policy IDs" + value = { + pod_restart_high = try(google_monitoring_alert_policy.pod_restart_high[0].name, null) + sql_connection_failures = try(google_monitoring_alert_policy.sql_connection_failures[0].name, null) + } +} + diff --git a/modules/gcp/monitoring/variables.tf b/modules/gcp/monitoring/variables.tf new file mode 100755 index 0000000..1fe5f97 --- /dev/null +++ b/modules/gcp/monitoring/variables.tf @@ -0,0 +1,41 @@ +variable "project_id" { + type = string + description = "GCP project ID" +} + +variable "enable_logging" { + type = bool + default = true + description = "Enable Cloud Logging for GKE" +} + +variable "enable_monitoring" { + type = bool + default = true + description = "Enable Cloud Monitoring for GKE" +} + +variable "enable_alerts" { + type = bool + default = false + description = "Enable example alert policies" +} + +variable "alert_notification_channels" { + type = list(string) + default = [] + description = "List of notification channel IDs for alerts" +} + +variable "log_sink_name" { + type = string + default = "" + description = "Optional log sink name for exporting logs" +} + +variable "log_sink_destination" { + type = string + default = "" + description = "Optional log sink destination (e.g., 
bigquery.googleapis.com/projects/PROJECT_ID/datasets/DATASET_ID)" +} + diff --git a/modules/gcp/network/main.tf b/modules/gcp/network/main.tf new file mode 100755 index 0000000..eb14e5f --- /dev/null +++ b/modules/gcp/network/main.tf @@ -0,0 +1,136 @@ +locals { + has_public_subnets = length(var.public_subnet_cidrs) > 0 + has_private_subnets = length(var.private_subnet_cidrs) > 0 + use_multi_subnets = local.has_public_subnets || local.has_private_subnets + + public_subnet_map = { for idx, cidr in var.public_subnet_cidrs : idx => cidr } + private_subnet_map = { for idx, cidr in var.private_subnet_cidrs : idx => cidr } + + default_gke_node_tags = ["${var.name_prefix}-gke"] + effective_gke_node_tags = length(var.gke_node_tags) > 0 ? var.gke_node_tags : local.default_gke_node_tags + + health_check_port_numbers = [for p in var.health_check_ports : tonumber(p)] +} + +resource "google_compute_network" "this" { + name = var.network_name + auto_create_subnetworks = false + routing_mode = "REGIONAL" +} + +resource "google_compute_subnetwork" "this" { + count = local.use_multi_subnets ? 0 : 1 + + name = var.subnet_name + ip_cidr_range = var.subnet_cidr + region = var.region + network = google_compute_network.this.id + + secondary_ip_range { + range_name = "${var.name_prefix}-pods" + ip_cidr_range = var.pods_secondary_range + } + + secondary_ip_range { + range_name = "${var.name_prefix}-services" + ip_cidr_range = var.services_secondary_range + } +} + +resource "google_compute_subnetwork" "public" { + for_each = local.use_multi_subnets ? local.public_subnet_map : {} + + name = "${var.network_name}-public-${each.key}" + ip_cidr_range = each.value + region = var.region + network = google_compute_network.this.id +} + +resource "google_compute_subnetwork" "private" { + for_each = local.use_multi_subnets ? 
local.private_subnet_map : {} + + name = "${var.network_name}-private-${each.key}" + ip_cidr_range = each.value + region = var.region + network = google_compute_network.this.id + private_ip_google_access = var.enable_private_google_access +} + +resource "google_compute_router" "this" { + count = var.create_nat && (local.use_multi_subnets ? local.has_private_subnets : true) ? 1 : 0 + + name = "${var.name_prefix}-gke-router" + network = google_compute_network.this.id + region = var.region +} + +resource "google_compute_router_nat" "this" { + count = var.create_nat && (local.use_multi_subnets ? local.has_private_subnets : true) ? 1 : 0 + + name = "${var.name_prefix}-gke-nat" + router = google_compute_router.this[0].name + region = var.region + nat_ip_allocate_option = "AUTO_ONLY" + + source_subnetwork_ip_ranges_to_nat = local.use_multi_subnets ? "LIST_OF_SUBNETWORKS" : "ALL_SUBNETWORKS_ALL_IP_RANGES" + + dynamic "subnetwork" { + for_each = local.use_multi_subnets ? google_compute_subnetwork.private : {} + content { + name = subnetwork.value.self_link + source_ip_ranges_to_nat = ["ALL_IP_RANGES"] + } + } + + log_config { + enable = true + filter = "ERRORS_ONLY" + } +} + +resource "google_compute_firewall" "allow_internal" { + name = "${var.network_name}-allow-internal" + network = google_compute_network.this.self_link + + allow { + protocol = "tcp" + ports = ["0-65535"] + } + + allow { + protocol = "udp" + ports = ["0-65535"] + } + + allow { + protocol = "icmp" + } + + source_ranges = [var.subnet_cidr] +} + +resource "google_compute_firewall" "allow_ssh" { + count = var.allow_ssh ? 
1 : 0 + name = "${var.network_name}-allow-ssh" + network = google_compute_network.this.self_link + + allow { + protocol = "tcp" + ports = ["22"] + } + + source_ranges = ["0.0.0.0/0"] +} + +resource "google_compute_firewall" "allow_health_checks" { + name = "${var.network_name}-allow-health-checks" + network = google_compute_network.this.self_link + + allow { + protocol = "tcp" + ports = var.health_check_ports + } + + source_ranges = var.health_check_source_ranges + target_tags = local.effective_gke_node_tags +} diff --git a/modules/gcp/network/outputs.tf b/modules/gcp/network/outputs.tf new file mode 100755 index 0000000..3eb63ad --- /dev/null +++ b/modules/gcp/network/outputs.tf @@ -0,0 +1,59 @@ +output "network_self_link" { + value = google_compute_network.this.self_link +} + +output "network_id" { + value = google_compute_network.this.id +} + +output "subnetwork_self_link" { + value = local.use_multi_subnets ? ( + length(google_compute_subnetwork.private) > 0 ? google_compute_subnetwork.private[0].self_link : google_compute_subnetwork.public[0].self_link + ) : google_compute_subnetwork.this[0].self_link +} + +output "subnetwork_name" { + value = local.use_multi_subnets ? ( + length(google_compute_subnetwork.private) > 0 ? google_compute_subnetwork.private[0].name : google_compute_subnetwork.public[0].name + ) : google_compute_subnetwork.this[0].name +} + +output "pods_secondary_range_name" { + value = local.use_multi_subnets ? ( + length(google_compute_subnetwork.private) > 0 ? ( + length(google_compute_subnetwork.private[0].secondary_ip_range) > 0 ? google_compute_subnetwork.private[0].secondary_ip_range[0].range_name : "${var.name_prefix}-pods" + ) : "${var.name_prefix}-pods" + ) : google_compute_subnetwork.this[0].secondary_ip_range[0].range_name +} + +output "services_secondary_range_name" { + value = local.use_multi_subnets ? ( + length(google_compute_subnetwork.private) > 0 ? ( + length(google_compute_subnetwork.private[0].secondary_ip_range) > 1 ? 
google_compute_subnetwork.private[0].secondary_ip_range[1].range_name : "${var.name_prefix}-services" + ) : "${var.name_prefix}-services" + ) : google_compute_subnetwork.this[0].secondary_ip_range[1].range_name +} + +output "public_subnetwork_names" { + value = local.use_multi_subnets ? [for s in google_compute_subnetwork.public : s.name] : [] +} + +output "private_subnetwork_names" { + value = local.use_multi_subnets ? [for s in google_compute_subnetwork.private : s.name] : [] +} + +output "public_subnetwork_self_links" { + value = local.use_multi_subnets ? [for s in google_compute_subnetwork.public : s.self_link] : [] +} + +output "private_subnetwork_self_links" { + value = local.use_multi_subnets ? [for s in google_compute_subnetwork.private : s.self_link] : [] +} + +output "router_name" { + value = var.create_nat && (local.use_multi_subnets ? local.has_private_subnets : true) ? google_compute_router.this[0].name : null +} + +output "nat_name" { + value = var.create_nat && (local.use_multi_subnets ? local.has_private_subnets : true) ? 
google_compute_router_nat.this[0].name : null +} diff --git a/modules/gcp/network/variables.tf b/modules/gcp/network/variables.tf new file mode 100755 index 0000000..038463d --- /dev/null +++ b/modules/gcp/network/variables.tf @@ -0,0 +1,67 @@ +variable "name_prefix" { + type = string +} + +variable "region" { + type = string +} + +variable "network_name" { + type = string +} + +variable "subnet_name" { + type = string +} + +variable "subnet_cidr" { + type = string +} + +variable "pods_secondary_range" { + type = string +} + +variable "services_secondary_range" { + type = string +} + +variable "public_subnet_cidrs" { + type = list(string) + default = [] +} + +variable "private_subnet_cidrs" { + type = list(string) + default = [] +} + +variable "enable_private_google_access" { + type = bool + default = true +} + +variable "create_nat" { + type = bool + default = true +} + +variable "allow_ssh" { + type = bool + default = false +} + +variable "gke_node_tags" { + type = list(string) + default = [] +} + +variable "health_check_source_ranges" { + type = list(string) + default = ["130.211.0.0/22", "35.191.0.0/16"] +} + +variable "health_check_ports" { + type = list(string) + default = ["80", "443"] +} diff --git a/modules/gcp/orion_worker/main.tf b/modules/gcp/orion_worker/main.tf new file mode 100755 index 0000000..603faa0 --- /dev/null +++ b/modules/gcp/orion_worker/main.tf @@ -0,0 +1,302 @@ +locals { + namespace = var.namespace + + container_env = merge( + { + ORION_WORKER_START_SCORPIO = var.orion_worker_start_scorpio ? 
"true" : "false" + SCORPIO_STORE_PATH = var.scorpio_store_path + SCORPIO_WORKSPACE = var.scorpio_workspace + BUCK_PROJECT_ROOT = var.buck_project_root + BUILD_TMP = var.build_tmp + + SCORPIO_GIT_AUTHOR = var.scorpio_git_author + SCORPIO_GIT_EMAIL = var.scorpio_git_email + + SCORPIO_DICFUSE_READABLE = var.scorpio_dicfuse_readable + SCORPIO_LOAD_DIR_DEPTH = var.scorpio_load_dir_depth + SCORPIO_FETCH_FILE_THREAD = var.scorpio_fetch_file_thread + SCORPIO_DICFUSE_IMPORT_CONCURRENCY = var.scorpio_dicfuse_import_concurrency + SCORPIO_DICFUSE_DIR_SYNC_TTL_SECS = var.scorpio_dicfuse_dir_sync_ttl_secs + SCORPIO_DICFUSE_STAT_MODE = var.scorpio_dicfuse_stat_mode + SCORPIO_DICFUSE_OPEN_BUFF_MAX_BYTES = var.scorpio_dicfuse_open_buff_max_bytes + SCORPIO_DICFUSE_OPEN_BUFF_MAX_FILES = var.scorpio_dicfuse_open_buff_max_files + + ANTARES_LOAD_DIR_DEPTH = var.antares_load_dir_depth + ANTARES_DICFUSE_STAT_MODE = var.antares_dicfuse_stat_mode + ANTARES_DICFUSE_OPEN_BUFF_MAX_BYTES = var.antares_dicfuse_open_buff_max_bytes + ANTARES_DICFUSE_OPEN_BUFF_MAX_FILES = var.antares_dicfuse_open_buff_max_files + ANTARES_DICFUSE_DIR_SYNC_TTL_SECS = var.antares_dicfuse_dir_sync_ttl_secs + ANTARES_UPPER_ROOT = var.antares_upper_root + ANTARES_CL_ROOT = var.antares_cl_root + ANTARES_MOUNT_ROOT = var.antares_mount_root + ANTARES_STATE_FILE = var.antares_state_file + }, + var.worker_env + ) +} + +resource "kubernetes_namespace_v1" "this" { + metadata { + name = local.namespace + } +} + +resource "kubernetes_service_account_v1" "this" { + metadata { + name = var.service_account_name + namespace = kubernetes_namespace_v1.this.metadata[0].name + } +} + +resource "kubernetes_config_map_v1" "this" { + metadata { + name = "orion-worker-config" + namespace = kubernetes_namespace_v1.this.metadata[0].name + } + + data = { + SERVER_WS = var.server_ws + SCORPIO_BASE_URL = var.scorpio_base_url + SCORPIO_LFS_URL = var.scorpio_lfs_url + RUST_LOG = var.rust_log + } +} + +resource "kubernetes_secret_v1" "this" { + 
metadata { + name = "orion-worker-secret" + namespace = kubernetes_namespace_v1.this.metadata[0].name + } + + type = "Opaque" + data = var.secret_data +} + +resource "kubernetes_config_map_v1" "scorpio" { + metadata { + name = "scorpio-config" + namespace = kubernetes_namespace_v1.this.metadata[0].name + } + + data = { + "scorpio.toml.template" = <<-EOT + # ============================================================================= + # Scorpio Configuration Template + # ============================================================================= + # This template uses simple environment variable placeholders like `$${VAR_NAME}`. + # The entrypoint script sets defaults and then substitutes the variables via `envsubst`. + # ============================================================================= + + # Mega/Mono service URLs + base_url = "$${SCORPIO_BASE_URL}" + lfs_url = "$${SCORPIO_LFS_URL}" + + # Storage paths + store_path = "$${SCORPIO_STORE_PATH}" + workspace = "$${SCORPIO_WORKSPACE}" + config_file = "config.toml" + + # Git author configuration + git_author = "$${SCORPIO_GIT_AUTHOR}" + git_email = "$${SCORPIO_GIT_EMAIL}" + + # DicFuse (dictionary-based FUSE) settings + dicfuse_readable = "$${SCORPIO_DICFUSE_READABLE}" + load_dir_depth = "$${SCORPIO_LOAD_DIR_DEPTH}" + fetch_file_thread = "$${SCORPIO_FETCH_FILE_THREAD}" + dicfuse_import_concurrency = "$${SCORPIO_DICFUSE_IMPORT_CONCURRENCY}" + dicfuse_dir_sync_ttl_secs = "$${SCORPIO_DICFUSE_DIR_SYNC_TTL_SECS}" + dicfuse_stat_mode = "$${SCORPIO_DICFUSE_STAT_MODE}" + dicfuse_open_buff_max_bytes = "$${SCORPIO_DICFUSE_OPEN_BUFF_MAX_BYTES}" + dicfuse_open_buff_max_files = "$${SCORPIO_DICFUSE_OPEN_BUFF_MAX_FILES}" + + # Antares (overlay filesystem) settings + antares_load_dir_depth = "$${ANTARES_LOAD_DIR_DEPTH}" + antares_dicfuse_stat_mode = "$${ANTARES_DICFUSE_STAT_MODE}" + antares_dicfuse_open_buff_max_bytes = "$${ANTARES_DICFUSE_OPEN_BUFF_MAX_BYTES}" + antares_dicfuse_open_buff_max_files = 
"$${ANTARES_DICFUSE_OPEN_BUFF_MAX_FILES}" + antares_dicfuse_dir_sync_ttl_secs = "$${ANTARES_DICFUSE_DIR_SYNC_TTL_SECS}" + antares_upper_root = "$${ANTARES_UPPER_ROOT}" + antares_cl_root = "$${ANTARES_CL_ROOT}" + antares_mount_root = "$${ANTARES_MOUNT_ROOT}" + antares_state_file = "$${ANTARES_STATE_FILE}" + EOT + } +} + +resource "kubernetes_daemon_set_v1" "this" { + metadata { + name = "orion-worker" + namespace = kubernetes_namespace_v1.this.metadata[0].name + labels = { + app = "orion-worker" + } + } + + spec { + selector { + match_labels = { + app = "orion-worker" + } + } + + template { + metadata { + labels = { + app = "orion-worker" + } + } + + spec { + service_account_name = var.service_account_name + termination_grace_period_seconds = var.termination_grace_period_seconds + + dynamic "toleration" { + for_each = var.tolerations + content { + key = toleration.value.key + operator = toleration.value.operator + value = toleration.value.value + effect = toleration.value.effect + } + } + + node_selector = var.node_selector + + dynamic "affinity" { + for_each = var.node_affinity_required != null ? [1] : [] + content { + node_affinity { + required_during_scheduling_ignored_during_execution { + node_selector_term { + match_expressions { + key = var.node_affinity_required.key + operator = var.node_affinity_required.operator + values = var.node_affinity_required.values + } + } + } + } + } + } + + container { + name = "orion-worker" + image = var.image + + command = ["/bin/sh", "-c"] + args = [ + <<-EOT + set -e + # Force localhost to resolve to IPv4 to avoid IPv6 connection issues + echo "127.0.0.1 localhost" >> /etc/hosts + + echo "Generating scorpio config via envsubst..." + envsubst < /etc/scorpio/scorpio.toml.template > /tmp/scorpio.toml + + echo "Attempting to start scorpio in background..." 
+ # Run scorpio with config, listening on default 0.0.0.0:2725 + /app/bin/scorpio --config-path /tmp/scorpio.toml > /tmp/scorpio.log 2>&1 & + sleep 2 + echo "--- Scorpio Log Start ---" + cat /tmp/scorpio.log || true + echo "--- Scorpio Log End ---" + exec orion + EOT + ] + + env_from { + config_map_ref { + name = kubernetes_config_map_v1.this.metadata[0].name + } + } + + env_from { + secret_ref { + name = kubernetes_secret_v1.this.metadata[0].name + optional = true + } + } + + dynamic "env" { + for_each = local.container_env + content { + name = env.key + value = env.value + } + } + + security_context { + privileged = var.privileged + } + + resources { + requests = { + cpu = var.cpu_request + memory = var.memory_request + } + limits = { + cpu = var.cpu_limit + memory = var.memory_limit + } + } + + lifecycle { + pre_stop { + exec { + command = ["/bin/sh", "-lc", "sleep 10"] + } + } + } + + volume_mount { + name = "orion-data-cache" + mount_path = "/data" + } + + volume_mount { + name = "orion-workspace-cache" + mount_path = "/workspace" + } + + volume_mount { + name = "scorpio-config" + mount_path = "/etc/scorpio/scorpio.toml.template" + sub_path = "scorpio.toml.template" + } + } + + volume { + name = "orion-data-cache" + host_path { + path = var.host_path_data + type = "DirectoryOrCreate" + } + } + + volume { + name = "orion-workspace-cache" + host_path { + path = var.host_path_workspace + type = "DirectoryOrCreate" + } + } + + volume { + name = "scorpio-config" + config_map { + name = kubernetes_config_map_v1.scorpio.metadata[0].name + } + } + } + } + } + + depends_on = [ + kubernetes_namespace_v1.this, + kubernetes_service_account_v1.this, + kubernetes_config_map_v1.this, + kubernetes_secret_v1.this, + kubernetes_config_map_v1.scorpio + ] +} diff --git a/modules/gcp/orion_worker/variables.tf b/modules/gcp/orion_worker/variables.tf new file mode 100755 index 0000000..3e5a169 --- /dev/null +++ b/modules/gcp/orion_worker/variables.tf @@ -0,0 +1,282 @@ +variable 
"namespace" { + type = string + description = "Kubernetes namespace for Orion Worker" + default = "orion-worker" +} + +variable "service_account_name" { + type = string + description = "Kubernetes service account name" + default = "orion-worker-sa" +} + +variable "image" { + type = string + description = "Orion Worker container image" + default = "public.ecr.aws/m8q5m4u3/mega:orion-client-0.1.0-pre-release-amd64" +} + +variable "server_ws" { + type = string + description = "Orion server WebSocket URL" + default = "wss://orion.gitmono.com/ws" +} + +variable "scorpio_base_url" { + type = string + description = "Scorpio base URL" + default = "https://git.gitmono.com" +} + +variable "scorpio_lfs_url" { + type = string + description = "Scorpio LFS URL" + default = "https://git.gitmono.com" +} + +variable "rust_log" { + type = string + description = "Rust log level" + default = "info" +} + +variable "secret_data" { + type = map(string) + description = "Optional secret data for orion-worker-secret (plain values, will be base64-encoded by provider)." + default = {} + sensitive = true +} + +variable "worker_env" { + type = map(string) + description = "Extra environment variables for Orion Worker container" + default = {} +} + +variable "tolerations" { + type = list(object({ + key = string + operator = string + value = string + effect = string + })) + description = "Tolerations for worker pods" + default = [ + { + key = "dedicated" + operator = "Equal" + value = "orion-build" + effect = "NoSchedule" + } + ] +} + +variable "node_selector" { + type = map(string) + description = "Node selector for worker pods" + default = { + nodepool = "build-default" + } +} + +variable "node_affinity_required" { + type = object({ + key = string + operator = string + values = list(string) + }) + description = "Optional requiredDuringScheduling node affinity. If null, affinity is not set." 
+ default = null +} + +variable "orion_worker_start_scorpio" { + type = bool + description = "Whether to start scorpio sidecar process before orion" + default = true +} + +variable "scorpio_store_path" { + type = string + description = "Scorpio store path" + default = "/data/scorpio/store" +} + +variable "scorpio_workspace" { + type = string + description = "Scorpio workspace path" + default = "/workspace/mount" +} + +variable "buck_project_root" { + type = string + description = "Buck project root" + default = "/workspace" +} + +variable "build_tmp" { + type = string + description = "Build temp directory" + default = "/tmp/orion-builds" +} + +variable "scorpio_git_author" { + type = string + description = "Scorpio git author" + default = "orion" +} + +variable "scorpio_git_email" { + type = string + description = "Scorpio git email" + default = "orion@local" +} + +variable "scorpio_dicfuse_readable" { + type = string + description = "Scorpio dicfuse readable" + default = "true" +} + +variable "scorpio_load_dir_depth" { + type = string + description = "Scorpio load dir depth" + default = "2" +} + +variable "scorpio_fetch_file_thread" { + type = string + description = "Scorpio fetch file thread" + default = "8" +} + +variable "scorpio_dicfuse_import_concurrency" { + type = string + description = "Scorpio dicfuse import concurrency" + default = "8" +} + +variable "scorpio_dicfuse_dir_sync_ttl_secs" { + type = string + description = "Scorpio dicfuse dir sync ttl secs" + default = "60" +} + +variable "scorpio_dicfuse_stat_mode" { + type = string + description = "Scorpio dicfuse stat mode" + default = "fast" +} + +variable "scorpio_dicfuse_open_buff_max_bytes" { + type = string + description = "Scorpio dicfuse open buff max bytes" + default = "134217728" +} + +variable "scorpio_dicfuse_open_buff_max_files" { + type = string + description = "Scorpio dicfuse open buff max files" + default = "2048" +} + +variable "antares_load_dir_depth" { + type = string + description = 
"Antares load dir depth" + default = "2" +} + +variable "antares_dicfuse_stat_mode" { + type = string + description = "Antares dicfuse stat mode" + default = "fast" +} + +variable "antares_dicfuse_open_buff_max_bytes" { + type = string + description = "Antares dicfuse open buff max bytes" + default = "134217728" +} + +variable "antares_dicfuse_open_buff_max_files" { + type = string + description = "Antares dicfuse open buff max files" + default = "2048" +} + +variable "antares_dicfuse_dir_sync_ttl_secs" { + type = string + description = "Antares dicfuse dir sync ttl secs" + default = "60" +} + +variable "antares_upper_root" { + type = string + description = "Antares upper root" + default = "/data/scorpio/antares/upper" +} + +variable "antares_cl_root" { + type = string + description = "Antares cl root" + default = "/data/scorpio/antares/cl" +} + +variable "antares_mount_root" { + type = string + description = "Antares mount root" + default = "/workspace/mount" +} + +variable "antares_state_file" { + type = string + description = "Antares state file" + default = "/data/scorpio/antares/state.json" +} + +variable "privileged" { + type = bool + description = "Run worker container in privileged mode" + default = true +} + +variable "host_path_data" { + type = string + description = "HostPath for /data cache" + default = "/var/lib/orion/data" +} + +variable "host_path_workspace" { + type = string + description = "HostPath for /workspace cache" + default = "/var/lib/orion/workspace" +} + +variable "cpu_request" { + type = string + description = "CPU request for worker container" + default = "6" +} + +variable "memory_request" { + type = string + description = "Memory request for worker container" + default = "24Gi" +} + +variable "cpu_limit" { + type = string + description = "CPU limit for worker container" + default = "8" +} + +variable "memory_limit" { + type = string + description = "Memory limit for worker container" + default = "30Gi" +} + +variable 
"termination_grace_period_seconds" { + type = number + description = "Termination grace period seconds" + default = 300 +} diff --git a/modules/gcp/redis/main.tf b/modules/gcp/redis/main.tf new file mode 100755 index 0000000..61e97b6 --- /dev/null +++ b/modules/gcp/redis/main.tf @@ -0,0 +1,9 @@ +resource "google_redis_instance" "this" { + name = var.name + region = var.region + tier = var.tier + memory_size_gb = var.memory_size_gb + authorized_network = var.network + transit_encryption_mode = var.transit_encryption_mode +} + diff --git a/modules/gcp/redis/outputs.tf b/modules/gcp/redis/outputs.tf new file mode 100755 index 0000000..3f8f348 --- /dev/null +++ b/modules/gcp/redis/outputs.tf @@ -0,0 +1,8 @@ +output "host" { + value = google_redis_instance.this.host +} + +output "port" { + value = google_redis_instance.this.port +} + diff --git a/modules/gcp/redis/variables.tf b/modules/gcp/redis/variables.tf new file mode 100755 index 0000000..05015aa --- /dev/null +++ b/modules/gcp/redis/variables.tf @@ -0,0 +1,33 @@ +variable "name" { + type = string + description = "Memorystore instance name" +} + +variable "region" { + type = string + description = "GCP region" +} + +variable "tier" { + type = string + default = "STANDARD_HA" + description = "Memorystore tier" +} + +variable "memory_size_gb" { + type = number + default = 1 + description = "Memory size in GB" +} + +variable "network" { + type = string + description = "VPC self link" +} + +variable "transit_encryption_mode" { + type = string + default = "DISABLED" + description = "Transit encryption mode" +} +