diff --git a/.github/workflows/terraform-aws-example.yml b/.github/workflows/terraform-aws-example.yml new file mode 100644 index 0000000..613054a --- /dev/null +++ b/.github/workflows/terraform-aws-example.yml @@ -0,0 +1,36 @@ +name: Validate AWS Example + +on: + push: + branches: [main] + paths: + - examples/aws/** + - .github/workflows/terraform-aws-example.yml + pull_request: + paths: + - examples/aws/** + - .github/workflows/terraform-aws-example.yml + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + validate: + runs-on: ubuntu-latest + defaults: + run: + working-directory: examples/aws + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: hashicorp/setup-terraform@dfe3c3f87815947d99a8997f908cb6525fc44e9e # v4.0.1 + with: + terraform_wrapper: false + terraform_version: "1.15.3" + - run: terraform init -backend=false + - run: terraform fmt -check + - run: terraform validate + - run: terraform test diff --git a/examples/aws/.gitignore b/examples/aws/.gitignore new file mode 100644 index 0000000..7d126a9 --- /dev/null +++ b/examples/aws/.gitignore @@ -0,0 +1,6 @@ +*.tfstate +*.tfstate.* +*.tfplan +.terraform/ +.terraform.lock.hcl +terraform.tfvars diff --git a/examples/aws/README.md b/examples/aws/README.md new file mode 100644 index 0000000..685f100 --- /dev/null +++ b/examples/aws/README.md @@ -0,0 +1,157 @@ +# Lore on AWS + +Deploy Lore on AWS with NVMe-cached edge nodes for high-throughput game asset delivery. + +This example uses **c8gd.8xlarge** Graviton instances (32 vCPU, 64 GB RAM, 1.9 TB NVMe, 25 Gbps network) — the recommended instance type for Lore. The NVMe instance store serves as a local fragment cache, delivering sub-millisecond reads for `lore clone` while S3 provides durable storage. + +> Region is configurable via `var.region` (default: `us-west-2`). + +## Quick start + +### 1. 
Build and push the container image + +From the Lore repo root: + +```sh +docker buildx build --platform linux/arm64 -f lore-server/Dockerfile -t loreserver . +``` + +> If building on an x86 host, [register QEMU](https://docs.docker.com/build/building/multi-platform/#qemu) first: +> `docker run --rm --privileged multiarch/qemu-user-static --reset -p yes` + +Push to ECR (replace `ACCOUNT_ID` and `REGION` with your AWS account ID and region): + +```sh +aws ecr get-login-password --region REGION | docker login --username AWS --password-stdin ACCOUNT_ID.dkr.ecr.REGION.amazonaws.com +aws ecr create-repository --repository-name loreserver --region REGION +docker tag loreserver:latest ACCOUNT_ID.dkr.ecr.REGION.amazonaws.com/loreserver:latest +docker push ACCOUNT_ID.dkr.ecr.REGION.amazonaws.com/loreserver:latest +``` + +### 2. Deploy + +```sh +cd examples/aws +cp terraform.tfvars.example terraform.tfvars +``` + +Edit `terraform.tfvars`: + +```hcl +region = "us-west-2" +container_image = "ACCOUNT_ID.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" +allowed_cidrs = ["10.0.0.0/8"] # Your VPC or VPN CIDR +``` + +```sh +terraform init +terraform apply +``` + +First apply may need a second run (DynamoDB PITR timing race). + +### 3. Connect + +Services run in private subnets. Access requires connectivity to the VPC (e.g., NLB in public subnets, AWS Client VPN, VPC peering, or a bastion host). + +Export the CA certificate so the Lore client trusts the server (the path is made absolute so it still resolves after changing directory): + +```sh +terraform output -raw ca_certificate_pem > lore-ca.pem +cat /etc/ssl/certs/ca-certificates.crt lore-ca.pem > combined-ca.pem +export SSL_CERT_FILE="$PWD/combined-ca.pem" +``` + +Create a repository and push your first asset: + +```sh +lore repository create lores://edge.lore.internal:41337/my-game +lore clone lores://edge.lore.internal:41337/my-game ./my-game +cp /path/to/assets/* ./my-game/ +cd my-game +lore stage . 
+lore commit "initial import" +lore push +``` + +Clone from another machine: + +```sh +lore clone lores://edge.lore.internal:41337/my-game ./my-game +``` + +**Note:** `edge.lore.internal` resolves via Cloud Map private DNS inside the VPC. + +## Architecture + +| Component | Instance | Purpose | +|-----------|----------|---------| +| Primary (ECS on EC2) | c8gd.8xlarge | Composite store: NVMe cache + S3 durable. Serves replication to edge. | +| Edge (ECS on EC2) | c8gd.8xlarge | Composite store: NVMe cache + replicated durable (QUIC to primary). Client-facing. | +| Cloud Map DNS | — | Service discovery (`primary.lore.internal`, `edge.lore.internal`) | +| VPC | — | Private subnets, NAT, S3/DynamoDB gateway endpoints | +| TLS CA | — | Self-signed; establishes trust between nodes and clients | + +**Startup:** Health check grace periods allow the primary (120s) and edge (300s) to initialize without being marked unhealthy. The edge's retry configuration handles Cloud Map DNS propagation delays automatically. On first deploy, edge nodes may restart 1-2 times while DNS propagates — this is expected and self-resolving. + +### Data flow + +``` +Client ──lores://──→ Edge (NVMe cache hit → instant response) + │ cache miss + ├──QUIC:41340──→ Primary (NVMe cache → S3 fallback) + └──gRPC:41337──→ Primary (branch resolution) +``` + +> **Instance sizing:** Use node sizes without network bandwidth caps (32+ vCPU) for production. This example uses c8gd.8xlarge (NVMe + Graviton). 
+ +## Verify + +```sh +aws ecs describe-services --cluster lore-cluster --services lore lore-edge \ + --query 'services[].{name:serviceName,running:runningCount}' --region us-west-2 +``` + +```sh +aws logs tail /ecs/lore --since 5m --region us-west-2 +``` + +## Customize + +| Need | What to change | +|------|----------------| +| Smaller instances (dev/test) | Set `instance_type = "c8gd.xlarge"` and lower `LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__MAX_SIZE` in `compute.tf` (hard-coded to 80% of the c8gd.8xlarge's 1.9 TB NVMe) to fit the smaller instance store — same architecture, less capacity | +| External access | Add an NLB in public subnets | +| Authentication | Set `LORE__SERVER__AUTH__JWK__ENDPOINT` ([docs](https://epicgames.github.io/lore/reference/lore-server-config/#authentication)) | +| More edge nodes | Increase ASG `max_size` + edge service `desired_count` | +| Faster edge startup | Consider adding a startup probe that polls `primary.lore.internal` before starting loreserver | +| Presigned URLs | Already configured via HMAC key in Secrets Manager | +| Production hardening | Add `deletion_protection_enabled = true` to DynamoDB tables | + +Full server configuration: [Lore Server config reference](https://epicgames.github.io/lore/reference/lore-server-config/) + +## Destroy + +The S3 bucket has `force_destroy = false` (prevents accidental data loss). Teardown takes ~6 minutes (capacity provider reconciliation). To destroy: + +```sh +aws s3 rm s3://$(terraform output -raw s3_bucket) --recursive +terraform destroy +``` + +If destroy fails on Cloud Map services ("Service contains registered instances"), scale to zero first: + +```sh +aws ecs update-service --cluster lore-cluster --service lore --desired-count 0 --region us-west-2 +aws ecs update-service --cluster lore-cluster --service lore-edge --desired-count 0 --region us-west-2 +sleep 30 +terraform destroy +``` + +For dev/test where you want one-command teardown, add `force_destroy = true` to the `aws_s3_bucket` resource. 
+ +## Prerequisites + +- [Terraform](https://developer.hashicorp.com/terraform/install) >= 1.7 +- AWS credentials with VPC, ECS, EC2, S3, DynamoDB, IAM, Secrets Manager, Cloud Map, Auto Scaling permissions +- Docker (to build the ARM64 container image) diff --git a/examples/aws/compute.tf b/examples/aws/compute.tf new file mode 100644 index 0000000..031e574 --- /dev/null +++ b/examples/aws/compute.tf @@ -0,0 +1,482 @@ +# ============================================================================= +# ECS on EC2 — c8gd.8xlarge with NVMe instance store for fragment caching +# +# This is the recommended deployment for Lore. The NVMe instance store provides +# sub-millisecond fragment reads for clones, while S3 provides durability. +# c8gd.8xlarge: 32 vCPU, 64 GB RAM, 1x 1.9 TB NVMe, 25 Gbps network. +# ============================================================================= + +data "aws_ssm_parameter" "ecs_ami" { + name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/arm64/recommended/image_id" +} + +resource "aws_ecs_cluster" "this" { + name = "${local.name}-cluster" + + setting { + name = "containerInsights" + value = "enabled" + } + + tags = local.tags +} + +resource "aws_cloudwatch_log_group" "lore" { + name = "/ecs/${local.name}" + retention_in_days = 7 + tags = local.tags +} + +# ============================================================================= +# Launch Template + ASG — ECS-managed instances with NVMe setup +# ============================================================================= + +resource "aws_launch_template" "ecs" { + name_prefix = "${local.name}-ecs-" + image_id = data.aws_ssm_parameter.ecs_ami.value + instance_type = var.instance_type + + iam_instance_profile { + arn = aws_iam_instance_profile.ecs_instance.arn + } + + vpc_security_group_ids = [aws_security_group.lore.id] + + user_data = base64encode(templatefile("${path.module}/user_data.sh.tpl", { + cluster_name = aws_ecs_cluster.this.name + mount_path = "/srv/urc" + })) + + 
metadata_options { + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 1 + } + + tag_specifications { + resource_type = "instance" + tags = merge(local.tags, { Name = "${local.name}-ecs" }) + } + + tags = local.tags +} + +resource "aws_autoscaling_group" "ecs" { + name_prefix = "${local.name}-ecs-" + min_size = 2 + max_size = 2 + desired_capacity = 2 + vpc_zone_identifier = aws_subnet.private[*].id + + launch_template { + id = aws_launch_template.ecs.id + version = "$Latest" + } + + protect_from_scale_in = true + + # Allows terraform destroy to delete the ASG without waiting for capacity + # provider reconciliation (~6 min). Remove for production if you want + # graceful drain before ASG deletion. + force_delete = true + + tag { + key = "AmazonECSManaged" + value = "true" + propagate_at_launch = true + } + + tag { + key = "Name" + value = "${local.name}-ecs" + propagate_at_launch = true + } + + lifecycle { + ignore_changes = [desired_capacity] + } +} + +# ============================================================================= +# Capacity Provider — links ASG to ECS cluster +# ============================================================================= + +resource "aws_ecs_capacity_provider" "ec2" { + name = "${local.name}-ec2" + + auto_scaling_group_provider { + auto_scaling_group_arn = aws_autoscaling_group.ecs.arn + managed_termination_protection = "ENABLED" + + managed_scaling { + status = "ENABLED" + target_capacity = 100 + minimum_scaling_step_size = 1 + maximum_scaling_step_size = 1 + } + } + + tags = local.tags +} + +resource "aws_ecs_cluster_capacity_providers" "this" { + cluster_name = aws_ecs_cluster.this.name + capacity_providers = [aws_ecs_capacity_provider.ec2.name] + + default_capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.ec2.name + weight = 100 + } +} + +# ============================================================================= +# Primary — Composite store (NVMe cache + 
durable S3), serves replication +# ============================================================================= + +resource "aws_ecs_task_definition" "lore" { + family = local.name + requires_compatibilities = ["EC2"] + network_mode = "awsvpc" + execution_role_arn = aws_iam_role.execution.arn + task_role_arn = aws_iam_role.task.arn + + runtime_platform { + operating_system_family = "LINUX" + cpu_architecture = "ARM64" + } + + volume { + name = "instance-store-cache" + host_path = "/srv/urc" + } + + volume { + name = "certs" + } + + container_definitions = jsonencode([ + { + name = "init-certs" + image = "public.ecr.aws/amazonlinux/amazonlinux:minimal" + essential = false + command = ["sh", "-c", "echo \"$CERT\" > /certs/fullchain.crt && echo \"$KEY\" > /certs/server.key && chmod 600 /certs/server.key && echo \"$CA\" > /certs/ca.pem"] + + secrets = [ + { name = "CERT", valueFrom = "${aws_secretsmanager_secret.tls.arn}:fullchain::" }, + { name = "KEY", valueFrom = "${aws_secretsmanager_secret.tls.arn}:key::" }, + { name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, + ] + + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + memoryReservation = 64 + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "init" + } + } + }, + { + name = "loreserver" + image = var.container_image + essential = true + + dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + memoryReservation = 8192 + + portMappings = [ + { containerPort = local.port_quic_grpc, protocol = "tcp" }, + { containerPort = local.port_quic_grpc, protocol = "udp" }, + { containerPort = local.port_http, protocol = "tcp" }, + { containerPort = local.port_replication, protocol = "udp" }, + ] + + mountPoints = [ + { sourceVolume = "instance-store-cache", containerPath = "/srv/urc", readOnly = false }, + { sourceVolume = 
"certs", containerPath = "/certs", readOnly = true }, + ] + + secrets = [ + { name = "LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY", valueFrom = aws_secretsmanager_secret.hmac.arn }, + ] + + environment = [ + { name = "LORE_ENV", value = "docker" }, + { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, + + # TLS + { name = "LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__VERIFY_CLIENT_CERTS", value = "false" }, + + # Internal QUIC for edge replication + { name = "LORE__SERVER__QUIC_INTERNAL__ENABLED", value = "true" }, + { name = "LORE__SERVER__QUIC_INTERNAL__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__QUIC_INTERNAL__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__QUIC_INTERNAL__VERIFY_CLIENT_CERTS", value = "false" }, + + # Storage: composite (NVMe cache + S3 durable) + { name = "LORE__IMMUTABLE_STORE__MODE", value = "composite" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__MODE", value = "local" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__PATH", value = "/srv/urc" }, + # 80% of c8gd.8xlarge NVMe (1.9 TB). Reserves 20% for xfs metadata/journal. + # The fragment cache is the only consumer of the instance store. 
+ { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__MAX_SIZE", value = "1520000000000" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__FLUSH_DELAY_SECONDS", value = "10" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__MODE", value = "aws" }, + { name = "LORE__MUTABLE_STORE__MODE", value = "aws" }, + { name = "LORE__LOCK_STORE__MODE", value = "aws" }, + + # AWS plugin config + { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__S3_BUCKET", value = aws_s3_bucket.fragments.id }, + { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_FRAGMENTS_TABLE", value = aws_dynamodb_table.fragments.name }, + { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_METADATA_TABLE", value = aws_dynamodb_table.metadata.name }, + { name = "LORE__PLUGINS__AWS__MUTABLE_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.mutable.name }, + { name = "LORE__PLUGINS__AWS__LOCK_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.locks.name }, + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "lore" + } + } + }, + ]) + + tags = local.tags +} + +# Primary ECS service: runs one task of the primary task definition on the EC2 +# capacity provider and registers it in Cloud Map as the "primary" service. +resource "aws_ecs_service" "lore" { + name = local.name + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.lore.arn + desired_count = 1 + + health_check_grace_period_seconds = 120 + + capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.ec2.name + weight = 100 + } + + network_configuration { + subnets = aws_subnet.private[*].id + security_groups = [aws_security_group.lore.id] + } + + service_registries { + registry_arn = aws_service_discovery_service.lore.arn + } + + placement_constraints { + type = "distinctInstance" + } + + # The capacity provider must be attached to the cluster before a service can + # reference it in capacity_provider_strategy, and must stay attached until the + # service is gone. Without this explicit edge Terraform may create the service + # (or detach the provider on destroy) in the wrong order. + depends_on = [aws_ecs_cluster_capacity_providers.this] + + tags = local.tags +} + +# ============================================================================= +# Cloud Map — Service discovery for edge → primary and client → edge +# +# NOTE: terraform destroy may fail if ECS 
tasks are still registered. If this +# happens, scale services to 0 and wait 30s before re-running destroy. +# ============================================================================= + +resource "aws_service_discovery_private_dns_namespace" "this" { + name = "${local.name}.internal" + vpc = aws_vpc.this.id + tags = local.tags +} + +resource "aws_service_discovery_service" "lore" { + name = "primary" + + dns_config { + namespace_id = aws_service_discovery_private_dns_namespace.this.id + dns_records { + ttl = 10 + type = "A" + } + routing_policy = "MULTIVALUE" + } + + health_check_custom_config {} + + tags = local.tags +} + +resource "aws_service_discovery_service" "edge" { + name = "edge" + + dns_config { + namespace_id = aws_service_discovery_private_dns_namespace.this.id + dns_records { + ttl = 10 + type = "A" + } + routing_policy = "MULTIVALUE" + } + + health_check_custom_config {} + + tags = local.tags +} + +# ============================================================================= +# Edge — Composite store (NVMe cache + replicated durable via QUIC to primary) +# ============================================================================= + +resource "aws_ecs_task_definition" "edge" { + family = "${local.name}-edge" + requires_compatibilities = ["EC2"] + network_mode = "awsvpc" + execution_role_arn = aws_iam_role.execution.arn + task_role_arn = aws_iam_role.edge_task.arn + + runtime_platform { + operating_system_family = "LINUX" + cpu_architecture = "ARM64" + } + + volume { + name = "instance-store-cache" + host_path = "/srv/urc" + } + + volume { + name = "certs" + } + + container_definitions = jsonencode([ + { + name = "init-certs" + image = "public.ecr.aws/amazonlinux/amazonlinux:minimal" + essential = false + command = ["sh", "-c", "echo \"$CERT\" > /certs/fullchain.crt && echo \"$KEY\" > /certs/server.key && chmod 600 /certs/server.key && cat /etc/pki/tls/certs/ca-bundle.crt > /certs/ca.pem && echo \"$CA\" >> /certs/ca.pem"] + + secrets = [ + { 
name = "CERT", valueFrom = "${aws_secretsmanager_secret.tls.arn}:fullchain::" }, + { name = "KEY", valueFrom = "${aws_secretsmanager_secret.tls.arn}:key::" }, + { name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, + ] + + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + memoryReservation = 64 + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "edge-init" + } + } + }, + { + name = "loreserver" + image = var.container_image + essential = true + + dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + memoryReservation = 8192 + + portMappings = [ + { containerPort = local.port_quic_grpc, protocol = "tcp" }, + { containerPort = local.port_quic_grpc, protocol = "udp" }, + { containerPort = local.port_http, protocol = "tcp" }, + ] + + mountPoints = [ + { sourceVolume = "instance-store-cache", containerPath = "/srv/urc", readOnly = false }, + { sourceVolume = "certs", containerPath = "/certs", readOnly = true }, + ] + + secrets = [ + { name = "LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY", valueFrom = aws_secretsmanager_secret.hmac.arn }, + ] + + environment = [ + { name = "LORE_ENV", value = "docker" }, + { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, + { name = "SSL_CERT_FILE", value = "/certs/ca.pem" }, + + # TLS for client-facing endpoints + { name = "LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__VERIFY_CLIENT_CERTS", value = "false" }, + + # Storage: composite (NVMe cache + replicated durable via QUIC to primary) + { name = 
"LORE__IMMUTABLE_STORE__MODE", value = "composite" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__MODE", value = "local" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__PATH", value = "/srv/urc" }, + # 80% of c8gd.8xlarge NVMe (1.9 TB). Reserves 20% for xfs metadata/journal. + # The fragment cache is the only consumer of the instance store. + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__MAX_SIZE", value = "1520000000000" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__FLUSH_DELAY_SECONDS", value = "10" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__MODE", value = "replicated" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_replication}" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__PERIODIC_CLIENT_REFRESH_SECS", value = "180" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REGENERATE_RETRY__INITIAL_BACKOFF_MS", value = "100" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REGENERATE_RETRY__MAX_BACKOFF_MS", value = "1000" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REGENERATE_RETRY__MAX_ATTEMPTS", value = "10" }, + + # Branch resolution proxied to primary + { name = "LORE__MUTABLE_STORE__MODE", value = "remote" }, + { name = "LORE__MUTABLE_STORE__REMOTE__REMOTE_URL", value = "lores://primary.${local.name}.internal:${local.port_quic_grpc}" }, + { name = "LORE__LOCK_STORE__MODE", value = "local" }, + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "edge" + } + } + }, + ]) + + tags = local.tags +} + +resource "aws_ecs_service" "edge" { + name = "${local.name}-edge" + cluster = aws_ecs_cluster.this.id + task_definition = aws_ecs_task_definition.edge.arn + desired_count = 1 + + 
health_check_grace_period_seconds = 300 + + capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.ec2.name + weight = 100 + } + + network_configuration { + subnets = aws_subnet.private[*].id + security_groups = [aws_security_group.lore.id] + } + + service_registries { + registry_arn = aws_service_discovery_service.edge.arn + } + + placement_constraints { + type = "distinctInstance" + } + + depends_on = [aws_ecs_service.lore] + + tags = local.tags +} diff --git a/examples/aws/iam.tf b/examples/aws/iam.tf new file mode 100644 index 0000000..f87f39e --- /dev/null +++ b/examples/aws/iam.tf @@ -0,0 +1,162 @@ +# ============================================================================= +# IAM — EC2 instance role, ECS task roles, execution role +# ============================================================================= + +# EC2 instance role — ECS agent needs to communicate with the ECS API +resource "aws_iam_role" "ecs_instance" { + name_prefix = "${local.name}-instance-" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "ec2.amazonaws.com" } + }] + }) + tags = local.tags +} + +resource "aws_iam_role_policy_attachment" "ecs_instance_role" { + role = aws_iam_role.ecs_instance.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" +} + +resource "aws_iam_role_policy_attachment" "ecs_instance_ssm" { + role = aws_iam_role.ecs_instance.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_instance_profile" "ecs_instance" { + name_prefix = "${local.name}-instance-" + role = aws_iam_role.ecs_instance.name + tags = local.tags +} + +# Primary task role — S3 + DynamoDB access for durable storage +resource "aws_iam_role" "task" { + name_prefix = "${local.name}-task-" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = 
"sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "ecs-tasks.amazonaws.com" } + }] + }) + tags = local.tags +} + +# Edge task role — intentionally empty. Edge proxies all storage operations +# through the primary via gRPC/QUIC, so it needs no direct S3 or DynamoDB access. +resource "aws_iam_role" "edge_task" { + name_prefix = "${local.name}-edge-task-" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "ecs-tasks.amazonaws.com" } + }] + }) + tags = local.tags +} + +resource "aws_iam_role_policy" "task_s3" { + name_prefix = "s3-" + role = aws_iam_role.task.id + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "s3:GetObject", + "s3:PutObject", + "s3:DeleteObject", + "s3:DeleteObjectVersion", + "s3:ListBucket", + "s3:ListBucketVersions", + ] + Resource = [ + aws_s3_bucket.fragments.arn, + "${aws_s3_bucket.fragments.arn}/*", + ] + }] + }) +} + +resource "aws_iam_role_policy" "task_dynamodb" { + name_prefix = "dynamodb-" + role = aws_iam_role.task.id + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = [ + "dynamodb:GetItem", + "dynamodb:PutItem", + "dynamodb:DeleteItem", + "dynamodb:Query", + "dynamodb:BatchGetItem", + "dynamodb:DescribeTable", + "dynamodb:TransactWriteItems", + ] + Resource = [ + aws_dynamodb_table.fragments.arn, + aws_dynamodb_table.metadata.arn, + aws_dynamodb_table.mutable.arn, + aws_dynamodb_table.locks.arn, + "${aws_dynamodb_table.locks.arn}/index/*", + ] + }] + }) +} + +# Execution role — ECS agent pulls images, writes logs, reads secrets +resource "aws_iam_role" "execution" { + name_prefix = "${local.name}-exec-" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "ecs-tasks.amazonaws.com" } + }] + }) + tags = local.tags +} + +resource 
"aws_iam_role_policy_attachment" "execution_ecr" { + role = aws_iam_role.execution.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" +} + +resource "aws_iam_role_policy" "execution_secrets" { + name_prefix = "secrets-" + role = aws_iam_role.execution.id + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = ["secretsmanager:GetSecretValue"] + Resource = [aws_secretsmanager_secret.tls.arn, aws_secretsmanager_secret.hmac.arn] + }] + }) +} + +# ============================================================================= +# HMAC Key — presigned URL feature for fragment transfer between nodes +# ============================================================================= + +resource "random_id" "hmac" { + byte_length = 32 +} + +resource "aws_secretsmanager_secret" "hmac" { + name_prefix = "${local.name}-hmac-" + tags = local.tags +} + +resource "aws_secretsmanager_secret_version" "hmac" { + secret_id = aws_secretsmanager_secret.hmac.id + secret_string = random_id.hmac.hex +} diff --git a/examples/aws/main.tf b/examples/aws/main.tf new file mode 100644 index 0000000..b644544 --- /dev/null +++ b/examples/aws/main.tf @@ -0,0 +1,17 @@ +provider "aws" { + region = var.region +} + +locals { + name = var.name + tags = { ManagedBy = "terraform", Project = "lore" } + + # Ports — match lore-server/config/default.toml + port_quic_grpc = 41337 # QUIC (UDP) + gRPC (TCP) + port_http = 41339 # Health checks, presigned URLs + port_replication = 41340 # QUIC internal replication (UDP) +} + +data "aws_availability_zones" "available" { + state = "available" +} diff --git a/examples/aws/network.tf b/examples/aws/network.tf new file mode 100644 index 0000000..48b109d --- /dev/null +++ b/examples/aws/network.tf @@ -0,0 +1,177 @@ +# ============================================================================= +# VPC — minimal 2-AZ layout with public + private subnets +# 
============================================================================= + +resource "aws_vpc" "this" { + cidr_block = "10.0.0.0/16" + enable_dns_hostnames = true + enable_dns_support = true + tags = merge(local.tags, { Name = "${local.name}-vpc" }) +} + +resource "aws_internet_gateway" "this" { + vpc_id = aws_vpc.this.id + tags = merge(local.tags, { Name = "${local.name}-igw" }) +} + +resource "aws_subnet" "public" { + count = 2 + vpc_id = aws_vpc.this.id + cidr_block = cidrsubnet(aws_vpc.this.cidr_block, 8, count.index) + availability_zone = data.aws_availability_zones.available.names[count.index] + map_public_ip_on_launch = true + tags = merge(local.tags, { Name = "${local.name}-public-${count.index}" }) +} + +resource "aws_subnet" "private" { + count = 2 + vpc_id = aws_vpc.this.id + cidr_block = cidrsubnet(aws_vpc.this.cidr_block, 8, count.index + 10) + availability_zone = data.aws_availability_zones.available.names[count.index] + tags = merge(local.tags, { Name = "${local.name}-private-${count.index}" }) +} + +resource "aws_eip" "nat" { + domain = "vpc" + tags = merge(local.tags, { Name = "${local.name}-nat-eip" }) +} + +resource "aws_nat_gateway" "this" { + allocation_id = aws_eip.nat.id + subnet_id = aws_subnet.public[0].id + tags = merge(local.tags, { Name = "${local.name}-nat" }) +} + +resource "aws_route_table" "public" { + vpc_id = aws_vpc.this.id + tags = merge(local.tags, { Name = "${local.name}-public-rt" }) +} + +resource "aws_route" "public_internet" { + route_table_id = aws_route_table.public.id + destination_cidr_block = "0.0.0.0/0" + gateway_id = aws_internet_gateway.this.id +} + +resource "aws_route_table_association" "public" { + count = 2 + subnet_id = aws_subnet.public[count.index].id + route_table_id = aws_route_table.public.id +} + +resource "aws_route_table" "private" { + vpc_id = aws_vpc.this.id + tags = merge(local.tags, { Name = "${local.name}-private-rt" }) +} + +resource "aws_route" "private_nat" { + route_table_id = 
aws_route_table.private.id + destination_cidr_block = "0.0.0.0/0" + nat_gateway_id = aws_nat_gateway.this.id +} + +resource "aws_route_table_association" "private" { + count = 2 + subnet_id = aws_subnet.private[count.index].id + route_table_id = aws_route_table.private.id +} + +# ============================================================================= +# Security Group — Lore server +# ============================================================================= + +resource "aws_security_group" "lore" { + name_prefix = "${local.name}-server-" + description = "Lore server ports" + vpc_id = aws_vpc.this.id + tags = merge(local.tags, { Name = "${local.name}-server-sg" }) + + lifecycle { create_before_destroy = true } +} + +# Client access: QUIC (UDP) + gRPC (TCP) on 41337 +resource "aws_vpc_security_group_ingress_rule" "client_quic" { + for_each = toset(var.allowed_cidrs) + security_group_id = aws_security_group.lore.id + from_port = local.port_quic_grpc + to_port = local.port_quic_grpc + ip_protocol = "udp" + cidr_ipv4 = each.value + description = "Client QUIC" +} + +resource "aws_vpc_security_group_ingress_rule" "client_grpc" { + for_each = toset(var.allowed_cidrs) + security_group_id = aws_security_group.lore.id + from_port = local.port_quic_grpc + to_port = local.port_quic_grpc + ip_protocol = "tcp" + cidr_ipv4 = each.value + description = "Client gRPC" +} + +# HTTP health checks + presigned URLs +resource "aws_vpc_security_group_ingress_rule" "client_http" { + for_each = toset(var.allowed_cidrs) + security_group_id = aws_security_group.lore.id + from_port = local.port_http + to_port = local.port_http + ip_protocol = "tcp" + cidr_ipv4 = each.value + description = "Client HTTP" +} + +# Internal: QUIC replication (edge → primary on 41340 UDP) +# Port comes from local.port_replication so it stays in sync with the task definitions. +resource "aws_vpc_security_group_ingress_rule" "replication_quic" { + security_group_id = aws_security_group.lore.id + from_port = local.port_replication + to_port = local.port_replication + ip_protocol = "udp" + referenced_security_group_id = 
aws_security_group.lore.id + description = "QUIC replication between Lore nodes" +} + +# Internal: gRPC (edge → primary on 41337 TCP for remote mutable store) +# Port comes from local.port_quic_grpc, the same local the client rules use. +resource "aws_vpc_security_group_ingress_rule" "internal_grpc" { + security_group_id = aws_security_group.lore.id + from_port = local.port_quic_grpc + to_port = local.port_quic_grpc + ip_protocol = "tcp" + referenced_security_group_id = aws_security_group.lore.id + description = "gRPC between Lore nodes" +} + +# Internal: QUIC (edge → primary on 41337 UDP for replicated immutable store) +resource "aws_vpc_security_group_ingress_rule" "internal_quic" { + security_group_id = aws_security_group.lore.id + from_port = local.port_quic_grpc + to_port = local.port_quic_grpc + ip_protocol = "udp" + referenced_security_group_id = aws_security_group.lore.id + description = "QUIC between Lore nodes" +} + +resource "aws_vpc_security_group_egress_rule" "all" { + security_group_id = aws_security_group.lore.id + ip_protocol = "-1" + cidr_ipv4 = "0.0.0.0/0" + description = "All outbound" +} + +# ============================================================================= +# VPC Endpoints — S3 and DynamoDB (avoid NAT costs for AWS API traffic) +# ============================================================================= + +resource "aws_vpc_endpoint" "s3" { + vpc_id = aws_vpc.this.id + service_name = "com.amazonaws.${var.region}.s3" + route_table_ids = [aws_route_table.private.id] + tags = merge(local.tags, { Name = "${local.name}-s3-endpoint" }) +} + +resource "aws_vpc_endpoint" "dynamodb" { + vpc_id = aws_vpc.this.id + service_name = "com.amazonaws.${var.region}.dynamodb" + route_table_ids = [aws_route_table.private.id] + tags = merge(local.tags, { Name = "${local.name}-dynamodb-endpoint" }) +} diff --git a/examples/aws/outputs.tf b/examples/aws/outputs.tf new file mode 100644 index 0000000..ef00c59 --- /dev/null +++ b/examples/aws/outputs.tf @@ -0,0 +1,39 @@ +output "cluster_name" { + description = "ECS cluster name" + value = aws_ecs_cluster.this.name +} + +output 
"service_name" { + description = "ECS service name (primary)" + value = aws_ecs_service.lore.name +} + +output "edge_service_name" { + description = "ECS service name (edge)" + value = aws_ecs_service.edge.name +} + +output "primary_dns" { + description = "Cloud Map DNS for primary (used by edge pods)" + value = "primary.${aws_service_discovery_private_dns_namespace.this.name}" +} + +output "edge_dns" { + description = "Cloud Map DNS for edge (used by clients)" + value = "edge.${aws_service_discovery_private_dns_namespace.this.name}" +} + +output "s3_bucket" { + description = "S3 bucket for fragment storage" + value = aws_s3_bucket.fragments.id +} + +output "log_group" { + description = "CloudWatch log group" + value = aws_cloudwatch_log_group.lore.name +} + +output "ca_certificate_pem" { + description = "CA certificate — clients need this to trust the server's TLS cert" + value = local.ca_pem +} diff --git a/examples/aws/scripts/e2e-test.sh b/examples/aws/scripts/e2e-test.sh new file mode 100755 index 0000000..29eaa5f --- /dev/null +++ b/examples/aws/scripts/e2e-test.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# scripts/e2e-test.sh — End-to-end validation of Lore push + clone via the edge node. +# Requires: terraform apply completed, AWS credentials, SSM access to instances. +# Platforms: Linux, macOS (runs remotely on Graviton instance via SSM) +# +# Usage: ./scripts/e2e-test.sh [region] +# +# Builds the Lore CLI from source inside a Docker container on one of the +# ECS instances, then pushes a 10MB test file and clones it back to verify +# data integrity through the full storage chain (NVMe cache → S3 → replication). +set -euo pipefail + +REGION="${1:-us-west-2}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +EXAMPLE_DIR="$SCRIPT_DIR/.." 
+
+cd "$EXAMPLE_DIR"
+
+echo "=== E2E Test: Lore on AWS ==="
+
+# Get deployment info from terraform
+S3_BUCKET=$(terraform output -raw s3_bucket)
+CA_CERT=$(terraform output -raw ca_certificate_pem)
+EDGE_DNS=$(terraform output -raw edge_dns)
+PRIMARY_DNS=$(terraform output -raw primary_dns)
+echo " Bucket: $S3_BUCKET"
+echo " Edge: $EDGE_DNS"
+echo " Primary: $PRIMARY_DNS"
+
+# Find an instance to run on (uses ECS-managed tag, not the Name tag)
+CLUSTER=$(terraform output -raw cluster_name)
+INSTANCE_ID=$(aws ec2 describe-instances \
+  --filters "Name=tag:aws:ecs:clusterName,Values=$CLUSTER" 'Name=instance-state-name,Values=running' \
+  --query 'Reservations[0].Instances[0].InstanceId' \
+  --output text --region "$REGION")
+# With --output text an empty query result is printed as the literal "None".
+# Stop here instead of handing that to SSM as an instance id.
+if [[ -z "$INSTANCE_ID" || "$INSTANCE_ID" == "None" ]]; then
+  echo "ERROR: no running instance found for ECS cluster '$CLUSTER' in $REGION" >&2
+  exit 1
+fi
+echo " Instance: $INSTANCE_ID"
+
+# Upload source if not already present
+# (an archive left by an earlier run is reused as-is)
+if ! aws s3 ls "s3://$S3_BUCKET/build/lore-src.tar.gz" --region "$REGION" >/dev/null 2>&1; then
+  echo " Uploading Lore source to S3..."
+  REPO_ROOT="$(cd "$EXAMPLE_DIR/../.." && pwd)"
+  tar -czf /tmp/lore-src.tar.gz -C "$REPO_ROOT" \
+    --exclude=target --exclude=.git --exclude='examples/aws/.terraform*' \
+    --exclude='*.tfstate*' --exclude='*.tfvars' .
+  aws s3 cp /tmp/lore-src.tar.gz "s3://$S3_BUCKET/build/lore-src.tar.gz" --region "$REGION"
+fi
+
+# The instance downloads the archive through this URL (valid for 900 seconds).
+PRESIGNED_URL=$(aws s3 presign "s3://$S3_BUCKET/build/lore-src.tar.gz" --expires-in 900 --region "$REGION")
+
+# Write the CA cert for the combined bundle
+# (local copy only; the remote command below embeds $CA_CERT directly)
+echo "$CA_CERT" > /tmp/e2e-ca.pem
+
+echo ""
+echo "=== Building Lore CLI on $INSTANCE_ID (takes ~4 min) ==="
+
+# Remote script, run on the instance through SSM:
+#   1. download and unpack the source archive
+#   2. append the deployment CA to the system bundle
+#   3. in a rust:latest container: build the client, create a repo, push a
+#      10MB random file, clone it back and compare the two copies
+# `set -o pipefail` makes a failed `cargo build` stop the chain even though its
+# output is piped through tail. `cmp` exits non-zero when the pushed and cloned
+# files differ, so the SSM status only becomes Success when the data round-trips
+# intact; md5sum afterwards just prints the checksums for the log.
+COMMAND_ID=$(aws ssm send-command \
+  --instance-ids "$INSTANCE_ID" \
+  --document-name "AWS-RunShellScript" \
+  --parameters "commands=[
+    \"set -ex\",
+    \"curl -sSo /tmp/lore-src.tar.gz '$PRESIGNED_URL'\",
+    \"rm -rf /tmp/lore-src && mkdir -p /tmp/lore-src && tar -xzf /tmp/lore-src.tar.gz -C /tmp/lore-src\",
+    \"echo '$CA_CERT' > /tmp/lore-ca.pem && cat /etc/pki/tls/certs/ca-bundle.crt /tmp/lore-ca.pem > /tmp/combined-ca.pem\",
+    \"docker run --rm --network host -v /tmp/lore-src:/src -v /tmp/combined-ca.pem:/certs/ca.pem -w /src -e SSL_CERT_FILE=/certs/ca.pem rust:latest bash -c 'set -o pipefail && apt-get update -qq && apt-get install -y -qq pkg-config libssl-dev protobuf-compiler >/dev/null 2>&1 && cargo build --release -p lore-client 2>&1 | tail -3 && echo BUILD_OK && REPO=e2e-\$(date +%s) && ./target/release/lore --version && echo === CREATE REPO \$REPO === && ./target/release/lore repository create lores://$PRIMARY_DNS:41337/\$REPO && echo === CLONE === && ./target/release/lore clone lores://$PRIMARY_DNS:41337/\$REPO /tmp/e2e && echo === ADD 10MB FILE === && dd if=/dev/urandom of=/tmp/e2e/asset.bin bs=1M count=10 2>&1 && cd /tmp/e2e && echo === STAGE === && /src/target/release/lore stage asset.bin && echo === COMMIT === && /src/target/release/lore commit --non-interactive e2e-test && echo === PUSH === && /src/target/release/lore push && echo === CLONE BACK === && rm -rf /tmp/clone && /src/target/release/lore clone lores://$PRIMARY_DNS:41337/\$REPO /tmp/clone && echo === VERIFY === && cmp /tmp/e2e/asset.bin /tmp/clone/asset.bin && md5sum /tmp/e2e/asset.bin /tmp/clone/asset.bin'\"
+  ]" \
+  --timeout-seconds 900 \
+  --query 'Command.CommandId' \
+  --output text \
+  --region "$REGION")
+
+echo " Command: $COMMAND_ID"
+echo " Waiting for completion..."
+
+# Print one field of this run's SSM command invocation as text.
+#   $1 — JMESPath query (Status, StandardOutputContent, StandardErrorContent)
+invocation_field() {
+  aws ssm get-command-invocation \
+    --command-id "$COMMAND_ID" \
+    --instance-id "$INSTANCE_ID" \
+    --query "$1" \
+    --output text \
+    --region "$REGION"
+}
+
+# Poll until done
+while true; do
+  sleep 30
+  # A failed lookup is reported as Pending, so the loop simply tries again.
+  STATUS=$(invocation_field 'Status' 2>/dev/null || echo "Pending")
+
+  case "$STATUS" in
+    # Delayed is a non-terminal SSM status, so keep waiting on it as well.
+    InProgress|Pending|Delayed) echo " ... still running" ;;
+    Success)
+      echo ""
+      echo "=== SUCCESS ==="
+      # -A2: md5sum prints one line per file, so show both checksum lines.
+      invocation_field 'StandardOutputContent' | grep -A2 "VERIFY"
+      echo ""
+      echo "✓ Push + Clone verified. MD5 checksums match."
+      exit 0
+      ;;
+    *)
+      echo ""
+      echo "=== FAILED (status: $STATUS) ==="
+      invocation_field 'StandardOutputContent' | tail -20
+      echo "---STDERR---"
+      # Drop nested xtrace lines and the PEM body that the trace echoes back.
+      invocation_field 'StandardErrorContent' | grep -v "^++" | grep -v "MII\|BEGIN\|END" | tail -10
+      exit 1
+      ;;
+  esac
+done
diff --git a/examples/aws/storage.tf b/examples/aws/storage.tf
new file mode 100644
index 0000000..6fe8b46
--- /dev/null
+++ b/examples/aws/storage.tf
@@ -0,0 +1,203 @@
+# =============================================================================
+# S3 — Fragment payloads (immutable store)
+# =============================================================================
+
+# force_destroy defaults to false — the bucket cannot be destroyed with data inside.
+# For dev/test teardown, set force_destroy = true or empty the bucket before destroy.
+resource "aws_s3_bucket" "fragments" {
+  tags          = local.tags
+  bucket_prefix = "${local.name}-fragments-"
+}
+
+# Object versioning is switched on for the fragments bucket.
+resource "aws_s3_bucket_versioning" "fragments" {
+  bucket = aws_s3_bucket.fragments.id
+
+  versioning_configuration {
+    status = "Enabled"
+  }
+}
+
+# Default server-side encryption uses the AES256 algorithm.
+resource "aws_s3_bucket_server_side_encryption_configuration" "fragments" {
+  bucket = aws_s3_bucket.fragments.id
+
+  rule {
+    apply_server_side_encryption_by_default {
+      sse_algorithm = "AES256"
+    }
+  }
+}
+
+# All four public-access guards are turned on.
+resource "aws_s3_bucket_public_access_block" "fragments" {
+  bucket                  = aws_s3_bucket.fragments.id
+  restrict_public_buckets = true
+  ignore_public_acls      = true
+  block_public_policy     = true
+  block_public_acls       = true
+}
+
+# Multipart uploads that never complete are aborted 7 days after they start.
+resource "aws_s3_bucket_lifecycle_configuration" "fragments" {
+  bucket = aws_s3_bucket.fragments.id
+
+  rule {
+    status = "Enabled"
+    id     = "abort-incomplete-multipart"
+    filter {}
+    abort_incomplete_multipart_upload {
+      days_after_initiation = 7
+    }
+  }
+}
+
+# =============================================================================
+# DynamoDB — Fragment associations
+# Key schema from lore-aws/src/store/immutable_store.rs
+# =============================================================================
+
+# On-demand table keyed by hash (partition) + repository_context (sort),
+# both binary. Point-in-time recovery is enabled.
+resource "aws_dynamodb_table" "fragments" {
+  name         = "${local.name}-fragments"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "hash"
+  range_key    = "repository_context"
+  tags         = local.tags
+
+  point_in_time_recovery {
+    enabled = true
+  }
+
+  attribute {
+    name = "hash"
+    type = "B"
+  }
+
+  attribute {
+    name = "repository_context"
+    type = "B"
+  }
+}
+
+# =============================================================================
+# DynamoDB — Fragment metadata (hash-only key, no sort key)
+# Key schema from lore-aws/src/store/immutable_store.rs
+# =============================================================================
+
+# On-demand table keyed by the binary hash alone. Point-in-time recovery is enabled.
+resource "aws_dynamodb_table" "metadata" {
+  name         = "${local.name}-metadata"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "hash"
+  tags         = local.tags
+
+  point_in_time_recovery {
+    enabled = true
+  }
+
+  attribute {
+    name = "hash"
+    type = "B"
+  }
+}
+
+# =============================================================================
+# DynamoDB — Mutable store (branch pointers)
+# Key schema from lore-aws/src/store/mutable_store.rs
+# =============================================================================
+
+# On-demand table keyed by repository_id (partition) + key (sort), both binary.
+# Point-in-time recovery is enabled.
+resource "aws_dynamodb_table" "mutable" {
+  name         = "${local.name}-mutable"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "repository_id"
+  range_key    = "key"
+  tags         = local.tags
+
+  point_in_time_recovery {
+    enabled = true
+  }
+
+  attribute {
+    name = "repository_id"
+    type = "B"
+  }
+
+  attribute {
+    name = "key"
+    type = "B"
+  }
+}
+
+# =============================================================================
+# DynamoDB — Distributed locks
+# Key schema + GSIs from lore-aws/src/store/lock_store.rs
+# =============================================================================
+
+# NOTE: Table-level hash_key/range_key emits a deprecation warning suggesting key_schema,
+# but key_schema blocks don't exist at the table level in the provider schema (only in GSIs).
+# The warning is premature — no migration path exists yet for table primary keys.
+
+# Deletion protection disabled for teardown convenience.
+# Production: add deletion_protection_enabled = true to each table.
+
+# On-demand lock table keyed by hash (partition) + repositoryBranch (sort),
+# with three global secondary indexes that each project all attributes.
+resource "aws_dynamodb_table" "locks" {
+  name         = "${local.name}-locks"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "hash"
+  range_key    = "repositoryBranch"
+  tags         = local.tags
+
+  point_in_time_recovery {
+    enabled = true
+  }
+
+  # Binary attributes used by the table key and the indexes.
+  attribute {
+    name = "hash"
+    type = "B"
+  }
+
+  attribute {
+    name = "repositoryBranch"
+    type = "B"
+  }
+
+  attribute {
+    name = "repository"
+    type = "B"
+  }
+
+  attribute {
+    name = "branch"
+    type = "B"
+  }
+
+  # String attributes used by the indexes.
+  attribute {
+    name = "ownerId"
+    type = "S"
+  }
+
+  attribute {
+    name = "description"
+    type = "S"
+  }
+
+  # Index: ownerId (hash) + repositoryBranch (range).
+  global_secondary_index {
+    name            = "owner-repo-branch"
+    projection_type = "ALL"
+
+    key_schema {
+      attribute_name = "ownerId"
+      key_type       = "HASH"
+    }
+
+    key_schema {
+      attribute_name = "repositoryBranch"
+      key_type       = "RANGE"
+    }
+  }
+
+  # Index: repository (hash) + branch (range).
+  global_secondary_index {
+    name            = "repo-branch"
+    projection_type = "ALL"
+
+    key_schema {
+      attribute_name = "repository"
+      key_type       = "HASH"
+    }
+
+    key_schema {
+      attribute_name = "branch"
+      key_type       = "RANGE"
+    }
+  }
+
+  # Index: repositoryBranch (hash) + description (range).
+  global_secondary_index {
+    name            = "repo-branch-description"
+    projection_type = "ALL"
+
+    key_schema {
+      attribute_name = "repositoryBranch"
+      key_type       = "HASH"
+    }
+
+    key_schema {
+      attribute_name = "description"
+      key_type       = "RANGE"
+    }
+  }
+}
diff --git a/examples/aws/terraform.tfvars.example b/examples/aws/terraform.tfvars.example
new file mode 100644
index 0000000..f102229
--- /dev/null
+++ b/examples/aws/terraform.tfvars.example
@@ -0,0 +1,4 @@
+region          = "us-west-2"
+container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" # must support linux/arm64
+allowed_cidrs   = ["10.0.0.0/8"]
+# instance_type = "c8gd.8xlarge" # default — 32 vCPU, 64 GB, 1.9 TB NVMe, 25 Gbps
diff --git a/examples/aws/tests/plan.tftest.hcl b/examples/aws/tests/plan.tftest.hcl
new file mode 100644
index 0000000..8fca227
--- /dev/null
+++ b/examples/aws/tests/plan.tftest.hcl
@@ -0,0 +1,111 @@
+# Plan-level validation — runs without AWS credentials.
+# Catches stale resource schemas, broken variable wiring, and
+# Terraform/provider version incompatibilities.
+#
+# Run: terraform test
+
+# Every provider is mocked, so no real API is contacted.
+mock_provider "aws" {}
+mock_provider "tls" {}
+mock_provider "random" {}
+
+# Fixed values for the two data sources the configuration reads.
+override_data {
+  target = data.aws_ssm_parameter.ecs_ami
+  values = {
+    value = "ami-0123456789abcdef0"
+  }
+}
+
+override_data {
+  target = data.aws_availability_zones.available
+  values = {
+    names = ["us-west-2a", "us-west-2b"]
+  }
+}
+
+# Inputs shared by every run block below.
+variables {
+  name            = "lore"
+  region          = "us-west-2"
+  allowed_cidrs   = ["10.0.0.0/8"]
+  container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest"
+}
+
+# ECS cluster and both services carry the expected names.
+run "cluster_and_services_configured" {
+  command = plan
+
+  assert {
+    condition     = aws_ecs_cluster.this.name == "lore-cluster"
+    error_message = "Cluster name should be 'lore-cluster'"
+  }
+
+  assert {
+    condition     = aws_ecs_service.lore.name == "lore"
+    error_message = "Primary service name should be 'lore'"
+  }
+
+  assert {
+    condition     = aws_ecs_service.edge.name == "lore-edge"
+    error_message = "Edge service name should be 'lore-edge'"
+  }
+}
+
+# DynamoDB key names match what the tables are declared with.
+run "storage_schemas_correct" {
+  command = plan
+
+  assert {
+    condition     = aws_dynamodb_table.fragments.hash_key == "hash"
+    error_message = "Fragments table hash key must be 'hash'"
+  }
+
+  assert {
+    condition     = aws_dynamodb_table.fragments.range_key == "repository_context"
+    error_message = "Fragments table range key must be 'repository_context'"
+  }
+
+  assert {
+    condition     = aws_dynamodb_table.metadata.hash_key == "hash"
+    error_message = "Metadata table hash key must be 'hash'"
+  }
+
+  assert {
+    condition     = aws_dynamodb_table.mutable.hash_key == "repository_id"
+    error_message = "Mutable table hash key must be 'repository_id'"
+  }
+
+  assert {
+    condition     = aws_dynamodb_table.locks.hash_key == "hash"
+    error_message = "Locks table hash key must be 'hash'"
+  }
+
+  assert {
+    condition     = aws_dynamodb_table.locks.range_key == "repositoryBranch"
+    error_message = "Locks table range key must be 'repositoryBranch'"
+  }
+}
+
+# Cloud Map namespace and the primary service name.
+run "service_discovery_configured" {
+  command = plan
+
+  assert {
+    condition     = aws_service_discovery_private_dns_namespace.this.name == "lore.internal"
+    error_message = "Cloud Map namespace should be 'lore.internal'"
+  }
+
+  assert {
+    condition     = aws_service_discovery_service.lore.name == "primary"
+    error_message = "Cloud Map service name should be 'primary'"
+  }
+}
+
+# Launch template instance type and the Auto Scaling group floor.
+run "ec2_infrastructure_configured" {
+  command = plan
+
+  assert {
+    condition     = aws_launch_template.ecs.instance_type == "c8gd.8xlarge"
+    error_message = "Launch template should use c8gd.8xlarge"
+  }
+
+  assert {
+    condition     = aws_autoscaling_group.ecs.min_size == 2
+    error_message = "ASG min size should be 2 (primary + edge)"
+  }
+}
diff --git a/examples/aws/tls.tf b/examples/aws/tls.tf
new file mode 100644
index 0000000..18c7d4f
--- /dev/null
+++ b/examples/aws/tls.tf
@@ -0,0 +1,73 @@
+# =============================================================================
+# TLS — CA + server certificate for QUIC and gRPC between nodes
+#
+# The public QUIC endpoint generates an ephemeral cert if none is configured,
+# but the internal replication endpoint (quic_internal) requires an explicit
+# certificate. We generate a CA + server cert here so both primary and edge
+# can establish trusted QUIC connections.
+# =============================================================================
+
+# --- Certificate authority ---------------------------------------------------
+
+resource "tls_private_key" "ca" {
+  ecdsa_curve = "P384"
+  algorithm   = "ECDSA"
+}
+
+# Self-signed CA certificate, valid for 8760 hours (365 days).
+resource "tls_self_signed_cert" "ca" {
+  private_key_pem       = tls_private_key.ca.private_key_pem
+  is_ca_certificate     = true
+  validity_period_hours = 8760
+  allowed_uses          = ["cert_signing", "crl_signing"]
+
+  subject {
+    organization = "Lore Example"
+    common_name  = "${local.name}-ca"
+  }
+}
+
+# --- Server certificate ------------------------------------------------------
+
+resource "tls_private_key" "server" {
+  ecdsa_curve = "P384"
+  algorithm   = "ECDSA"
+}
+
+resource "tls_cert_request" "server" {
+  private_key_pem = tls_private_key.server.private_key_pem
+
+  # Cloud Map DNS names used by clients and inter-node communication
+  dns_names = [
+    "primary.${local.name}.internal",
+    "edge.${local.name}.internal",
+    "localhost",
+  ]
+
+  subject {
+    organization = "Lore Example"
+    common_name  = "lore-server"
+  }
+}
+
+# Server certificate signed by the CA above, valid for 8760 hours (365 days).
+resource "tls_locally_signed_cert" "server" {
+  ca_cert_pem           = tls_self_signed_cert.ca.cert_pem
+  ca_private_key_pem    = tls_private_key.ca.private_key_pem
+  cert_request_pem      = tls_cert_request.server.cert_request_pem
+  validity_period_hours = 8760
+  allowed_uses          = ["digital_signature", "key_encipherment", "server_auth"]
+}
+
+# Fullchain = server cert + CA cert
+locals {
+  ca_pem        = tls_self_signed_cert.ca.cert_pem
+  server_key    = tls_private_key.server.private_key_pem
+  fullchain_pem = "${tls_locally_signed_cert.server.cert_pem}${tls_self_signed_cert.ca.cert_pem}"
+}
+
+# --- Secrets Manager ---------------------------------------------------------
+
+resource "aws_secretsmanager_secret" "tls" {
+  tags        = local.tags
+  name_prefix = "${local.name}-tls-"
+}
+
+# One JSON document holding the chain, the server key and the CA certificate.
+resource "aws_secretsmanager_secret_version" "tls" {
+  secret_id = aws_secretsmanager_secret.tls.id
+  secret_string = jsonencode({
+    fullchain = local.fullchain_pem
+    key       = local.server_key
+    ca        = local.ca_pem
+  })
+}
diff --git a/examples/aws/user_data.sh.tpl b/examples/aws/user_data.sh.tpl
new file mode 100644
index 0000000..477122c
--- /dev/null
+++ b/examples/aws/user_data.sh.tpl
@@ -0,0 +1,38 @@
+#!/bin/bash +set -euo pipefail + +# Format NVMe instance store and register with ECS cluster. +# c8gd.8xlarge has 1x 1.9 TB NVMe SSD. + +MOUNT_PATH="${mount_path}" +ECS_CLUSTER="${cluster_name}" + +# Detect NVMe instance store devices (exclude EBS) +INSTANCE_STORE_DEVICES=() +for device in /dev/nvme*n1; do + [ -e "$device" ] || continue + devname=$(basename "$device") + model=$(cat "/sys/block/$devname/device/model" 2>/dev/null || echo "") + if [[ "$model" == *"Instance Storage"* ]]; then + INSTANCE_STORE_DEVICES+=("$device") + fi +done + +# Format and mount +if [ $${#INSTANCE_STORE_DEVICES[@]} -gt 0 ]; then + mkfs.xfs -f "$${INSTANCE_STORE_DEVICES[0]}" + mkdir -p "$MOUNT_PATH" + mount -o noatime,nodiratime,discard "$${INSTANCE_STORE_DEVICES[0]}" "$MOUNT_PATH" + chmod 777 "$MOUNT_PATH" +else + mkdir -p "$MOUNT_PATH" + echo "WARNING: No NVMe instance store found. Using root volume at $MOUNT_PATH" +fi + +# Configure ECS agent +cat >> /etc/ecs/ecs.config <