From d20978b466c9958f29538318e20f25d568e424e4 Mon Sep 17 00:00:00 2001 From: Sam Biggins Date: Wed, 17 Jun 2026 22:39:39 +0000 Subject: [PATCH 01/10] Add Terraform example for AWS deployment Deploys loreserver on ECS Fargate with S3/DynamoDB storage. DynamoDB schemas and IAM permissions verified against lore-aws source. Signed-off-by: Sam Biggins --- examples/aws/.gitignore | 6 + examples/aws/README.md | 89 +++++++++++++++ examples/aws/compute.tf | 87 +++++++++++++++ examples/aws/iam.tf | 85 ++++++++++++++ examples/aws/main.tf | 16 +++ examples/aws/network.tf | 147 +++++++++++++++++++++++++ examples/aws/outputs.tf | 19 ++++ examples/aws/storage.tf | 152 ++++++++++++++++++++++++++ examples/aws/terraform.tfvars.example | 3 + examples/aws/variables.tf | 21 ++++ examples/aws/versions.tf | 10 ++ 11 files changed, 635 insertions(+) create mode 100644 examples/aws/.gitignore create mode 100644 examples/aws/README.md create mode 100644 examples/aws/compute.tf create mode 100644 examples/aws/iam.tf create mode 100644 examples/aws/main.tf create mode 100644 examples/aws/network.tf create mode 100644 examples/aws/outputs.tf create mode 100644 examples/aws/storage.tf create mode 100644 examples/aws/terraform.tfvars.example create mode 100644 examples/aws/variables.tf create mode 100644 examples/aws/versions.tf diff --git a/examples/aws/.gitignore b/examples/aws/.gitignore new file mode 100644 index 0000000..7d126a9 --- /dev/null +++ b/examples/aws/.gitignore @@ -0,0 +1,6 @@ +*.tfstate +*.tfstate.* +*.tfplan +.terraform/ +.terraform.lock.hcl +terraform.tfvars diff --git a/examples/aws/README.md b/examples/aws/README.md new file mode 100644 index 0000000..313b72a --- /dev/null +++ b/examples/aws/README.md @@ -0,0 +1,89 @@ +# Lore on AWS + +Terraform configuration that deploys a Lore server on AWS with durable S3/DynamoDB storage using ECS Fargate. 
+ +## What this creates + +- VPC with public and private subnets (2 AZs) +- S3 bucket for fragment storage (immutable store) +- 4 DynamoDB tables (fragments, metadata, mutable store, locks) +- ECS Fargate service running the loreserver container +- VPC endpoints for S3 and DynamoDB (reduces NAT costs) +- CloudWatch log group + +## Prerequisites + +- [Terraform](https://developer.hashicorp.com/terraform/install) >= 1.5 +- AWS credentials configured (`aws configure` or environment variables) +- A `loreserver` container image in ECR — build from the repo root: + +```sh +docker build -f lore-server/Dockerfile -t loreserver . + +aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin <account-id>.dkr.ecr.us-west-2.amazonaws.com +aws ecr create-repository --repository-name loreserver --region us-west-2 +docker tag loreserver:latest <account-id>.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest +docker push <account-id>.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest +``` + +## Deploy + +```sh +cd examples/aws +cp terraform.tfvars.example terraform.tfvars +# Edit terraform.tfvars — set your container_image URI and allowed_cidrs +terraform init +terraform apply +``` + +## Connect + +Get the task IP (Fargate assigns a private IP in the VPC): + +```sh +TASK_ARN=$(aws ecs list-tasks --cluster lore-cluster --service-name lore --query 'taskArns[0]' --output text) +TASK_IP=$(aws ecs describe-tasks --cluster lore-cluster --tasks "$TASK_ARN" \ + --query 'tasks[0].attachments[0].details[?name==`privateIPv4Address`].value' --output text) +echo "$TASK_IP" +``` + +The server generates an ephemeral self-signed certificate on startup. For local testing, skip TLS verification or use `lore://` (plain gRPC, QUIC still has TLS): + +```sh +lore clone lore://${TASK_IP}:41337/my-repo +``` + +For production, configure real TLS certificates (see Customize below) and use `lores://`.
+ +## Verify + +Check the service is running: + +```sh +aws ecs describe-services --cluster lore-cluster --services lore \ + --query 'services[0].{status:status,running:runningCount}' +``` + +Check server logs: + +```sh +aws logs tail /ecs/lore --since 5m +``` + +## Customize + +This example uses the simplest viable configuration. For production: + +- **TLS** — mount real certificates and set `LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE` / `PKEY_FILE` (and the same for `GRPC`). See [Server configuration reference](https://epicgames.github.io/lore/reference/lore-server-config/#certificate-block). +- **Auth** — configure `LORE__SERVER__AUTH__JWK__ENDPOINT` to validate JWTs. See [Authentication](https://epicgames.github.io/lore/reference/lore-server-config/#authentication). +- **Caching** — switch from Fargate to EC2 with NVMe instances and use `LORE__IMMUTABLE_STORE__MODE=composite` for a local cache in front of S3. +- **Replication** — add edge nodes with `LORE__IMMUTABLE_STORE__MODE=replicated` for multi-region. See [Topology](https://epicgames.github.io/lore/reference/lore-server-config/#topology-settings). +- **HMAC** — set `LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY` (hex, ≥32 bytes) to enable presigned URLs for direct client-to-S3 transfers. + +## Destroy + +```sh +terraform destroy +``` + +Teardown takes 2–3 minutes (VPC, NAT gateway deletion). 
diff --git a/examples/aws/compute.tf b/examples/aws/compute.tf
new file mode 100644
index 0000000..7e630c7
--- /dev/null
+++ b/examples/aws/compute.tf
@@ -0,0 +1,87 @@
+# =============================================================================
+# ECS Cluster + Fargate Service
+# =============================================================================
+
+resource "aws_ecs_cluster" "this" {
+  name = "${local.name}-cluster"
+  tags = local.tags
+}
+
+resource "aws_cloudwatch_log_group" "lore" {
+  name              = "/ecs/${local.name}"
+  retention_in_days = 7
+  tags              = local.tags
+}
+
+resource "aws_ecs_task_definition" "lore" {
+  family                   = local.name
+  requires_compatibilities = ["FARGATE"]
+  network_mode             = "awsvpc"
+  cpu                      = "1024"
+  memory                   = "2048"
+  execution_role_arn       = aws_iam_role.execution.arn
+  task_role_arn            = aws_iam_role.task.arn
+
+  container_definitions = jsonencode([{
+    name      = "loreserver"
+    image     = var.container_image
+    essential = true
+
+    portMappings = [
+      { containerPort = local.port_quic_grpc, protocol = "tcp" },
+      { containerPort = local.port_quic_grpc, protocol = "udp" },
+      { containerPort = local.port_http, protocol = "tcp" },
+    ]
+
+    environment = [
+      { name = "LORE_ENV", value = "docker" },
+      { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" },
+
+      # Storage: S3 + DynamoDB via the aws plugin
+      { name = "LORE__IMMUTABLE_STORE__MODE", value = "aws" },
+      { name = "LORE__MUTABLE_STORE__MODE", value = "aws" },
+      { name = "LORE__LOCK_STORE__MODE", value = "aws" },
+
+      # AWS plugin config — resource names from Terraform
+      { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__S3_BUCKET", value = aws_s3_bucket.fragments.id },
+      { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_FRAGMENTS_TABLE", value = aws_dynamodb_table.fragments.name },
+      { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_METADATA_TABLE", value = aws_dynamodb_table.metadata.name },
+      { name = "LORE__PLUGINS__AWS__MUTABLE_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.mutable.name },
+      { name = "LORE__PLUGINS__AWS__LOCK_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.locks.name },
+    ]
+
+    # TLS: The server generates an ephemeral self-signed certificate when no
+    # certificate is configured. For production, mount real certs and set:
+    #   LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE=/certs/cert.pem
+    #   LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE=/certs/key.pem
+    #   LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE=/certs/cert.pem
+    #   LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE=/certs/key.pem
+
+    logConfiguration = {
+      logDriver = "awslogs"
+      options = {
+        "awslogs-group"         = aws_cloudwatch_log_group.lore.name
+        "awslogs-region"        = var.region
+        "awslogs-stream-prefix" = "lore"
+      }
+    }
+  }])
+
+  tags = local.tags
+}
+
+resource "aws_ecs_service" "lore" {
+  name            = local.name
+  cluster         = aws_ecs_cluster.this.id
+  task_definition = aws_ecs_task_definition.lore.arn
+  desired_count   = 1
+  launch_type     = "FARGATE"
+
+  network_configuration {
+    subnets          = aws_subnet.private[*].id
+    security_groups  = [aws_security_group.lore.id]
+    assign_public_ip = false
+  }
+
+  tags = local.tags
+}
diff --git a/examples/aws/iam.tf b/examples/aws/iam.tf
new file mode 100644
index 0000000..b976c05
--- /dev/null
+++ b/examples/aws/iam.tf
@@ -0,0 +1,85 @@
+# =============================================================================
+# IAM — ECS task role (S3 + DynamoDB access) and execution role (ECR + logs)
+# =============================================================================
+
+# Task role — what the loreserver container can do
+resource "aws_iam_role" "task" {
+  name_prefix = "${local.name}-task-"
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Action    = "sts:AssumeRole"
+      Effect    = "Allow"
+      Principal = { Service = "ecs-tasks.amazonaws.com" }
+    }]
+  })
+  tags = local.tags
+}
+
+resource "aws_iam_role_policy" "task_s3" {
+  name_prefix = "s3-"
+  role        = aws_iam_role.task.id
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Effect = "Allow"
+      Action = [
+        "s3:GetObject",
+        "s3:PutObject",
+        "s3:DeleteObject",
+        "s3:ListBucket",
+        "s3:ListBucketVersions",
+      ]
+      Resource = [
+        aws_s3_bucket.fragments.arn,
+        "${aws_s3_bucket.fragments.arn}/*",
+      ]
+    }]
+  })
+}
+
+resource "aws_iam_role_policy" "task_dynamodb" {
+  name_prefix = "dynamodb-"
+  role        = aws_iam_role.task.id
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Effect = "Allow"
+      Action = [
+        "dynamodb:GetItem",
+        "dynamodb:PutItem",
+        "dynamodb:DeleteItem",
+        "dynamodb:Query",
+        "dynamodb:BatchGetItem",
+        "dynamodb:DescribeTable",
+        "dynamodb:TransactWriteItems",
+      ]
+      Resource = [
+        aws_dynamodb_table.fragments.arn,
+        aws_dynamodb_table.metadata.arn,
+        aws_dynamodb_table.mutable.arn,
+        aws_dynamodb_table.locks.arn,
+        "${aws_dynamodb_table.locks.arn}/index/*",
+      ]
+    }]
+  })
+}
+
+# Execution role — what ECS needs to start the task (pull image, write logs, read secrets)
+resource "aws_iam_role" "execution" {
+  name_prefix = "${local.name}-exec-"
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [{
+      Action    = "sts:AssumeRole"
+      Effect    = "Allow"
+      Principal = { Service = "ecs-tasks.amazonaws.com" }
+    }]
+  })
+  tags = local.tags
+}
+
+resource "aws_iam_role_policy_attachment" "execution_ecr" {
+  role       = aws_iam_role.execution.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
+}
diff --git a/examples/aws/main.tf b/examples/aws/main.tf
new file mode 100644
index 0000000..7d0122f
--- /dev/null
+++ b/examples/aws/main.tf
@@ -0,0 +1,16 @@
+provider "aws" {
+  region = var.region
+}
+
+locals {
+  name = var.name
+  tags = { ManagedBy = "terraform", Project = "lore" }
+
+  # Ports — match lore-server/config/default.toml
+  port_quic_grpc = 41337 # QUIC (UDP) + gRPC (TCP)
+  port_http      = 41339 # Health checks, presigned URLs
+}
+
+data "aws_availability_zones" "available" {
+  state = "available"
+}
diff --git a/examples/aws/network.tf b/examples/aws/network.tf
new file mode 100644
index 0000000..9765a73
--- /dev/null
+++ b/examples/aws/network.tf
@@ -0,0 +1,151 @@
+# =============================================================================
+# VPC — minimal 2-AZ layout with public + private subnets
+# =============================================================================
+
+resource "aws_vpc" "this" {
+  cidr_block           = "10.0.0.0/16"
+  enable_dns_hostnames = true
+  enable_dns_support   = true
+  tags                 = merge(local.tags, { Name = "${local.name}-vpc" })
+}
+
+resource "aws_internet_gateway" "this" {
+  vpc_id = aws_vpc.this.id
+  tags   = merge(local.tags, { Name = "${local.name}-igw" })
+}
+
+resource "aws_subnet" "public" {
+  count                   = 2
+  vpc_id                  = aws_vpc.this.id
+  cidr_block              = cidrsubnet(aws_vpc.this.cidr_block, 8, count.index)
+  availability_zone       = data.aws_availability_zones.available.names[count.index]
+  map_public_ip_on_launch = true
+  tags                    = merge(local.tags, { Name = "${local.name}-public-${count.index}" })
+}
+
+resource "aws_subnet" "private" {
+  count             = 2
+  vpc_id            = aws_vpc.this.id
+  cidr_block        = cidrsubnet(aws_vpc.this.cidr_block, 8, count.index + 10)
+  availability_zone = data.aws_availability_zones.available.names[count.index]
+  tags              = merge(local.tags, { Name = "${local.name}-private-${count.index}" })
+}
+
+resource "aws_eip" "nat" {
+  domain = "vpc"
+  tags   = merge(local.tags, { Name = "${local.name}-nat-eip" })
+}
+
+resource "aws_nat_gateway" "this" {
+  allocation_id = aws_eip.nat.id
+  subnet_id     = aws_subnet.public[0].id
+  tags          = merge(local.tags, { Name = "${local.name}-nat" })
+
+  # Nothing above references the internet gateway, so Terraform would not order
+  # the two; make the NAT gateway wait for it on create (and go first on destroy).
+  depends_on = [aws_internet_gateway.this]
+}
+
+resource "aws_route_table" "public" {
+  vpc_id = aws_vpc.this.id
+  tags   = merge(local.tags, { Name = "${local.name}-public-rt" })
+}
+
+resource "aws_route" "public_internet" {
+  route_table_id         = aws_route_table.public.id
+  destination_cidr_block = "0.0.0.0/0"
+  gateway_id             = aws_internet_gateway.this.id
+}
+
+resource "aws_route_table_association" "public" {
+  count          = 2
+  subnet_id      = aws_subnet.public[count.index].id
+  route_table_id = aws_route_table.public.id
+}
+
+resource "aws_route_table" "private" {
+  vpc_id = aws_vpc.this.id
+  tags   = merge(local.tags, { Name = "${local.name}-private-rt" })
+}
+
+resource "aws_route" "private_nat" {
+  route_table_id         = aws_route_table.private.id
+  destination_cidr_block = "0.0.0.0/0"
+  nat_gateway_id         = aws_nat_gateway.this.id
+}
+
+resource "aws_route_table_association" "private" {
+  count          = 2
+  subnet_id      = aws_subnet.private[count.index].id
+  route_table_id = aws_route_table.private.id
+}
+
+# =============================================================================
+# Security Group — Lore server
+# =============================================================================
+
+resource "aws_security_group" "lore" {
+  name_prefix = "${local.name}-server-"
+  description = "Lore server ports"
+  vpc_id      = aws_vpc.this.id
+  tags        = merge(local.tags, { Name = "${local.name}-server-sg" })
+
+  lifecycle { create_before_destroy = true }
+}
+
+# Client access: QUIC (UDP) + gRPC (TCP) on 41337
+resource "aws_vpc_security_group_ingress_rule" "client_quic" {
+  for_each          = toset(var.allowed_cidrs)
+  security_group_id = aws_security_group.lore.id
+  from_port         = local.port_quic_grpc
+  to_port           = local.port_quic_grpc
+  ip_protocol       = "udp"
+  cidr_ipv4         = each.value
+  description       = "Client QUIC"
+}
+
+resource "aws_vpc_security_group_ingress_rule" "client_grpc" {
+  for_each          = toset(var.allowed_cidrs)
+  security_group_id = aws_security_group.lore.id
+  from_port         = local.port_quic_grpc
+  to_port           = local.port_quic_grpc
+  ip_protocol       = "tcp"
+  cidr_ipv4         = each.value
+  description       = "Client gRPC"
+}
+
+# HTTP health checks + presigned URLs
+resource "aws_vpc_security_group_ingress_rule" "client_http" {
+  for_each          = toset(var.allowed_cidrs)
+  security_group_id = aws_security_group.lore.id
+  from_port         = local.port_http
+  to_port           = local.port_http
+  ip_protocol       = "tcp"
+  cidr_ipv4         = each.value
+  description       = "Client HTTP"
+}
+
+resource "aws_vpc_security_group_egress_rule" "all" {
+  security_group_id = aws_security_group.lore.id
+  ip_protocol       = "-1"
+  cidr_ipv4         = "0.0.0.0/0"
+  description       = "All outbound"
+}
+
+# =============================================================================
+# VPC Endpoints — S3 and DynamoDB (avoid NAT costs for AWS API traffic)
+# =============================================================================
+
+resource "aws_vpc_endpoint" "s3" {
+  vpc_id          = aws_vpc.this.id
+  service_name    = "com.amazonaws.${var.region}.s3"
+  route_table_ids = [aws_route_table.private.id]
+  tags            = merge(local.tags, { Name = "${local.name}-s3-endpoint" })
+}
+
+resource "aws_vpc_endpoint" "dynamodb" {
+  vpc_id          = aws_vpc.this.id
+  service_name    = "com.amazonaws.${var.region}.dynamodb"
+  route_table_ids = [aws_route_table.private.id]
+  tags            = merge(local.tags, { Name = "${local.name}-dynamodb-endpoint" })
+}
diff --git a/examples/aws/outputs.tf b/examples/aws/outputs.tf
new file mode 100644
index 0000000..f2a6f85
--- /dev/null
+++ b/examples/aws/outputs.tf
@@ -0,0 +1,19 @@
+output "cluster_name" {
+  description = "ECS cluster name"
+  value       = aws_ecs_cluster.this.name
+}
+
+output "service_name" {
+  description = "ECS service name"
+  value       = aws_ecs_service.lore.name
+}
+
+output "s3_bucket" {
+  description = "S3 bucket for fragment storage"
+  value       = aws_s3_bucket.fragments.id
+}
+
+output "log_group" {
+  description = "CloudWatch log group"
+  value       = aws_cloudwatch_log_group.lore.name
+}
diff --git a/examples/aws/storage.tf b/examples/aws/storage.tf
new file mode 100644
index 0000000..de8a2ab
--- /dev/null
+++ b/examples/aws/storage.tf
@@ -0,0 +1,156 @@
+# =============================================================================
+# S3 — Fragment payloads (immutable store)
+# =============================================================================
+
+resource "aws_s3_bucket" "fragments" {
+  bucket_prefix = "${local.name}-fragments-"
+  tags          = local.tags
+
+  # `terraform destroy` cannot delete a bucket that still holds objects or old
+  # object versions (versioning is enabled below) unless force_destroy is set.
+  force_destroy = var.force_destroy
+}
+
+resource "aws_s3_bucket_versioning" "fragments" {
+  bucket = aws_s3_bucket.fragments.id
+  versioning_configuration { status = "Enabled" }
+}
+
+resource "aws_s3_bucket_server_side_encryption_configuration" "fragments" {
+  bucket = aws_s3_bucket.fragments.id
+  rule {
+    apply_server_side_encryption_by_default { sse_algorithm = "AES256" }
+  }
+}
+
+resource "aws_s3_bucket_public_access_block" "fragments" {
+  bucket                  = aws_s3_bucket.fragments.id
+  block_public_acls       = true
+  block_public_policy     = true
+  ignore_public_acls      = true
+  restrict_public_buckets = true
+}
+
+# =============================================================================
+# DynamoDB — Fragment associations
+# Key schema from lore-aws/src/store/immutable_store.rs
+# =============================================================================
+
+resource "aws_dynamodb_table" "fragments" {
+  name         = "${local.name}-fragments"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "hash"
+  range_key    = "repository_context"
+
+  attribute {
+    name = "hash"
+    type = "B"
+  }
+  attribute {
+    name = "repository_context"
+    type = "B"
+  }
+
+  tags = local.tags
+}
+
+# =============================================================================
+# DynamoDB — Fragment metadata (hash-only key, no sort key)
+# Key schema from lore-aws/src/store/immutable_store.rs
+# =============================================================================
+
+resource "aws_dynamodb_table" "metadata" {
+  name         = "${local.name}-metadata"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "hash"
+
+  attribute {
+    name = "hash"
+    type = "B"
+  }
+
+  tags = local.tags
+}
+
+# =============================================================================
+# DynamoDB — Mutable store (branch pointers)
+# Key schema from lore-aws/src/store/mutable_store.rs
+# =============================================================================
+
+resource "aws_dynamodb_table" "mutable" {
+  name         = "${local.name}-mutable"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "repository_id"
+  range_key    = "key"
+
+  attribute {
+    name = "repository_id"
+    type = "B"
+  }
+  attribute {
+    name = "key"
+    type = "B"
+  }
+
+  tags = local.tags
+}
+
+# =============================================================================
+# DynamoDB — Distributed locks
+# Key schema + GSIs from lore-aws/src/store/lock_store.rs
+# =============================================================================
+
+resource "aws_dynamodb_table" "locks" {
+  name         = "${local.name}-locks"
+  billing_mode = "PAY_PER_REQUEST"
+  hash_key     = "hash"
+  range_key    = "repositoryBranch"
+
+  attribute {
+    name = "hash"
+    type = "B"
+  }
+  attribute {
+    name = "repositoryBranch"
+    type = "B"
+  }
+  attribute {
+    name = "ownerId"
+    type = "S"
+  }
+  attribute {
+    name = "repository"
+    type = "B"
+  }
+  attribute {
+    name = "branch"
+    type = "B"
+  }
+  attribute {
+    name = "description"
+    type = "S"
+  }
+
+  global_secondary_index {
+    name            = "owner-repo-branch"
+    hash_key        = "ownerId"
+    range_key       = "repositoryBranch"
+    projection_type = "ALL"
+  }
+
+  global_secondary_index {
+    name            = "repo-branch"
+    hash_key        = "repository"
+    range_key       = "branch"
+    projection_type = "ALL"
+  }
+
+  global_secondary_index {
+    name            = "repo-branch-description"
+    hash_key        = "repositoryBranch"
+    range_key       = "description"
+    projection_type = "ALL"
+  }
+
+  tags = local.tags
+}
diff --git a/examples/aws/terraform.tfvars.example b/examples/aws/terraform.tfvars.example
new file mode 100644
index 0000000..27a34c0
--- /dev/null
+++ b/examples/aws/terraform.tfvars.example
@@ -0,0 +1,3 @@
+region          = "us-west-2"
+container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest"
+allowed_cidrs   = ["10.0.0.0/8"]
diff --git a/examples/aws/variables.tf b/examples/aws/variables.tf
new file mode 100644
index 0000000..5cf76b4
--- /dev/null
+++ b/examples/aws/variables.tf
@@ -0,0 +1,34 @@
+variable "container_image" {
+  description = "Loreserver container image URI in ECR"
+  type        = string
+}
+
+variable "allowed_cidrs" {
+  description = "CIDR blocks allowed to connect to Lore (e.g., your VPN or office IP)"
+  type        = list(string)
+
+  # The ingress rules pass each entry as cidr_ipv4, so reject anything that is
+  # not an IPv4 CIDR at plan time instead of failing midway through apply.
+  validation {
+    condition     = alltrue([for cidr in var.allowed_cidrs : can(cidrnetmask(cidr))])
+    error_message = "Each entry in allowed_cidrs must be a valid IPv4 CIDR block, e.g. \"203.0.113.0/24\"."
+  }
+}
+
+variable "region" {
+  description = "AWS region"
+  type        = string
+  default     = "us-west-2"
+}
+
+variable "name" {
+  description = "Name prefix for all resources"
+  type        = string
+  default     = "lore"
+}
+
+variable "force_destroy" {
+  description = "Let terraform destroy delete the fragments bucket even when it still contains objects (all object versions are removed)"
+  type        = bool
+  default     = false
+}
diff --git a/examples/aws/versions.tf b/examples/aws/versions.tf
new file mode 100644
index 0000000..00bd111
--- /dev/null
+++ b/examples/aws/versions.tf
@@ -0,0 +1,10 @@
+terraform {
+  required_version = ">= 1.5"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = ">= 5.0"
+    }
+  }
+}
From a60b9cf76e7f4efa9f2f96c119a6c227bb1b4bb7 Mon Sep 17 00:00:00 2001
From: Sam Biggins
Date: Wed, 17 Jun 2026 22:50:22 +0000
Subject: [PATCH 02/10] Address PR review: clarify plugin registration and VPC connectivity

- Explain that the Dockerfile build auto-registers lore-aws plugin
- Document that the task runs in private subnets (VPC access required)
- Add ingress to the Customize section for production paths

Signed-off-by: Sam Biggins
---
 examples/aws/README.md | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/examples/aws/README.md b/examples/aws/README.md
index 313b72a..8c872d0 100644
--- a/examples/aws/README.md
+++ b/examples/aws/README.md
@@ -26,6 +26,11 @@ docker tag loreserver:latest <account-id>.dkr.ecr.us-west-2.amazonaws.com/lorese
 docker push <account-id>.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest
 ```
 
+The Dockerfile builds the `loreserver` binary from the workspace, which includes
+the `lore-aws` crate. The server's `main()` calls `register_all_plugins()` at
+startup, registering the AWS (S3 + DynamoDB) and HashiCorp (Consul) plugins
+automatically. No custom binary or fork is needed.
+
 ## Deploy
 
 ```sh
@@ -38,7 +43,10 @@ terraform apply
 
 ## Connect
 
-Get the task IP (Fargate assigns a private IP in the VPC):
+The ECS service runs in private subnets. You must connect from within the VPC
+(e.g., an EC2 instance, VPN, AWS Client VPN, or SSM port-forwarding session).
+
+Get the task IP:
 
 ```sh
 TASK_ARN=$(aws ecs list-tasks --cluster lore-cluster --service-name lore --query 'taskArns[0]' --output text)
@@ -47,13 +55,15 @@ TASK_IP=$(aws ecs describe-tasks --cluster lore-cluster --tasks "$TASK_ARN" \
 echo "$TASK_IP"
 ```
 
-The server generates an ephemeral self-signed certificate on startup. For local testing, skip TLS verification or use `lore://` (plain gRPC, QUIC still has TLS):
+From a host inside the VPC:
 
 ```sh
 lore clone lore://${TASK_IP}:41337/my-repo
 ```
 
-For production, configure real TLS certificates (see Customize below) and use `lores://`.
+The server generates an ephemeral self-signed certificate on startup. Use
+`lore://` (plain gRPC control plane — QUIC data path still uses TLS) or
+configure real certificates and use `lores://` (see Customize below).
 
 ## Verify
 
@@ -74,6 +84,7 @@ aws logs tail /ecs/lore --since 5m
 
 This example uses the simplest viable configuration. For production:
 
+- **Ingress** — add an NLB, AWS Client VPN, or bastion host for access from outside the VPC.
 - **TLS** — mount real certificates and set `LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE` / `PKEY_FILE` (and the same for `GRPC`). See [Server configuration reference](https://epicgames.github.io/lore/reference/lore-server-config/#certificate-block).
 - **Auth** — configure `LORE__SERVER__AUTH__JWK__ENDPOINT` to validate JWTs. See [Authentication](https://epicgames.github.io/lore/reference/lore-server-config/#authentication).
 - **Caching** — switch from Fargate to EC2 with NVMe instances and use `LORE__IMMUTABLE_STORE__MODE=composite` for a local cache in front of S3.
From 4ce9533b22376f12a0f6fc5ffccf4b05082e7202 Mon Sep 17 00:00:00 2001
From: Sam Biggins
Date: Wed, 17 Jun 2026 23:44:44 +0000
Subject: [PATCH 03/10] Add s3:DeleteObjectVersion, edge pod with Cloud Map discovery
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add s3:DeleteObjectVersion (required for versioned bucket cleanup)
- Add edge pod service with replicated+remote stores via Cloud Map
- Add Cloud Map private DNS for edge→primary discovery
- Add internal SG rules for node-to-node QUIC+gRPC

Signed-off-by: Sam Biggins
---
 examples/aws/compute.tf | 104 ++++++++++++++++++++++++++++++++++++++++
 examples/aws/iam.tf     |   1 +
 examples/aws/main.tf    |   5 +-
 examples/aws/network.tf |  30 ++++++++++++
 examples/aws/outputs.tf |  12 ++++-
 5 files changed, 149 insertions(+), 3 deletions(-)

diff --git a/examples/aws/compute.tf b/examples/aws/compute.tf
index 7e630c7..4c3eb97 100644
--- a/examples/aws/compute.tf
+++ b/examples/aws/compute.tf
@@ -83,5 +83,109 @@ resource "aws_ecs_service" "lore" {
     assign_public_ip = false
   }
 
+  service_registries {
+    registry_arn = aws_service_discovery_service.lore.arn
+  }
+
+  tags = local.tags
+}
+
+# =============================================================================
+# Cloud Map — Service discovery for edge → primary
+# =============================================================================
+
+resource "aws_service_discovery_private_dns_namespace" "this" {
+  name = "${local.name}.internal"
+  vpc  = aws_vpc.this.id
+  tags = local.tags
+}
+
+resource "aws_service_discovery_service" "lore" {
+  name = "primary"
+
+  dns_config {
+    namespace_id = aws_service_discovery_private_dns_namespace.this.id
+    dns_records {
+      ttl  = 10
+      type = "A"
+    }
+    routing_policy = "MULTIVALUE"
+  }
+
+  health_check_custom_config {
+    failure_threshold = 1
+  }
+
+  tags = local.tags
+}
+
+# =============================================================================
+# Edge Pod — Caching node with replicated + remote stores
+# =============================================================================
+
+resource "aws_ecs_task_definition" "edge" {
+  family                   = "${local.name}-edge"
+  requires_compatibilities = ["FARGATE"]
+  network_mode             = "awsvpc"
+  cpu                      = "1024"
+  memory                   = "2048"
+  execution_role_arn       = aws_iam_role.execution.arn
+  task_role_arn            = aws_iam_role.task.arn
+
+  container_definitions = jsonencode([{
+    name      = "loreserver"
+    image     = var.container_image
+    essential = true
+
+    portMappings = [
+      { containerPort = local.port_quic_grpc, protocol = "tcp" },
+      { containerPort = local.port_quic_grpc, protocol = "udp" },
+      { containerPort = local.port_http, protocol = "tcp" },
+    ]
+
+    environment = [
+      { name = "LORE_ENV", value = "docker" },
+      { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" },
+
+      # Edge stores: replicated immutable (pulls from primary) + remote mutable (proxies to primary)
+      { name = "LORE__IMMUTABLE_STORE__MODE", value = "replicated" },
+      { name = "LORE__IMMUTABLE_STORE__REPLICATED__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_quic_grpc}" },
+      { name = "LORE__IMMUTABLE_STORE__REPLICATED__PERIODIC_CLIENT_REFRESH_SECS", value = "300" },
+      { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__INITIAL_BACKOFF_MS", value = "100" },
+      { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_BACKOFF_MS", value = "5000" },
+      { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_ATTEMPTS", value = "10" },
+      { name = "LORE__MUTABLE_STORE__MODE", value = "remote" },
+      { name = "LORE__MUTABLE_STORE__REMOTE__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_quic_grpc}" },
+      { name = "LORE__LOCK_STORE__MODE", value = "local" },
+    ]
+
+    logConfiguration = {
+      logDriver = "awslogs"
+      options = {
+        "awslogs-group"         = aws_cloudwatch_log_group.lore.name
+        "awslogs-region"        = var.region
+        "awslogs-stream-prefix" = "edge"
+      }
+    }
+  }])
+
+  tags = local.tags
+}
+
+resource "aws_ecs_service" "edge" {
+  name            = "${local.name}-edge"
+  cluster         = aws_ecs_cluster.this.id
+  task_definition = aws_ecs_task_definition.edge.arn
+  desired_count   = 1
+  launch_type     = "FARGATE"
+
+  network_configuration {
+    subnets          = aws_subnet.private[*].id
+    security_groups  = [aws_security_group.lore.id]
+    assign_public_ip = false
+  }
+
+  depends_on = [aws_ecs_service.lore]
+
   tags = local.tags
 }
diff --git a/examples/aws/iam.tf b/examples/aws/iam.tf
index b976c05..02c655c 100644
--- a/examples/aws/iam.tf
+++ b/examples/aws/iam.tf
@@ -27,6 +27,7 @@ resource "aws_iam_role_policy" "task_s3" {
         "s3:GetObject",
         "s3:PutObject",
         "s3:DeleteObject",
+        "s3:DeleteObjectVersion",
         "s3:ListBucket",
         "s3:ListBucketVersions",
       ]
diff --git a/examples/aws/main.tf b/examples/aws/main.tf
index 7d0122f..b644544 100644
--- a/examples/aws/main.tf
+++ b/examples/aws/main.tf
@@ -7,8 +7,9 @@ locals {
   tags = { ManagedBy = "terraform", Project = "lore" }
 
   # Ports — match lore-server/config/default.toml
-  port_quic_grpc = 41337 # QUIC (UDP) + gRPC (TCP)
-  port_http      = 41339 # Health checks, presigned URLs
+  port_quic_grpc   = 41337 # QUIC (UDP) + gRPC (TCP)
+  port_http        = 41339 # Health checks, presigned URLs
+  port_replication = 41340 # QUIC internal replication (UDP)
 }
 
 data "aws_availability_zones" "available" {
diff --git a/examples/aws/network.tf b/examples/aws/network.tf
index 9765a73..48b109d 100644
--- a/examples/aws/network.tf
+++ b/examples/aws/network.tf
@@ -121,6 +121,36 @@ resource "aws_vpc_security_group_ingress_rule" "client_http" {
   description       = "Client HTTP"
 }
 
+# Internal: QUIC replication (edge → primary, UDP on local.port_replication)
+resource "aws_vpc_security_group_ingress_rule" "replication_quic" {
+  security_group_id            = aws_security_group.lore.id
+  from_port                    = local.port_replication
+  to_port                      = local.port_replication
+  ip_protocol                  = "udp"
+  referenced_security_group_id = aws_security_group.lore.id
+  description                  = "QUIC replication between Lore nodes"
+}
+
+# Internal: gRPC (edge → primary, TCP on local.port_quic_grpc, for remote mutable store)
+resource "aws_vpc_security_group_ingress_rule" "internal_grpc" {
+  security_group_id            = aws_security_group.lore.id
+  from_port                    = local.port_quic_grpc
+  to_port                      = local.port_quic_grpc
+  ip_protocol                  = "tcp"
+  referenced_security_group_id = aws_security_group.lore.id
+  description                  = "gRPC between Lore nodes"
+}
+
+# Internal: QUIC (edge → primary, UDP on local.port_quic_grpc, for replicated immutable store)
+resource "aws_vpc_security_group_ingress_rule" "internal_quic" {
+  security_group_id            = aws_security_group.lore.id
+  from_port                    = local.port_quic_grpc
+  to_port                      = local.port_quic_grpc
+  ip_protocol                  = "udp"
+  referenced_security_group_id = aws_security_group.lore.id
+  description                  = "QUIC between Lore nodes"
+}
+
 resource "aws_vpc_security_group_egress_rule" "all" {
   security_group_id = aws_security_group.lore.id
   ip_protocol       = "-1"
diff --git a/examples/aws/outputs.tf b/examples/aws/outputs.tf
index f2a6f85..5fec610 100644
--- a/examples/aws/outputs.tf
+++ b/examples/aws/outputs.tf
@@ -4,10 +4,20 @@ output "cluster_name" {
 }
 
 output "service_name" {
-  description = "ECS service name"
+  description = "ECS service name (primary)"
   value       = aws_ecs_service.lore.name
 }
 
+output "edge_service_name" {
+  description = "ECS service name (edge)"
+  value       = aws_ecs_service.edge.name
+}
+
+output "primary_dns" {
+  description = "Cloud Map DNS for primary (used by edge pods)"
+  value       = "primary.${aws_service_discovery_private_dns_namespace.this.name}"
+}
+
 output "s3_bucket" {
   description = "S3 bucket for fragment storage"
   value       = aws_s3_bucket.fragments.id
From 1ef073cea4ee70f534c65818212fd0a173c19aaa Mon Sep 17 00:00:00 2001
From: Sam Biggins
Date: Wed, 17 Jun 2026 23:49:25 +0000
Subject: [PATCH 04/10] Wire TLS certs for inter-node replication

- Generate CA + server cert via tls provider (SAN: primary.lore.internal)
- Store certs in Secrets Manager, provision via init containers
- Primary: enables quic_internal:41340 with cert for edge replication
- Edge: trusts primary CA via SSL_CERT_FILE, connects replicated+remote
- Both services confirmed running in deployment
test Signed-off-by: Sam Biggins --- examples/aws/README.md | 9 +- examples/aws/compute.tf | 239 ++++++++++++++++++++++++++------------- examples/aws/iam.tf | 13 +++ examples/aws/outputs.tf | 6 + examples/aws/tls.tf | 73 ++++++++++++ examples/aws/versions.tf | 4 + 6 files changed, 262 insertions(+), 82 deletions(-) create mode 100644 examples/aws/tls.tf diff --git a/examples/aws/README.md b/examples/aws/README.md index 8c872d0..ee8ad0b 100644 --- a/examples/aws/README.md +++ b/examples/aws/README.md @@ -7,7 +7,10 @@ Terraform configuration that deploys a Lore server on AWS with durable S3/Dynamo - VPC with public and private subnets (2 AZs) - S3 bucket for fragment storage (immutable store) - 4 DynamoDB tables (fragments, metadata, mutable store, locks) -- ECS Fargate service running the loreserver container +- ECS Fargate primary service with S3/DynamoDB storage +- ECS Fargate edge service with replicated storage (caches from primary) +- Cloud Map private DNS for edge → primary service discovery +- Self-signed TLS CA + server certificate (inter-node trust) - VPC endpoints for S3 and DynamoDB (reduces NAT costs) - CloudWatch log group @@ -88,7 +91,7 @@ This example uses the simplest viable configuration. For production: - **TLS** — mount real certificates and set `LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE` / `PKEY_FILE` (and the same for `GRPC`). See [Server configuration reference](https://epicgames.github.io/lore/reference/lore-server-config/#certificate-block). - **Auth** — configure `LORE__SERVER__AUTH__JWK__ENDPOINT` to validate JWTs. See [Authentication](https://epicgames.github.io/lore/reference/lore-server-config/#authentication). - **Caching** — switch from Fargate to EC2 with NVMe instances and use `LORE__IMMUTABLE_STORE__MODE=composite` for a local cache in front of S3. -- **Replication** — add edge nodes with `LORE__IMMUTABLE_STORE__MODE=replicated` for multi-region. 
See [Topology](https://epicgames.github.io/lore/reference/lore-server-config/#topology-settings). +- **Replication** — add more edge nodes or deploy to other regions. See [Topology](https://epicgames.github.io/lore/reference/lore-server-config/#topology-settings). - **HMAC** — set `LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY` (hex, ≥32 bytes) to enable presigned URLs for direct client-to-S3 transfers. ## Destroy @@ -97,4 +100,4 @@ This example uses the simplest viable configuration. For production: terraform destroy ``` -Teardown takes 2–3 minutes (VPC, NAT gateway deletion). +Teardown includes VPC and NAT gateway deletion. diff --git a/examples/aws/compute.tf b/examples/aws/compute.tf index 4c3eb97..7ae9606 100644 --- a/examples/aws/compute.tf +++ b/examples/aws/compute.tf @@ -1,5 +1,5 @@ # ============================================================================= -# ECS Cluster + Fargate Service +# ECS Cluster + Primary + Edge Services # ============================================================================= resource "aws_ecs_cluster" "this" { @@ -13,6 +13,10 @@ resource "aws_cloudwatch_log_group" "lore" { tags = local.tags } +# ============================================================================= +# Primary — Durable storage (S3 + DynamoDB), serves replication to edge +# ============================================================================= + resource "aws_ecs_task_definition" "lore" { family = local.name requires_compatibilities = ["FARGATE"] @@ -22,50 +26,92 @@ resource "aws_ecs_task_definition" "lore" { execution_role_arn = aws_iam_role.execution.arn task_role_arn = aws_iam_role.task.arn - container_definitions = jsonencode([{ - name = "loreserver" - image = var.container_image - essential = true - - portMappings = [ - { containerPort = local.port_quic_grpc, protocol = "tcp" }, - { containerPort = local.port_quic_grpc, protocol = "udp" }, - { containerPort = local.port_http, protocol = "tcp" }, - ] - - environment = [ - { name = 
"LORE_ENV", value = "docker" }, - { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, - - # Storage: S3 + DynamoDB via the aws plugin - { name = "LORE__IMMUTABLE_STORE__MODE", value = "aws" }, - { name = "LORE__MUTABLE_STORE__MODE", value = "aws" }, - { name = "LORE__LOCK_STORE__MODE", value = "aws" }, - - # AWS plugin config — resource names from Terraform - { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__S3_BUCKET", value = aws_s3_bucket.fragments.id }, - { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_FRAGMENTS_TABLE", value = aws_dynamodb_table.fragments.name }, - { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_METADATA_TABLE", value = aws_dynamodb_table.metadata.name }, - { name = "LORE__PLUGINS__AWS__MUTABLE_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.mutable.name }, - { name = "LORE__PLUGINS__AWS__LOCK_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.locks.name }, - ] - - # TLS: The server generates an ephemeral self-signed certificate when no - # certificate is configured. 
For production, mount real certs and set: - # LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE=/certs/cert.pem - # LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE=/certs/key.pem - # LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE=/certs/cert.pem - # LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE=/certs/key.pem - - logConfiguration = { - logDriver = "awslogs" - options = { - "awslogs-group" = aws_cloudwatch_log_group.lore.name - "awslogs-region" = var.region - "awslogs-stream-prefix" = "lore" + volume { + name = "certs" + } + + container_definitions = jsonencode([ + # Init container: write TLS certs from secrets to shared volume + { + name = "init-certs" + image = "public.ecr.aws/amazonlinux/amazonlinux:minimal" + essential = false + command = ["sh", "-c", "echo \"$CERT\" > /certs/fullchain.crt && echo \"$KEY\" > /certs/server.key && echo \"$CA\" > /certs/ca.pem"] + + secrets = [ + { name = "CERT", valueFrom = "${aws_secretsmanager_secret.tls.arn}:fullchain::" }, + { name = "KEY", valueFrom = "${aws_secretsmanager_secret.tls.arn}:key::" }, + { name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, + ] + + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "init" + } } - } - }]) + }, + # Loreserver primary + { + name = "loreserver" + image = var.container_image + essential = true + + dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + + portMappings = [ + { containerPort = local.port_quic_grpc, protocol = "tcp" }, + { containerPort = local.port_quic_grpc, protocol = "udp" }, + { containerPort = local.port_http, protocol = "tcp" }, + { containerPort = local.port_replication, protocol = "udp" }, + ] + + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = true }] + + environment = [ + { name = "LORE_ENV", value = 
"docker" }, + { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, + + # TLS for all endpoints + { name = "LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__VERIFY_CLIENT_CERTS", value = "false" }, + + # Enable internal QUIC for edge pod replication + { name = "LORE__SERVER__QUIC_INTERNAL__ENABLED", value = "true" }, + { name = "LORE__SERVER__QUIC_INTERNAL__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__QUIC_INTERNAL__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__QUIC_INTERNAL__VERIFY_CLIENT_CERTS", value = "false" }, + + # Storage: S3 + DynamoDB via the aws plugin + { name = "LORE__IMMUTABLE_STORE__MODE", value = "aws" }, + { name = "LORE__MUTABLE_STORE__MODE", value = "aws" }, + { name = "LORE__LOCK_STORE__MODE", value = "aws" }, + + # AWS plugin config + { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__S3_BUCKET", value = aws_s3_bucket.fragments.id }, + { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_FRAGMENTS_TABLE", value = aws_dynamodb_table.fragments.name }, + { name = "LORE__PLUGINS__AWS__IMMUTABLE_STORE__DYNAMODB_METADATA_TABLE", value = aws_dynamodb_table.metadata.name }, + { name = "LORE__PLUGINS__AWS__MUTABLE_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.mutable.name }, + { name = "LORE__PLUGINS__AWS__LOCK_STORE__DYNAMODB_TABLE", value = aws_dynamodb_table.locks.name }, + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "lore" + } + } + }, + ]) tags = local.tags } @@ -132,42 +178,77 @@ resource 
"aws_ecs_task_definition" "edge" { execution_role_arn = aws_iam_role.execution.arn task_role_arn = aws_iam_role.task.arn - container_definitions = jsonencode([{ - name = "loreserver" - image = var.container_image - essential = true - - portMappings = [ - { containerPort = local.port_quic_grpc, protocol = "tcp" }, - { containerPort = local.port_quic_grpc, protocol = "udp" }, - { containerPort = local.port_http, protocol = "tcp" }, - ] - - environment = [ - { name = "LORE_ENV", value = "docker" }, - { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, - - # Edge stores: replicated immutable (pulls from primary) + remote mutable (proxies to primary) - { name = "LORE__IMMUTABLE_STORE__MODE", value = "replicated" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_quic_grpc}" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__PERIODIC_CLIENT_REFRESH_SECS", value = "300" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__INITIAL_BACKOFF_MS", value = "100" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_BACKOFF_MS", value = "5000" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_ATTEMPTS", value = "10" }, - { name = "LORE__MUTABLE_STORE__MODE", value = "remote" }, - { name = "LORE__MUTABLE_STORE__REMOTE__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_quic_grpc}" }, - { name = "LORE__LOCK_STORE__MODE", value = "local" }, - ] - - logConfiguration = { - logDriver = "awslogs" - options = { - "awslogs-group" = aws_cloudwatch_log_group.lore.name - "awslogs-region" = var.region - "awslogs-stream-prefix" = "edge" + volume { + name = "certs" + } + + container_definitions = jsonencode([ + # Init container: write CA cert so edge trusts primary + { + name = "init-certs" + image = "public.ecr.aws/amazonlinux/amazonlinux:minimal" + essential = false + command = ["sh", "-c", "echo \"$CA\" > /certs/ca.pem"] + + secrets = [ + 
{ name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, + ] + + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "edge-init" + } } - } - }]) + }, + # Loreserver edge + { + name = "loreserver" + image = var.container_image + essential = true + + dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + + portMappings = [ + { containerPort = local.port_quic_grpc, protocol = "tcp" }, + { containerPort = local.port_quic_grpc, protocol = "udp" }, + { containerPort = local.port_http, protocol = "tcp" }, + ] + + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = true }] + + environment = [ + { name = "LORE_ENV", value = "docker" }, + { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, + # Trust the primary's CA for QUIC replication connection + { name = "SSL_CERT_FILE", value = "/certs/ca.pem" }, + + # Edge stores: replicated immutable (QUIC to primary:41340) + remote mutable (gRPC to primary:41337) + { name = "LORE__IMMUTABLE_STORE__MODE", value = "replicated" }, + { name = "LORE__IMMUTABLE_STORE__REPLICATED__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_replication}" }, + { name = "LORE__IMMUTABLE_STORE__REPLICATED__PERIODIC_CLIENT_REFRESH_SECS", value = "300" }, + { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__INITIAL_BACKOFF_MS", value = "100" }, + { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_BACKOFF_MS", value = "5000" }, + { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_ATTEMPTS", value = "10" }, + { name = "LORE__MUTABLE_STORE__MODE", value = "remote" }, + { name = "LORE__MUTABLE_STORE__REMOTE__REMOTE_URL", value = "lores://primary.${local.name}.internal:${local.port_quic_grpc}" }, + { name = 
"LORE__LOCK_STORE__MODE", value = "local" }, + ] + + logConfiguration = { + logDriver = "awslogs" + options = { + "awslogs-group" = aws_cloudwatch_log_group.lore.name + "awslogs-region" = var.region + "awslogs-stream-prefix" = "edge" + } + } + }, + ]) tags = local.tags } diff --git a/examples/aws/iam.tf b/examples/aws/iam.tf index 02c655c..de60ca1 100644 --- a/examples/aws/iam.tf +++ b/examples/aws/iam.tf @@ -84,3 +84,16 @@ resource "aws_iam_role_policy_attachment" "execution_ecr" { role = aws_iam_role.execution.name policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" } + +resource "aws_iam_role_policy" "execution_secrets" { + name_prefix = "secrets-" + role = aws_iam_role.execution.id + policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Effect = "Allow" + Action = ["secretsmanager:GetSecretValue"] + Resource = [aws_secretsmanager_secret.tls.arn] + }] + }) +} diff --git a/examples/aws/outputs.tf b/examples/aws/outputs.tf index 5fec610..6aaae18 100644 --- a/examples/aws/outputs.tf +++ b/examples/aws/outputs.tf @@ -27,3 +27,9 @@ output "log_group" { description = "CloudWatch log group" value = aws_cloudwatch_log_group.lore.name } + +output "ca_certificate_pem" { + description = "CA certificate — clients need this to trust the server's TLS cert" + value = local.ca_pem + sensitive = true +} diff --git a/examples/aws/tls.tf b/examples/aws/tls.tf new file mode 100644 index 0000000..03eedc3 --- /dev/null +++ b/examples/aws/tls.tf @@ -0,0 +1,73 @@ +# ============================================================================= +# TLS — CA + server certificate for QUIC and gRPC between nodes +# +# The public QUIC endpoint generates an ephemeral cert if none is configured, +# but the internal replication endpoint (quic_internal) requires an explicit +# certificate. We generate a CA + server cert here so both primary and edge +# can establish trusted QUIC connections. 
+# ============================================================================= + +resource "tls_private_key" "ca" { + algorithm = "ECDSA" + ecdsa_curve = "P384" +} + +resource "tls_self_signed_cert" "ca" { + private_key_pem = tls_private_key.ca.private_key_pem + + subject { + common_name = "${local.name}-ca" + organization = "Lore Example" + } + + validity_period_hours = 8760 + is_ca_certificate = true + allowed_uses = ["cert_signing", "crl_signing"] +} + +resource "tls_private_key" "server" { + algorithm = "ECDSA" + ecdsa_curve = "P384" +} + +resource "tls_cert_request" "server" { + private_key_pem = tls_private_key.server.private_key_pem + + subject { + common_name = "lore-server" + organization = "Lore Example" + } + + # Cloud Map DNS name used by edge pods to reach primary + dns_names = ["primary.${local.name}.internal", "localhost"] +} + +resource "tls_locally_signed_cert" "server" { + cert_request_pem = tls_cert_request.server.cert_request_pem + ca_private_key_pem = tls_private_key.ca.private_key_pem + ca_cert_pem = tls_self_signed_cert.ca.cert_pem + + validity_period_hours = 8760 + allowed_uses = ["digital_signature", "key_encipherment", "server_auth"] +} + +# Fullchain = server cert + CA cert +locals { + fullchain_pem = "${tls_locally_signed_cert.server.cert_pem}${tls_self_signed_cert.ca.cert_pem}" + server_key = tls_private_key.server.private_key_pem + ca_pem = tls_self_signed_cert.ca.cert_pem +} + +resource "aws_secretsmanager_secret" "tls" { + name_prefix = "${local.name}-tls-" + tags = local.tags +} + +resource "aws_secretsmanager_secret_version" "tls" { + secret_id = aws_secretsmanager_secret.tls.id + secret_string = jsonencode({ + fullchain = local.fullchain_pem + key = local.server_key + ca = local.ca_pem + }) +} diff --git a/examples/aws/versions.tf b/examples/aws/versions.tf index 00bd111..bbb76b5 100644 --- a/examples/aws/versions.tf +++ b/examples/aws/versions.tf @@ -6,5 +6,9 @@ terraform { source = "hashicorp/aws" version = ">= 5.0" } + tls = 
{ + source = "hashicorp/tls" + version = ">= 4.0" + } } } From ca5cdf3ff43a7d7fc49bed95f0c66d60a80ac806 Mon Sep 17 00:00:00 2001 From: Sam Biggins Date: Thu, 18 Jun 2026 00:15:51 +0000 Subject: [PATCH 05/10] Fix README: point clients at edge, add CA export, note region Signed-off-by: Sam Biggins --- examples/aws/README.md | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/examples/aws/README.md b/examples/aws/README.md index ee8ad0b..cb2d28e 100644 --- a/examples/aws/README.md +++ b/examples/aws/README.md @@ -2,6 +2,8 @@ Terraform configuration that deploys a Lore server on AWS with durable S3/DynamoDB storage using ECS Fargate. +> Region is configurable via `var.region` (default: `us-west-2`). + ## What this creates - VPC with public and private subnets (2 AZs) @@ -46,13 +48,21 @@ terraform apply ## Connect -The ECS service runs in private subnets. You must connect from within the VPC -(e.g., an EC2 instance, VPN, AWS Client VPN, or SSM port-forwarding session). +The ECS services run in private subnets. Connect from within the VPC +(e.g., an EC2 instance, VPN, or AWS Client VPN). + +Export the CA certificate so the client trusts the server's QUIC endpoint: + +```sh +terraform output -raw ca_certificate_pem > lore-ca.pem +export SSL_CERT_FILE=lore-ca.pem +``` -Get the task IP: +Clients connect to the **edge** service (it replicates from the primary +automatically). 
Get the edge task IP: ```sh -TASK_ARN=$(aws ecs list-tasks --cluster lore-cluster --service-name lore --query 'taskArns[0]' --output text) +TASK_ARN=$(aws ecs list-tasks --cluster lore-cluster --service-name lore-edge --query 'taskArns[0]' --output text) TASK_IP=$(aws ecs describe-tasks --cluster lore-cluster --tasks "$TASK_ARN" \ --query 'tasks[0].attachments[0].details[?name==`privateIPv4Address`].value' --output text) echo "$TASK_IP" @@ -64,9 +74,9 @@ From a host inside the VPC: lore clone lore://${TASK_IP}:41337/my-repo ``` -The server generates an ephemeral self-signed certificate on startup. Use -`lore://` (plain gRPC control plane — QUIC data path still uses TLS) or -configure real certificates and use `lores://` (see Customize below). +> `lore://` uses QUIC (TLS) for data and plain gRPC for the control plane. +> The edge pod's gRPC is not TLS-configured, so `lore://` works directly. +> For `lores://` (gRPC+TLS), configure certificates on the edge pod (see Customize). ## Verify From 96edac2aec0e2785920335612535e2f3264ce2fe Mon Sep 17 00:00:00 2001 From: Sam Biggins Date: Thu, 18 Jun 2026 00:17:52 +0000 Subject: [PATCH 06/10] Rewrite README for clarity: quick start flow, tables, remove internals Signed-off-by: Sam Biggins --- examples/aws/README.md | 113 ++++++++++++++++++++--------------------- 1 file changed, 54 insertions(+), 59 deletions(-) diff --git a/examples/aws/README.md b/examples/aws/README.md index cb2d28e..28d8af3 100644 --- a/examples/aws/README.md +++ b/examples/aws/README.md @@ -1,108 +1,99 @@ # Lore on AWS -Terraform configuration that deploys a Lore server on AWS with durable S3/DynamoDB storage using ECS Fargate. +Deploy a Lore server on AWS with durable S3/DynamoDB storage and an edge node for client access. > Region is configurable via `var.region` (default: `us-west-2`). 
-## What this creates +## Quick start -- VPC with public and private subnets (2 AZs) -- S3 bucket for fragment storage (immutable store) -- 4 DynamoDB tables (fragments, metadata, mutable store, locks) -- ECS Fargate primary service with S3/DynamoDB storage -- ECS Fargate edge service with replicated storage (caches from primary) -- Cloud Map private DNS for edge → primary service discovery -- Self-signed TLS CA + server certificate (inter-node trust) -- VPC endpoints for S3 and DynamoDB (reduces NAT costs) -- CloudWatch log group +### 1. Build and push the container image -## Prerequisites - -- [Terraform](https://developer.hashicorp.com/terraform/install) >= 1.5 -- AWS credentials configured (`aws configure` or environment variables) -- A `loreserver` container image in ECR — build from the repo root: +From the Lore repo root: ```sh docker build -f lore-server/Dockerfile -t loreserver . - -aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin .dkr.ecr.us-west-2.amazonaws.com -aws ecr create-repository --repository-name loreserver --region us-west-2 -docker tag loreserver:latest .dkr.ecr.us-west-2.amazonaws.com/loreserver:latest -docker push .dkr.ecr.us-west-2.amazonaws.com/loreserver:latest ``` -The Dockerfile builds the `loreserver` binary from the workspace, which includes -the `lore-aws` crate. The server's `main()` calls `register_all_plugins()` at -startup, registering the AWS (S3 + DynamoDB) and HashiCorp (Consul) plugins -automatically. No custom binary or fork is needed. +Push to ECR (replace `` and ``): + +```sh +aws ecr get-login-password --region | docker login --username AWS --password-stdin .dkr.ecr..amazonaws.com +aws ecr create-repository --repository-name loreserver --region +docker tag loreserver:latest .dkr.ecr..amazonaws.com/loreserver:latest +docker push .dkr.ecr..amazonaws.com/loreserver:latest +``` -## Deploy +### 2. 
Deploy ```sh cd examples/aws cp terraform.tfvars.example terraform.tfvars -# Edit terraform.tfvars — set your container_image URI and allowed_cidrs +``` + +Edit `terraform.tfvars`: + +```hcl +region = "us-west-2" +container_image = ".dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" +allowed_cidrs = ["10.0.0.0/8"] # Your VPC or VPN CIDR +``` + +```sh terraform init terraform apply ``` -## Connect +### 3. Connect -The ECS services run in private subnets. Connect from within the VPC -(e.g., an EC2 instance, VPN, or AWS Client VPN). - -Export the CA certificate so the client trusts the server's QUIC endpoint: +The services run in private subnets — connect from within the VPC (EC2 instance, VPN, or Client VPN). ```sh +# Export the CA so the client trusts the server terraform output -raw ca_certificate_pem > lore-ca.pem export SSL_CERT_FILE=lore-ca.pem -``` -Clients connect to the **edge** service (it replicates from the primary -automatically). Get the edge task IP: - -```sh +# Get the edge node IP TASK_ARN=$(aws ecs list-tasks --cluster lore-cluster --service-name lore-edge --query 'taskArns[0]' --output text) TASK_IP=$(aws ecs describe-tasks --cluster lore-cluster --tasks "$TASK_ARN" \ --query 'tasks[0].attachments[0].details[?name==`privateIPv4Address`].value' --output text) -echo "$TASK_IP" -``` - -From a host inside the VPC: -```sh +# Clone a repository lore clone lore://${TASK_IP}:41337/my-repo ``` -> `lore://` uses QUIC (TLS) for data and plain gRPC for the control plane. -> The edge pod's gRPC is not TLS-configured, so `lore://` works directly. -> For `lores://` (gRPC+TLS), configure certificates on the edge pod (see Customize). 
+## What gets deployed -## Verify +| Component | Purpose | +|-----------|---------| +| Primary (ECS Fargate) | Stores fragments in S3 and metadata in DynamoDB | +| Edge (ECS Fargate) | Client-facing node that replicates from primary | +| Cloud Map DNS | Edge → primary service discovery | +| VPC | Private subnets, NAT, S3/DynamoDB gateway endpoints | +| TLS CA | Self-signed; establishes trust between nodes | -Check the service is running: +## Verify ```sh -aws ecs describe-services --cluster lore-cluster --services lore \ - --query 'services[0].{status:status,running:runningCount}' +aws ecs describe-services --cluster lore-cluster --services lore lore-edge \ + --query 'services[].{name:serviceName,running:runningCount}' ``` -Check server logs: - ```sh aws logs tail /ecs/lore --since 5m ``` ## Customize -This example uses the simplest viable configuration. For production: +| Need | What to change | +|------|----------------| +| External access | Add an NLB or AWS Client VPN | +| gRPC TLS for clients | Configure edge certificates, use `lores://` | +| Authentication | Set `LORE__SERVER__AUTH__JWK__ENDPOINT` ([docs](https://epicgames.github.io/lore/reference/lore-server-config/#authentication)) | +| NVMe caching | Switch to EC2, use `composite` store mode | +| More edge nodes | Duplicate the edge service definition | +| Presigned URLs | Set `LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY` (hex, ≥32 bytes) | -- **Ingress** — add an NLB, AWS Client VPN, or bastion host for access from outside the VPC. -- **TLS** — mount real certificates and set `LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE` / `PKEY_FILE` (and the same for `GRPC`). See [Server configuration reference](https://epicgames.github.io/lore/reference/lore-server-config/#certificate-block). -- **Auth** — configure `LORE__SERVER__AUTH__JWK__ENDPOINT` to validate JWTs. See [Authentication](https://epicgames.github.io/lore/reference/lore-server-config/#authentication). 
-- **Caching** — switch from Fargate to EC2 with NVMe instances and use `LORE__IMMUTABLE_STORE__MODE=composite` for a local cache in front of S3. -- **Replication** — add more edge nodes or deploy to other regions. See [Topology](https://epicgames.github.io/lore/reference/lore-server-config/#topology-settings). -- **HMAC** — set `LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY` (hex, ≥32 bytes) to enable presigned URLs for direct client-to-S3 transfers. +Full server configuration: [Lore Server config reference](https://epicgames.github.io/lore/reference/lore-server-config/) ## Destroy @@ -110,4 +101,8 @@ This example uses the simplest viable configuration. For production: terraform destroy ``` -Teardown includes VPC and NAT gateway deletion. +## Prerequisites + +- [Terraform](https://developer.hashicorp.com/terraform/install) >= 1.5 +- AWS credentials with VPC, ECS, S3, DynamoDB, IAM, Secrets Manager, Cloud Map permissions +- Docker (to build the container image) From 89853341a15faa978e3cdb6fe0235c991db9bad4 Mon Sep 17 00:00:00 2001 From: Sam Biggins Date: Thu, 18 Jun 2026 15:14:03 +0000 Subject: [PATCH 07/10] Add integration test (terraform test with mock providers) Validates resource schemas, variable wiring, and service configuration without AWS credentials. Catches breakage from Terraform/provider version upgrades or changes to the Lore AWS plugin config contract. Run: cd examples/aws && terraform init && terraform test Signed-off-by: Sam Biggins --- examples/aws/tests/plan.tftest.hcl | 88 ++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 examples/aws/tests/plan.tftest.hcl diff --git a/examples/aws/tests/plan.tftest.hcl b/examples/aws/tests/plan.tftest.hcl new file mode 100644 index 0000000..a08c967 --- /dev/null +++ b/examples/aws/tests/plan.tftest.hcl @@ -0,0 +1,88 @@ +# Plan-level validation — runs without AWS credentials. 
+# Catches stale resource schemas, broken variable wiring, and +# Terraform/provider version incompatibilities. +# +# Run: terraform test (requires Terraform >= 1.7 for mock_provider/override_data) + +mock_provider "aws" {} +mock_provider "tls" {} + +override_data { + target = data.aws_availability_zones.available + values = { + names = ["us-west-2a", "us-west-2b"] + } +} + +variables { + container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" + allowed_cidrs = ["10.0.0.0/8"] + region = "us-west-2" +} + +run "primary_service_configured" { + command = plan + + assert { + condition = aws_ecs_cluster.this.name == "lore-cluster" + error_message = "Cluster name should be 'lore-cluster'" + } + + assert { + condition = aws_ecs_service.lore.name == "lore" + error_message = "Primary service name should be 'lore'" + } + + assert { + condition = aws_ecs_service.edge.name == "lore-edge" + error_message = "Edge service name should be 'lore-edge'" + } +} + +run "storage_schemas_correct" { + command = plan + + assert { + condition = aws_dynamodb_table.fragments.hash_key == "hash" + error_message = "Fragments table hash key must be 'hash'" + } + + assert { + condition = aws_dynamodb_table.fragments.range_key == "repository_context" + error_message = "Fragments table range key must be 'repository_context'" + } + + assert { + condition = aws_dynamodb_table.metadata.hash_key == "hash" + error_message = "Metadata table hash key must be 'hash'" + } + + assert { + condition = aws_dynamodb_table.mutable.hash_key == "repository_id" + error_message = "Mutable table hash key must be 'repository_id'" + } + + assert { + condition = aws_dynamodb_table.locks.hash_key == "hash" + error_message = "Locks table hash key must be 'hash'" + } + + assert { + condition = aws_dynamodb_table.locks.range_key == "repositoryBranch" + error_message = "Locks table range key must be 'repositoryBranch'" + } +} + +run "service_discovery_configured" { + command = plan + + assert { + condition =
aws_service_discovery_private_dns_namespace.this.name == "lore.internal" + error_message = "Cloud Map namespace should be 'lore.internal'" + } + + assert { + condition = aws_service_discovery_service.lore.name == "primary" + error_message = "Cloud Map service name should be 'primary'" + } +} From 6446d698bdb103289ff3c7be2d33ae0c49dc0e88 Mon Sep 17 00:00:00 2001 From: Sam Biggins Date: Wed, 24 Jun 2026 23:04:23 +0000 Subject: [PATCH 08/10] feat(examples/aws): EC2 c8gd.8xlarge with NVMe composite store Replace Fargate with ECS on EC2 to demonstrate Lore's core value proposition: NVMe-cached edge nodes with high-throughput serving. - c8gd.8xlarge default (32 vCPU, 64GB, 1.9TB NVMe, 25Gbps) - Composite store: local NVMe cache + S3 durable (primary) - Composite store: local NVMe cache + replicated durable (edge) - Separate IAM roles (primary has S3+DDB, edge has none) - Cloud Map for both primary and edge (client-facing DNS) - TLS cert SANs include both primary and edge DNS names - HMAC key via Secrets Manager for presigned URLs - Health check grace periods (120s primary, 300s edge) - DynamoDB PITR on all tables, S3 lifecycle for multipart cleanup - GSI key_schema (provider 6.x), runtime_platform ARM64 - Cache sized to 80% of NVMe (1.52TB on c8gd.8xlarge) - e2e test script (scripts/e2e-test.sh) for post-deploy validation - Full Lore CLI workflow documented in README Signed-off-by: Sam Biggins --- examples/aws/README.md | 107 +++++++--- examples/aws/compute.tf | 292 ++++++++++++++++++++++---- examples/aws/iam.tf | 71 ++++++- examples/aws/outputs.tf | 6 +- examples/aws/scripts/e2e-test.sh | 118 +++++++++++ examples/aws/storage.tf | 63 +++++- examples/aws/terraform.tfvars.example | 3 +- examples/aws/tests/plan.tftest.hcl | 25 ++- examples/aws/tls.tf | 4 +- examples/aws/user_data.sh.tpl | 38 ++++ examples/aws/variables.tf | 8 +- examples/aws/versions.tf | 8 +- 12 files changed, 655 insertions(+), 88 deletions(-) create mode 100755 examples/aws/scripts/e2e-test.sh create 
mode 100644 examples/aws/user_data.sh.tpl diff --git a/examples/aws/README.md b/examples/aws/README.md index 28d8af3..685f100 100644 --- a/examples/aws/README.md +++ b/examples/aws/README.md @@ -1,6 +1,8 @@ # Lore on AWS -Deploy a Lore server on AWS with durable S3/DynamoDB storage and an edge node for client access. +Deploy Lore on AWS with NVMe-cached edge nodes for high-throughput game asset delivery. + +This example uses **c8gd.8xlarge** Graviton instances (32 vCPU, 64 GB RAM, 1.9 TB NVMe, 25 Gbps network) — the recommended instance type for Lore. The NVMe instance store serves as a local fragment cache, delivering sub-millisecond reads for `lore clone` while S3 provides durable storage. > Region is configurable via `var.region` (default: `us-west-2`). @@ -11,9 +13,12 @@ Deploy a Lore server on AWS with durable S3/DynamoDB storage and an edge node fo From the Lore repo root: ```sh -docker build -f lore-server/Dockerfile -t loreserver . +docker buildx build --platform linux/arm64 -f lore-server/Dockerfile -t loreserver . ``` +> If building on an x86 host, [register QEMU](https://docs.docker.com/build/building/multi-platform/#qemu) first: +> `docker run --rm --privileged multiarch/qemu-user-static --reset -p yes` + Push to ECR (replace `` and ``): ```sh @@ -43,66 +48,110 @@ terraform init terraform apply ``` +First apply may need a second run (DynamoDB PITR timing race). + ### 3. Connect -The services run in private subnets — connect from within the VPC (EC2 instance, VPN, or Client VPN). +Services run in private subnets. Access requires connectivity to the VPC (e.g., NLB in public subnets, AWS Client VPN, VPC peering, or a bastion host). 
+ +Export the CA certificate so the Lore client trusts the server (the system bundle path below is the Debian/Ubuntu location; on RHEL/Amazon Linux use `/etc/pki/tls/certs/ca-bundle.crt`): ```sh -# Export the CA so the client trusts the server terraform output -raw ca_certificate_pem > lore-ca.pem -export SSL_CERT_FILE=lore-ca.pem +cat /etc/ssl/certs/ca-certificates.crt lore-ca.pem > combined-ca.pem +export SSL_CERT_FILE=combined-ca.pem +``` + +Create a repository and push your first asset: -# Get the edge node IP -TASK_ARN=$(aws ecs list-tasks --cluster lore-cluster --service-name lore-edge --query 'taskArns[0]' --output text) -TASK_IP=$(aws ecs describe-tasks --cluster lore-cluster --tasks "$TASK_ARN" \ - --query 'tasks[0].attachments[0].details[?name==`privateIPv4Address`].value' --output text) +```sh +lore repository create lores://edge.lore.internal:41337/my-game +lore clone lores://edge.lore.internal:41337/my-game ./my-game +cp /path/to/assets/* ./my-game/ +cd my-game +lore stage . +lore commit "initial import" +lore push +``` + +Clone from another machine: -# Clone a repository -lore clone lore://${TASK_IP}:41337/my-repo +```sh +lore clone lores://edge.lore.internal:41337/my-game ./my-game ``` -## What gets deployed +**Note:** `edge.lore.internal` resolves via Cloud Map private DNS inside the VPC. + +## Architecture + +| Component | Instance | Purpose | +|-----------|----------|---------| +| Primary (ECS on EC2) | c8gd.8xlarge | Composite store: NVMe cache + S3 durable. Serves replication to edge. | +| Edge (ECS on EC2) | c8gd.8xlarge | Composite store: NVMe cache + replicated durable (QUIC to primary). Client-facing.
| +| Cloud Map DNS | — | Service discovery (`primary.lore.internal`, `edge.lore.internal`) | +| VPC | — | Private subnets, NAT, S3/DynamoDB gateway endpoints | +| TLS CA | — | Self-signed; establishes trust between nodes and clients | -| Component | Purpose | -|-----------|---------| -| Primary (ECS Fargate) | Stores fragments in S3 and metadata in DynamoDB | -| Edge (ECS Fargate) | Client-facing node that replicates from primary | -| Cloud Map DNS | Edge → primary service discovery | -| VPC | Private subnets, NAT, S3/DynamoDB gateway endpoints | -| TLS CA | Self-signed; establishes trust between nodes | +**Startup:** Health check grace periods allow the primary (120s) and edge (300s) to initialize without being marked unhealthy. The edge's retry configuration handles Cloud Map DNS propagation delays automatically. On first deploy, edge nodes may restart 1-2 times while DNS propagates — this is expected and self-resolving. + +### Data flow + +``` +Client ──lores://──→ Edge (NVMe cache hit → instant response) + │ cache miss + ├──QUIC:41340──→ Primary (NVMe cache → S3 fallback) + └──gRPC:41337──→ Primary (branch resolution) +``` + +> **Instance sizing:** Use node sizes without network bandwidth caps (32+ vCPU) for production. This example uses c8gd.8xlarge (NVMe + Graviton). 
## Verify ```sh aws ecs describe-services --cluster lore-cluster --services lore lore-edge \ - --query 'services[].{name:serviceName,running:runningCount}' + --query 'services[].{name:serviceName,running:runningCount}' --region us-west-2 ``` ```sh -aws logs tail /ecs/lore --since 5m +aws logs tail /ecs/lore --since 5m --region us-west-2 ``` ## Customize | Need | What to change | |------|----------------| -| External access | Add an NLB or AWS Client VPN | -| gRPC TLS for clients | Configure edge certificates, use `lores://` | +| Smaller instances (dev/test) | Set `instance_type = "c8gd.xlarge"` and lower the hard-coded cache `MAX_SIZE` (1.52 TB) and `memoryReservation` (8192 MiB) in `compute.tf` to fit the smaller instance — same architecture, less capacity | +| External access | Add an NLB in public subnets | | Authentication | Set `LORE__SERVER__AUTH__JWK__ENDPOINT` ([docs](https://epicgames.github.io/lore/reference/lore-server-config/#authentication)) | -| NVMe caching | Switch to EC2, use `composite` store mode | -| More edge nodes | Duplicate the edge service definition | -| Presigned URLs | Set `LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY` (hex, ≥32 bytes) | +| More edge nodes | Increase ASG `max_size` + edge service `desired_count` | +| Faster edge startup | Consider adding a startup probe that polls `primary.lore.internal` before starting loreserver | +| Presigned URLs | Already configured via HMAC key in Secrets Manager | +| Production hardening | Add `deletion_protection_enabled = true` to DynamoDB tables | Full server configuration: [Lore Server config reference](https://epicgames.github.io/lore/reference/lore-server-config/) ## Destroy +The S3 bucket has `force_destroy = false` (prevents accidental data loss). Teardown takes ~6 minutes (capacity provider reconciliation).
To destroy: + ```sh +aws s3 rm s3://$(terraform output -raw s3_bucket) --recursive terraform destroy ``` +If destroy fails on Cloud Map services ("Service contains registered instances"), scale to zero first: + +```sh +aws ecs update-service --cluster lore-cluster --service lore --desired-count 0 --region us-west-2 +aws ecs update-service --cluster lore-cluster --service lore-edge --desired-count 0 --region us-west-2 +sleep 30 +terraform destroy +``` + +For dev/test where you want one-command teardown, add `force_destroy = true` to the `aws_s3_bucket` resource. + ## Prerequisites -- [Terraform](https://developer.hashicorp.com/terraform/install) >= 1.5 -- AWS credentials with VPC, ECS, S3, DynamoDB, IAM, Secrets Manager, Cloud Map permissions -- Docker (to build the container image) +- [Terraform](https://developer.hashicorp.com/terraform/install) >= 1.7 +- AWS credentials with VPC, ECS, EC2, S3, DynamoDB, IAM, Secrets Manager, Cloud Map, Auto Scaling permissions +- Docker (to build the ARM64 container image) diff --git a/examples/aws/compute.tf b/examples/aws/compute.tf index 7ae9606..3748c28 100644 --- a/examples/aws/compute.tf +++ b/examples/aws/compute.tf @@ -1,9 +1,23 @@ # ============================================================================= -# ECS Cluster + Primary + Edge Services +# ECS on EC2 — c8gd.8xlarge with NVMe instance store for fragment caching +# +# This is the recommended deployment for Lore. The NVMe instance store provides +# sub-millisecond fragment reads for clones, while S3 provides durability. +# c8gd.8xlarge: 32 vCPU, 64 GB RAM, 1x 1.9 TB NVMe, 25 Gbps network. 
# ============================================================================= +data "aws_ssm_parameter" "ecs_ami" { + name = "/aws/service/ecs/optimized-ami/amazon-linux-2023/arm64/recommended/image_id" +} + resource "aws_ecs_cluster" "this" { name = "${local.name}-cluster" + + setting { + name = "containerInsights" + value = "enabled" + } + tags = local.tags } @@ -14,29 +28,138 @@ resource "aws_cloudwatch_log_group" "lore" { } # ============================================================================= -# Primary — Durable storage (S3 + DynamoDB), serves replication to edge +# Launch Template + ASG — ECS-managed instances with NVMe setup +# ============================================================================= + +resource "aws_launch_template" "ecs" { + name_prefix = "${local.name}-ecs-" + image_id = data.aws_ssm_parameter.ecs_ami.value + instance_type = var.instance_type + + iam_instance_profile { + arn = aws_iam_instance_profile.ecs_instance.arn + } + + vpc_security_group_ids = [aws_security_group.lore.id] + + user_data = base64encode(templatefile("${path.module}/user_data.sh.tpl", { + cluster_name = aws_ecs_cluster.this.name + mount_path = "/srv/urc" + })) + + metadata_options { + http_endpoint = "enabled" + http_tokens = "required" + http_put_response_hop_limit = 1 + } + + tag_specifications { + resource_type = "instance" + tags = merge(local.tags, { Name = "${local.name}-ecs" }) + } + + tags = local.tags +} + +resource "aws_autoscaling_group" "ecs" { + name_prefix = "${local.name}-ecs-" + min_size = 2 + max_size = 2 + desired_capacity = 2 + vpc_zone_identifier = aws_subnet.private[*].id + + launch_template { + id = aws_launch_template.ecs.id + version = "$Latest" + } + + protect_from_scale_in = true + + # Allows terraform destroy to delete the ASG without waiting for capacity + # provider reconciliation (~6 min). Remove for production if you want + # graceful drain before ASG deletion. 
+ force_delete = true + + tag { + key = "AmazonECSManaged" + value = "true" + propagate_at_launch = true + } + + tag { + key = "Name" + value = "${local.name}-ecs" + propagate_at_launch = true + } + + lifecycle { + ignore_changes = [desired_capacity] + } +} + +# ============================================================================= +# Capacity Provider — links ASG to ECS cluster +# ============================================================================= + +resource "aws_ecs_capacity_provider" "ec2" { + name = "${local.name}-ec2" + + auto_scaling_group_provider { + auto_scaling_group_arn = aws_autoscaling_group.ecs.arn + managed_termination_protection = "ENABLED" + + managed_scaling { + status = "ENABLED" + target_capacity = 100 + minimum_scaling_step_size = 1 + maximum_scaling_step_size = 1 + } + } + + tags = local.tags +} + +resource "aws_ecs_cluster_capacity_providers" "this" { + cluster_name = aws_ecs_cluster.this.name + capacity_providers = [aws_ecs_capacity_provider.ec2.name] + + default_capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.ec2.name + weight = 100 + } +} + +# ============================================================================= +# Primary — Composite store (NVMe cache + durable S3), serves replication # ============================================================================= resource "aws_ecs_task_definition" "lore" { family = local.name - requires_compatibilities = ["FARGATE"] + requires_compatibilities = ["EC2"] network_mode = "awsvpc" - cpu = "1024" - memory = "2048" execution_role_arn = aws_iam_role.execution.arn task_role_arn = aws_iam_role.task.arn + runtime_platform { + operating_system_family = "LINUX" + cpu_architecture = "ARM64" + } + + volume { + name = "instance-store-cache" + host_path = "/srv/urc" + } + volume { name = "certs" } container_definitions = jsonencode([ - # Init container: write TLS certs from secrets to shared volume { name = "init-certs" image = 
"public.ecr.aws/amazonlinux/amazonlinux:minimal" essential = false - command = ["sh", "-c", "echo \"$CERT\" > /certs/fullchain.crt && echo \"$KEY\" > /certs/server.key && echo \"$CA\" > /certs/ca.pem"] + command = ["sh", "-c", "echo \"$CERT\" > /certs/fullchain.crt && echo \"$KEY\" > /certs/server.key && chmod 600 /certs/server.key && echo \"$CA\" > /certs/ca.pem"] secrets = [ { name = "CERT", valueFrom = "${aws_secretsmanager_secret.tls.arn}:fullchain::" }, @@ -45,6 +168,7 @@ resource "aws_ecs_task_definition" "lore" { ] mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + memoryReservation = 64 logConfiguration = { logDriver = "awslogs" @@ -55,13 +179,13 @@ resource "aws_ecs_task_definition" "lore" { } } }, - # Loreserver primary { name = "loreserver" image = var.container_image essential = true dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + memoryReservation = 8192 portMappings = [ { containerPort = local.port_quic_grpc, protocol = "tcp" }, @@ -70,27 +194,41 @@ resource "aws_ecs_task_definition" "lore" { { containerPort = local.port_replication, protocol = "udp" }, ] - mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = true }] + mountPoints = [ + { sourceVolume = "instance-store-cache", containerPath = "/srv/urc", readOnly = false }, + { sourceVolume = "certs", containerPath = "/certs", readOnly = true }, + ] + + secrets = [ + { name = "LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY", valueFrom = aws_secretsmanager_secret.hmac.arn }, + ] environment = [ { name = "LORE_ENV", value = "docker" }, { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, - # TLS for all endpoints + # TLS { name = "LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, { name = "LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, { name = "LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, { name = 
"LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, { name = "LORE__SERVER__GRPC__VERIFY_CLIENT_CERTS", value = "false" }, - # Enable internal QUIC for edge pod replication + # Internal QUIC for edge replication { name = "LORE__SERVER__QUIC_INTERNAL__ENABLED", value = "true" }, { name = "LORE__SERVER__QUIC_INTERNAL__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, { name = "LORE__SERVER__QUIC_INTERNAL__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, { name = "LORE__SERVER__QUIC_INTERNAL__VERIFY_CLIENT_CERTS", value = "false" }, - # Storage: S3 + DynamoDB via the aws plugin - { name = "LORE__IMMUTABLE_STORE__MODE", value = "aws" }, + # Storage: composite (NVMe cache + S3 durable) + { name = "LORE__IMMUTABLE_STORE__MODE", value = "composite" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__MODE", value = "local" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__PATH", value = "/srv/urc" }, + # 80% of c8gd.8xlarge NVMe (1.9 TB). Reserves 20% for xfs metadata/journal. + # The fragment cache is the only consumer of the instance store. 
+ { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__MAX_SIZE", value = "1520000000000" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__FLUSH_DELAY_SECONDS", value = "10" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__MODE", value = "aws" }, { name = "LORE__MUTABLE_STORE__MODE", value = "aws" }, { name = "LORE__LOCK_STORE__MODE", value = "aws" }, @@ -121,23 +259,35 @@ resource "aws_ecs_service" "lore" { cluster = aws_ecs_cluster.this.id task_definition = aws_ecs_task_definition.lore.arn desired_count = 1 - launch_type = "FARGATE" + + health_check_grace_period_seconds = 120 + + capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.ec2.name + weight = 100 + } network_configuration { - subnets = aws_subnet.private[*].id - security_groups = [aws_security_group.lore.id] - assign_public_ip = false + subnets = aws_subnet.private[*].id + security_groups = [aws_security_group.lore.id] } service_registries { registry_arn = aws_service_discovery_service.lore.arn } + placement_constraints { + type = "distinctInstance" + } + tags = local.tags } # ============================================================================= -# Cloud Map — Service discovery for edge → primary +# Cloud Map — Service discovery for edge → primary and client → edge +# +# NOTE: terraform destroy may fail if ECS tasks are still registered. If this +# happens, scale services to 0 and wait 30s before re-running destroy. 
# ============================================================================= resource "aws_service_discovery_private_dns_namespace" "this" { @@ -158,43 +308,68 @@ resource "aws_service_discovery_service" "lore" { routing_policy = "MULTIVALUE" } - health_check_custom_config { - failure_threshold = 1 + health_check_custom_config {} + + tags = local.tags +} + +resource "aws_service_discovery_service" "edge" { + name = "edge" + + dns_config { + namespace_id = aws_service_discovery_private_dns_namespace.this.id + dns_records { + ttl = 10 + type = "A" + } + routing_policy = "MULTIVALUE" } + health_check_custom_config {} + tags = local.tags } # ============================================================================= -# Edge Pod — Caching node with replicated + remote stores +# Edge — Composite store (NVMe cache + replicated durable via QUIC to primary) # ============================================================================= resource "aws_ecs_task_definition" "edge" { family = "${local.name}-edge" - requires_compatibilities = ["FARGATE"] + requires_compatibilities = ["EC2"] network_mode = "awsvpc" - cpu = "1024" - memory = "2048" execution_role_arn = aws_iam_role.execution.arn - task_role_arn = aws_iam_role.task.arn + task_role_arn = aws_iam_role.edge_task.arn + + runtime_platform { + operating_system_family = "LINUX" + cpu_architecture = "ARM64" + } + + volume { + name = "instance-store-cache" + host_path = "/srv/urc" + } volume { name = "certs" } container_definitions = jsonencode([ - # Init container: write CA cert so edge trusts primary { name = "init-certs" image = "public.ecr.aws/amazonlinux/amazonlinux:minimal" essential = false - command = ["sh", "-c", "echo \"$CA\" > /certs/ca.pem"] + command = ["sh", "-c", "echo \"$CERT\" > /certs/fullchain.crt && echo \"$KEY\" > /certs/server.key && chmod 600 /certs/server.key && cat /etc/pki/tls/certs/ca-bundle.crt > /certs/ca.pem && echo \"$CA\" >> /certs/ca.pem"] secrets = [ + { name = "CERT", valueFrom = 
"${aws_secretsmanager_secret.tls.arn}:fullchain::" }, + { name = "KEY", valueFrom = "${aws_secretsmanager_secret.tls.arn}:key::" }, { name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, ] mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + memoryReservation = 64 logConfiguration = { logDriver = "awslogs" @@ -205,13 +380,13 @@ resource "aws_ecs_task_definition" "edge" { } } }, - # Loreserver edge { name = "loreserver" image = var.container_image essential = true dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + memoryReservation = 8192 portMappings = [ { containerPort = local.port_quic_grpc, protocol = "tcp" }, @@ -219,21 +394,43 @@ resource "aws_ecs_task_definition" "edge" { { containerPort = local.port_http, protocol = "tcp" }, ] - mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = true }] + mountPoints = [ + { sourceVolume = "instance-store-cache", containerPath = "/srv/urc", readOnly = false }, + { sourceVolume = "certs", containerPath = "/certs", readOnly = true }, + ] + + secrets = [ + { name = "LORE__SERVER__HTTP__PRESIGNED_URL_HMAC_KEY", valueFrom = aws_secretsmanager_secret.hmac.arn }, + ] environment = [ { name = "LORE_ENV", value = "docker" }, { name = "LORE_CONFIG_PATH", value = "/etc/lore/config" }, - # Trust the primary's CA for QUIC replication connection { name = "SSL_CERT_FILE", value = "/certs/ca.pem" }, - # Edge stores: replicated immutable (QUIC to primary:41340) + remote mutable (gRPC to primary:41337) - { name = "LORE__IMMUTABLE_STORE__MODE", value = "replicated" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_replication}" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__PERIODIC_CLIENT_REFRESH_SECS", value = "300" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__INITIAL_BACKOFF_MS", value = "100" }, - { name = 
"LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_BACKOFF_MS", value = "5000" }, - { name = "LORE__IMMUTABLE_STORE__REPLICATED__REGENERATE_RETRY__MAX_ATTEMPTS", value = "10" }, + # TLS for client-facing endpoints + { name = "LORE__SERVER__QUIC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__QUIC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__CERT_FILE", value = "/certs/fullchain.crt" }, + { name = "LORE__SERVER__GRPC__CERTIFICATE__PKEY_FILE", value = "/certs/server.key" }, + { name = "LORE__SERVER__GRPC__VERIFY_CLIENT_CERTS", value = "false" }, + + # Storage: composite (NVMe cache + replicated durable via QUIC to primary) + { name = "LORE__IMMUTABLE_STORE__MODE", value = "composite" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__MODE", value = "local" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__PATH", value = "/srv/urc" }, + # 80% of c8gd.8xlarge NVMe (1.9 TB). Reserves 20% for xfs metadata/journal. + # The fragment cache is the only consumer of the instance store. 
+ { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__MAX_SIZE", value = "1520000000000" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__LOCAL__LOCAL__FLUSH_DELAY_SECONDS", value = "10" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__MODE", value = "replicated" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REMOTE_URL", value = "lore://primary.${local.name}.internal:${local.port_replication}" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__PERIODIC_CLIENT_REFRESH_SECS", value = "180" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REGENERATE_RETRY__INITIAL_BACKOFF_MS", value = "100" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REGENERATE_RETRY__MAX_BACKOFF_MS", value = "1000" }, + { name = "LORE__IMMUTABLE_STORE__COMPOSITE__DURABLE__REPLICATED__REGENERATE_RETRY__MAX_ATTEMPTS", value = "10" }, + + # Branch resolution proxied to primary { name = "LORE__MUTABLE_STORE__MODE", value = "remote" }, { name = "LORE__MUTABLE_STORE__REMOTE__REMOTE_URL", value = "lores://primary.${local.name}.internal:${local.port_quic_grpc}" }, { name = "LORE__LOCK_STORE__MODE", value = "local" }, @@ -258,12 +455,25 @@ resource "aws_ecs_service" "edge" { cluster = aws_ecs_cluster.this.id task_definition = aws_ecs_task_definition.edge.arn desired_count = 1 - launch_type = "FARGATE" + + health_check_grace_period_seconds = 300 + + capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.ec2.name + weight = 100 + } network_configuration { - subnets = aws_subnet.private[*].id - security_groups = [aws_security_group.lore.id] - assign_public_ip = false + subnets = aws_subnet.private[*].id + security_groups = [aws_security_group.lore.id] + } + + service_registries { + registry_arn = aws_service_discovery_service.edge.arn + } + + placement_constraints { + type = "distinctInstance" } depends_on = [aws_ecs_service.lore] diff --git a/examples/aws/iam.tf b/examples/aws/iam.tf 
index de60ca1..f87f39e 100644 --- a/examples/aws/iam.tf +++ b/examples/aws/iam.tf @@ -1,8 +1,38 @@ # ============================================================================= -# IAM — ECS task role (S3 + DynamoDB access) and execution role (ECR + logs) +# IAM — EC2 instance role, ECS task roles, execution role # ============================================================================= -# Task role — what the loreserver container can do +# EC2 instance role — ECS agent needs to communicate with the ECS API +resource "aws_iam_role" "ecs_instance" { + name_prefix = "${local.name}-instance-" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "ec2.amazonaws.com" } + }] + }) + tags = local.tags +} + +resource "aws_iam_role_policy_attachment" "ecs_instance_role" { + role = aws_iam_role.ecs_instance.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role" +} + +resource "aws_iam_role_policy_attachment" "ecs_instance_ssm" { + role = aws_iam_role.ecs_instance.name + policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" +} + +resource "aws_iam_instance_profile" "ecs_instance" { + name_prefix = "${local.name}-instance-" + role = aws_iam_role.ecs_instance.name + tags = local.tags +} + +# Primary task role — S3 + DynamoDB access for durable storage resource "aws_iam_role" "task" { name_prefix = "${local.name}-task-" assume_role_policy = jsonencode({ @@ -16,6 +46,21 @@ resource "aws_iam_role" "task" { tags = local.tags } +# Edge task role — intentionally empty. Edge proxies all storage operations +# through the primary via gRPC/QUIC, so it needs no direct S3 or DynamoDB access. 
+resource "aws_iam_role" "edge_task" { + name_prefix = "${local.name}-edge-task-" + assume_role_policy = jsonencode({ + Version = "2012-10-17" + Statement = [{ + Action = "sts:AssumeRole" + Effect = "Allow" + Principal = { Service = "ecs-tasks.amazonaws.com" } + }] + }) + tags = local.tags +} + resource "aws_iam_role_policy" "task_s3" { name_prefix = "s3-" role = aws_iam_role.task.id @@ -66,7 +111,7 @@ resource "aws_iam_role_policy" "task_dynamodb" { }) } -# Execution role — what ECS needs to start the task (pull image, write logs, read secrets) +# Execution role — ECS agent pulls images, writes logs, reads secrets resource "aws_iam_role" "execution" { name_prefix = "${local.name}-exec-" assume_role_policy = jsonencode({ @@ -93,7 +138,25 @@ resource "aws_iam_role_policy" "execution_secrets" { Statement = [{ Effect = "Allow" Action = ["secretsmanager:GetSecretValue"] - Resource = [aws_secretsmanager_secret.tls.arn] + Resource = [aws_secretsmanager_secret.tls.arn, aws_secretsmanager_secret.hmac.arn] }] }) } + +# ============================================================================= +# HMAC Key — presigned URL feature for fragment transfer between nodes +# ============================================================================= + +resource "random_id" "hmac" { + byte_length = 32 +} + +resource "aws_secretsmanager_secret" "hmac" { + name_prefix = "${local.name}-hmac-" + tags = local.tags +} + +resource "aws_secretsmanager_secret_version" "hmac" { + secret_id = aws_secretsmanager_secret.hmac.id + secret_string = random_id.hmac.hex +} diff --git a/examples/aws/outputs.tf b/examples/aws/outputs.tf index 6aaae18..ef00c59 100644 --- a/examples/aws/outputs.tf +++ b/examples/aws/outputs.tf @@ -18,6 +18,11 @@ output "primary_dns" { value = "primary.${aws_service_discovery_private_dns_namespace.this.name}" } +output "edge_dns" { + description = "Cloud Map DNS for edge (used by clients)" + value = "edge.${aws_service_discovery_private_dns_namespace.this.name}" +} + 
output "s3_bucket" { description = "S3 bucket for fragment storage" value = aws_s3_bucket.fragments.id @@ -31,5 +36,4 @@ output "log_group" { output "ca_certificate_pem" { description = "CA certificate — clients need this to trust the server's TLS cert" value = local.ca_pem - sensitive = true } diff --git a/examples/aws/scripts/e2e-test.sh b/examples/aws/scripts/e2e-test.sh new file mode 100755 index 0000000..e9149d3 --- /dev/null +++ b/examples/aws/scripts/e2e-test.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash +# scripts/e2e-test.sh — End-to-end validation of Lore push + clone against the primary endpoint. +# Requires: terraform apply completed, AWS credentials, SSM access to instances. +# Platforms: Linux, macOS (runs remotely on Graviton instance via SSM) +# +# Usage: ./scripts/e2e-test.sh [region] +# +# Builds the Lore CLI from source inside a Docker container on one of the +# ECS instances, then pushes a 10MB test file and clones it back to verify +# data integrity. NOTE(review): every lore command targets $PRIMARY_DNS; $EDGE_DNS is only printed, so edge replication is not exercised — confirm intent. +set -euo pipefail + +REGION="${1:-us-west-2}" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +EXAMPLE_DIR="$SCRIPT_DIR/.." + +cd "$EXAMPLE_DIR" + +echo "=== E2E Test: Lore on AWS ===" + +# Get deployment info from terraform +S3_BUCKET=$(terraform output -raw s3_bucket) +CA_CERT=$(terraform output -raw ca_certificate_pem) +EDGE_DNS=$(terraform output -raw edge_dns) +PRIMARY_DNS=$(terraform output -raw primary_dns) +echo " Bucket: $S3_BUCKET" +echo " Edge: $EDGE_DNS" +echo " Primary: $PRIMARY_DNS" + +# Find an instance to run on (uses ECS-managed tag, not the Name tag) +CLUSTER=$(terraform output -raw cluster_name) +INSTANCE_ID=$(aws ec2 describe-instances \ + --filters "Name=tag:aws:ecs:clusterName,Values=$CLUSTER" 'Name=instance-state-name,Values=running' \ + --query 'Reservations[0].Instances[0].InstanceId' \ + --output text --region "$REGION") +echo " Instance: $INSTANCE_ID" + +# Upload source if not already present +if !
aws s3 ls "s3://$S3_BUCKET/build/lore-src.tar.gz" --region "$REGION" >/dev/null 2>&1; then + echo " Uploading Lore source to S3..." + REPO_ROOT="$(cd "$EXAMPLE_DIR/../.." && pwd)" + tar -czf /tmp/lore-src.tar.gz -C "$REPO_ROOT" \ + --exclude=target --exclude=.git --exclude='examples/aws/.terraform*' \ + --exclude='*.tfstate*' --exclude='*.tfvars' . + aws s3 cp /tmp/lore-src.tar.gz "s3://$S3_BUCKET/build/lore-src.tar.gz" --region "$REGION" +fi + +PRESIGNED_URL=$(aws s3 presign "s3://$S3_BUCKET/build/lore-src.tar.gz" --expires-in 900 --region "$REGION") + +# Write the CA cert for the combined bundle +echo "$CA_CERT" > /tmp/e2e-ca.pem + +echo "" +echo "=== Building Lore CLI on $INSTANCE_ID (takes ~4 min) ===" + +COMMAND_ID=$(aws ssm send-command \ + --instance-ids "$INSTANCE_ID" \ + --document-name "AWS-RunShellScript" \ + --parameters "commands=[ + \"set -ex\", + \"curl -sSo /tmp/lore-src.tar.gz '$PRESIGNED_URL'\", + \"rm -rf /tmp/lore-src && mkdir -p /tmp/lore-src && tar -xzf /tmp/lore-src.tar.gz -C /tmp/lore-src\", + \"echo '$CA_CERT' > /tmp/lore-ca.pem && cat /etc/pki/tls/certs/ca-bundle.crt /tmp/lore-ca.pem > /tmp/combined-ca.pem\", + \"docker run --rm --network host -v /tmp/lore-src:/src -v /tmp/combined-ca.pem:/certs/ca.pem -w /src -e SSL_CERT_FILE=/certs/ca.pem rust:latest bash -c 'apt-get update -qq && apt-get install -y -qq pkg-config libssl-dev protobuf-compiler >/dev/null 2>&1 && cargo build --release -p lore-client 2>&1 | tail -3 && echo BUILD_OK && REPO=e2e-\$(date +%s) && ./target/release/lore --version && echo === CREATE REPO \$REPO === && ./target/release/lore repository create lores://$PRIMARY_DNS:41337/\$REPO && echo === CLONE === && ./target/release/lore clone lores://$PRIMARY_DNS:41337/\$REPO /tmp/e2e && echo === ADD 10MB FILE === && dd if=/dev/urandom of=/tmp/e2e/asset.bin bs=1M count=10 2>&1 && cd /tmp/e2e && echo === STAGE === && /src/target/release/lore stage asset.bin && echo === COMMIT === && /src/target/release/lore commit --non-interactive 
e2e-test && echo === PUSH === && /src/target/release/lore push && echo === CLONE BACK === && rm -rf /tmp/clone && /src/target/release/lore clone lores://$PRIMARY_DNS:41337/\$REPO /tmp/clone && echo === VERIFY === && cmp /tmp/e2e/asset.bin /tmp/clone/asset.bin && md5sum /tmp/e2e/asset.bin /tmp/clone/asset.bin'\" + ]" \ + --timeout-seconds 900 \ + --query 'Command.CommandId' \ + --output text \ + --region "$REGION") + +echo " Command: $COMMAND_ID" +echo " Waiting for completion..." + +# Poll until done (a byte mismatch in the remote VERIFY step makes cmp exit non-zero, which fails the SSM command) +while true; do + sleep 30 + STATUS=$(aws ssm get-command-invocation \ + --command-id "$COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query 'Status' \ + --output text \ + --region "$REGION" 2>/dev/null || echo "Pending") + + case "$STATUS" in + InProgress|Pending) echo " ... still running" ;; + Success) + echo "" + echo "=== SUCCESS ===" + aws ssm get-command-invocation \ + --command-id "$COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text \ + --region "$REGION" | grep -A2 "VERIFY" + echo "" + echo "✓ Push + Clone verified. MD5 checksums match." + exit 0 + ;; + *) + echo "" + echo "=== FAILED (status: $STATUS) ===" + aws ssm get-command-invocation \ + --command-id "$COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardOutputContent' \ + --output text \ + --region "$REGION" | tail -20 + echo "---STDERR---" + aws ssm get-command-invocation \ + --command-id "$COMMAND_ID" \ + --instance-id "$INSTANCE_ID" \ + --query 'StandardErrorContent' \ + --output text \ + --region "$REGION" | grep -v "^++" | grep -v "MII\|BEGIN\|END" | tail -10 + exit 1 + ;; + esac +done diff --git a/examples/aws/storage.tf b/examples/aws/storage.tf index de8a2ab..6fe8b46 100644 --- a/examples/aws/storage.tf +++ b/examples/aws/storage.tf @@ -2,6 +2,8 @@ # S3 — Fragment payloads (immutable store) # ============================================================================= +# force_destroy defaults to false — the bucket cannot be destroyed with data inside.
+# For dev/test teardown, set force_destroy = true or empty the bucket before destroy. resource "aws_s3_bucket" "fragments" { bucket_prefix = "${local.name}-fragments-" tags = local.tags @@ -27,6 +29,19 @@ resource "aws_s3_bucket_public_access_block" "fragments" { restrict_public_buckets = true } +resource "aws_s3_bucket_lifecycle_configuration" "fragments" { + bucket = aws_s3_bucket.fragments.id + + rule { + id = "abort-incomplete-multipart" + status = "Enabled" + filter {} + abort_incomplete_multipart_upload { + days_after_initiation = 7 + } + } +} + # ============================================================================= # DynamoDB — Fragment associations # Key schema from lore-aws/src/store/immutable_store.rs @@ -47,6 +62,8 @@ resource "aws_dynamodb_table" "fragments" { type = "B" } + point_in_time_recovery { enabled = true } + tags = local.tags } @@ -65,6 +82,8 @@ resource "aws_dynamodb_table" "metadata" { type = "B" } + point_in_time_recovery { enabled = true } + tags = local.tags } @@ -88,6 +107,8 @@ resource "aws_dynamodb_table" "mutable" { type = "B" } + point_in_time_recovery { enabled = true } + tags = local.tags } @@ -96,6 +117,13 @@ resource "aws_dynamodb_table" "mutable" { # Key schema + GSIs from lore-aws/src/store/lock_store.rs # ============================================================================= +# NOTE: Table-level hash_key/range_key emits a deprecation warning suggesting key_schema, +# but key_schema blocks don't exist at the table level in the provider schema (only in GSIs). +# The warning is premature — no migration path exists yet for table primary keys. + +# Deletion protection disabled for teardown convenience. +# Production: add deletion_protection_enabled = true to each table. 
+ resource "aws_dynamodb_table" "locks" { name = "${local.name}-locks" billing_mode = "PAY_PER_REQUEST" @@ -129,24 +157,47 @@ resource "aws_dynamodb_table" "locks" { global_secondary_index { name = "owner-repo-branch" - hash_key = "ownerId" - range_key = "repositoryBranch" projection_type = "ALL" + + key_schema { + attribute_name = "ownerId" + key_type = "HASH" + } + key_schema { + attribute_name = "repositoryBranch" + key_type = "RANGE" + } } global_secondary_index { name = "repo-branch" - hash_key = "repository" - range_key = "branch" projection_type = "ALL" + + key_schema { + attribute_name = "repository" + key_type = "HASH" + } + key_schema { + attribute_name = "branch" + key_type = "RANGE" + } } global_secondary_index { name = "repo-branch-description" - hash_key = "repositoryBranch" - range_key = "description" projection_type = "ALL" + + key_schema { + attribute_name = "repositoryBranch" + key_type = "HASH" + } + key_schema { + attribute_name = "description" + key_type = "RANGE" + } } + point_in_time_recovery { enabled = true } + tags = local.tags } diff --git a/examples/aws/terraform.tfvars.example b/examples/aws/terraform.tfvars.example index 27a34c0..f102229 100644 --- a/examples/aws/terraform.tfvars.example +++ b/examples/aws/terraform.tfvars.example @@ -1,3 +1,4 @@ region = "us-west-2" -container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" +container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" # must support linux/arm64 allowed_cidrs = ["10.0.0.0/8"] +# instance_type = "c8gd.8xlarge" # default — 32 vCPU, 64 GB, 1.9 TB NVMe, 25 Gbps diff --git a/examples/aws/tests/plan.tftest.hcl b/examples/aws/tests/plan.tftest.hcl index a08c967..8fca227 100644 --- a/examples/aws/tests/plan.tftest.hcl +++ b/examples/aws/tests/plan.tftest.hcl @@ -6,6 +6,7 @@ mock_provider "aws" {} mock_provider "tls" {} +mock_provider "random" {} override_data { target = data.aws_availability_zones.available @@ -14,13 +15,21 @@ 
override_data { } } +override_data { + target = data.aws_ssm_parameter.ecs_ami + values = { + value = "ami-0123456789abcdef0" + } +} + variables { container_image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/loreserver:latest" allowed_cidrs = ["10.0.0.0/8"] region = "us-west-2" + name = "lore" } -run "primary_service_configured" { +run "cluster_and_services_configured" { command = plan assert { @@ -86,3 +95,17 @@ run "service_discovery_configured" { error_message = "Cloud Map service name should be 'primary'" } } + +run "ec2_infrastructure_configured" { + command = plan + + assert { + condition = aws_launch_template.ecs.instance_type == "c8gd.8xlarge" + error_message = "Launch template should use c8gd.8xlarge" + } + + assert { + condition = aws_autoscaling_group.ecs.min_size == 2 + error_message = "ASG min size should be 2 (primary + edge)" + } +} diff --git a/examples/aws/tls.tf b/examples/aws/tls.tf index 03eedc3..18c7d4f 100644 --- a/examples/aws/tls.tf +++ b/examples/aws/tls.tf @@ -38,8 +38,8 @@ resource "tls_cert_request" "server" { organization = "Lore Example" } - # Cloud Map DNS name used by edge pods to reach primary - dns_names = ["primary.${local.name}.internal", "localhost"] + # Cloud Map DNS names used by clients and inter-node communication + dns_names = ["primary.${local.name}.internal", "edge.${local.name}.internal", "localhost"] } resource "tls_locally_signed_cert" "server" { diff --git a/examples/aws/user_data.sh.tpl b/examples/aws/user_data.sh.tpl new file mode 100644 index 0000000..477122c --- /dev/null +++ b/examples/aws/user_data.sh.tpl @@ -0,0 +1,38 @@ +#!/bin/bash +set -euo pipefail + +# Format NVMe instance store and register with ECS cluster. +# c8gd.8xlarge has 1x 1.9 TB NVMe SSD. 
+ +MOUNT_PATH="${mount_path}" +ECS_CLUSTER="${cluster_name}" + +# Detect NVMe instance store devices (exclude EBS) +INSTANCE_STORE_DEVICES=() +for device in /dev/nvme*n1; do + [ -e "$device" ] || continue + devname=$(basename "$device") + model=$(cat "/sys/block/$devname/device/model" 2>/dev/null || echo "") + if [[ "$model" == *"Instance Storage"* ]]; then + INSTANCE_STORE_DEVICES+=("$device") + fi +done + +# Format and mount +if [ $${#INSTANCE_STORE_DEVICES[@]} -gt 0 ]; then + mkfs.xfs -f "$${INSTANCE_STORE_DEVICES[0]}" + mkdir -p "$MOUNT_PATH" + mount -o noatime,nodiratime,discard "$${INSTANCE_STORE_DEVICES[0]}" "$MOUNT_PATH" + chmod 777 "$MOUNT_PATH" +else + mkdir -p "$MOUNT_PATH" + echo "WARNING: No NVMe instance store found. Using root volume at $MOUNT_PATH" +fi + +# Configure ECS agent +cat >> /etc/ecs/ecs.config < Date: Wed, 24 Jun 2026 23:04:23 +0000 Subject: [PATCH 09/10] ci: add GitHub Actions workflow for AWS example validation Runs terraform fmt, validate, and test on changes to examples/aws/. Uses mock providers (no AWS credentials needed). 
- hashicorp/setup-terraform@v4 pinned to 1.15.3 - Concurrency group cancels superseded runs - Self-triggering path filter for workflow changes Signed-off-by: Sam Biggins --- .github/workflows/terraform-aws-example.yml | 36 +++++++++++++++++++++ examples/aws/compute.tf | 8 ++--- examples/aws/scripts/e2e-test.sh | 2 +- 3 files changed, 41 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/terraform-aws-example.yml diff --git a/.github/workflows/terraform-aws-example.yml b/.github/workflows/terraform-aws-example.yml new file mode 100644 index 0000000..b3e6d2a --- /dev/null +++ b/.github/workflows/terraform-aws-example.yml @@ -0,0 +1,36 @@ +name: Validate AWS Example + +on: + push: + branches: [main] + paths: + - examples/aws/** + - .github/workflows/terraform-aws-example.yml + pull_request: + paths: + - examples/aws/** + - .github/workflows/terraform-aws-example.yml + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + validate: + runs-on: ubuntu-latest + defaults: + run: + working-directory: examples/aws + steps: + - uses: actions/checkout@v4 + - uses: hashicorp/setup-terraform@v4 + with: + terraform_wrapper: false + terraform_version: "1.15.3" + - run: terraform init -backend=false + - run: terraform fmt -check + - run: terraform validate + - run: terraform test diff --git a/examples/aws/compute.tf b/examples/aws/compute.tf index 3748c28..031e574 100644 --- a/examples/aws/compute.tf +++ b/examples/aws/compute.tf @@ -167,7 +167,7 @@ resource "aws_ecs_task_definition" "lore" { { name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, ] - mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] memoryReservation = 64 logConfiguration = { @@ -184,7 +184,7 @@ resource "aws_ecs_task_definition" "lore" { image = var.container_image essential = 
true - dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] memoryReservation = 8192 portMappings = [ @@ -368,7 +368,7 @@ resource "aws_ecs_task_definition" "edge" { { name = "CA", valueFrom = "${aws_secretsmanager_secret.tls.arn}:ca::" }, ] - mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] + mountPoints = [{ sourceVolume = "certs", containerPath = "/certs", readOnly = false }] memoryReservation = 64 logConfiguration = { @@ -385,7 +385,7 @@ resource "aws_ecs_task_definition" "edge" { image = var.container_image essential = true - dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] + dependsOn = [{ containerName = "init-certs", condition = "SUCCESS" }] memoryReservation = 8192 portMappings = [ diff --git a/examples/aws/scripts/e2e-test.sh b/examples/aws/scripts/e2e-test.sh index e9149d3..29eaa5f 100755 --- a/examples/aws/scripts/e2e-test.sh +++ b/examples/aws/scripts/e2e-test.sh @@ -80,7 +80,7 @@ while true; do --query 'Status' \ --output text \ --region "$REGION" 2>/dev/null || echo "Pending") - + case "$STATUS" in InProgress|Pending) echo " ... still running" ;; Success) From f1f9fefd5d9a61df67c32e3b62c8bc49bfbcb720 Mon Sep 17 00:00:00 2001 From: Sam Biggins Date: Thu, 25 Jun 2026 12:28:09 +0000 Subject: [PATCH 10/10] fix(ci): pin actions to SHA hashes per repo convention Pin actions/checkout to v6.0.3 and hashicorp/setup-terraform to v4.0.1 using explicit commit SHAs, matching the convention in dco.yml and lint.yml. 
Signed-off-by: Sam Biggins --- .github/workflows/terraform-aws-example.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/terraform-aws-example.yml b/.github/workflows/terraform-aws-example.yml index b3e6d2a..613054a 100644 --- a/.github/workflows/terraform-aws-example.yml +++ b/.github/workflows/terraform-aws-example.yml @@ -25,8 +25,8 @@ jobs: run: working-directory: examples/aws steps: - - uses: actions/checkout@v4 - - uses: hashicorp/setup-terraform@v4 + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: hashicorp/setup-terraform@dfe3c3f87815947d99a8997f908cb6525fc44e9e # v4.0.1 with: terraform_wrapper: false terraform_version: "1.15.3"