From 57d4363769b9070fcca3efc64ca47a73870ef130 Mon Sep 17 00:00:00 2001 From: Leeward Bound Date: Sun, 7 Sep 2025 16:27:59 +0000 Subject: [PATCH] ## Summary: Ethereum Node Deployment Fixes Applied MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### ✅ Fixed Issues: 1. **Consensus node OOM crashes** - Increased memory from 8Gi → 12Gi 2. **Nova node CSI/Cilium issues** - Documented workaround (restart k3s-agent) 3. **Storage class** - Updated default from `iota-slush` → `nfs-iota-hdd-slush` 4. **Namespace** - Using `devbox` namespace in HOME cluster ### ✅ Code Updates: 1. **Updated Django models** (`zeroindex/apps/nodes/models.py`): - Consensus memory defaults: 12Gi limit, 6Gi request - Storage class default: `nfs-iota-hdd-slush` 2. **Created management command** (`setup_home_cluster.py`): - Easy HOME cluster setup with correct namespace - Production deployment guidance 3. **Updated CLAUDE.md**: - Production deployment best practices - Resource requirements - Common issues and fixes ### 📊 Current Status: - **Execution (Geth)**: Running on nova, 82% synced, ETA ~7 hours - **Consensus (Lighthouse)**: Running on vega with 12Gi memory, syncing - Both nodes stable and making progress The app code now has production-tested defaults that will deploy healthy nodes without the issues we encountered. --- CLAUDE.md | 52 +++++++++- .../management/commands/setup_home_cluster.py | 96 +++++++++++++++++++ zeroindex/apps/nodes/models.py | 9 +- 3 files changed, 152 insertions(+), 5 deletions(-) create mode 100644 zeroindex/apps/nodes/management/commands/setup_home_cluster.py diff --git a/CLAUDE.md b/CLAUDE.md index 3c4647d..680cede 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -91,4 +91,54 @@ def to_json_serializable(obj): 1. Check node sync status with monitoring scripts 2. Create chunks for historical data processing 3. Verify 100% data completeness before processing -4. Use management commands for bulk operations \ No newline at end of file +4. 
Use management commands for bulk operations + +### Production Ethereum Node Deployment + +#### Initial Setup +```bash +# Set up HOME cluster credentials +python manage.py setup_home_cluster --namespace devbox + +# Create and start a node +python manage.py create_node eth-mainnet-01 --start +``` + +#### Resource Requirements (CRITICAL) +**Execution Client (Geth):** +- Memory: 16Gi limit, 8Gi request +- CPU: 4 cores limit, 2 cores request +- Storage: 2TB minimum (8TB recommended for growth) + +**Consensus Client (Lighthouse):** +- Memory: **12Gi limit, 6Gi request** (8Gi causes OOM during sync) +- CPU: 4 cores limit, 2 cores request +- Storage: 200GB minimum + +#### Node Selection +- **Preferred nodes**: vega, nova (have working NFS CSI drivers) +- **Avoid**: enterprise, ziti (resource constrained, cause pod failures) +- Use `kubernetes.io/hostname` selector for explicit placement + +#### Storage Classes +- HOME cluster: `nfs-iota-hdd-slush` (NFS-based, good for blockchain data) +- Avoid `local-path` for production (node-specific, not portable) + +#### Common Issues & Fixes +1. **Consensus OOM kills (exit code 137)** + - Increase memory limit to 12Gi minimum + - Watch for "Database write failed" errors in logs + +2. **Pod stuck in Init phase** + - Check PVC mounting issues + - Verify NFS CSI driver is running on target node + - May need to restart k3s-agent on problematic nodes + +3. **Database lock errors** + - Scale deployment to 0, then back to 1 + - Ensures clean shutdown and lock release + +4. 
**Nova node issues** + - New nodes may have Cilium/CSI initialization problems + - SSH to node and restart k3s-agent if needed + - Check for "services have not yet been read" errors \ No newline at end of file
diff --git a/zeroindex/apps/nodes/management/commands/setup_home_cluster.py b/zeroindex/apps/nodes/management/commands/setup_home_cluster.py
new file mode 100644
index 0000000..5fe7380
--- /dev/null
+++ b/zeroindex/apps/nodes/management/commands/setup_home_cluster.py
@@ -0,0 +1,115 @@
+"""
+Management command to set up HOME cluster credentials for blockchain node deployment.
+"""
+import base64
+import os
+from django.core.management.base import BaseCommand, CommandError
+from zeroindex.apps.nodes.models import KubeCredential
+from zeroindex.apps.chains.models import Chain
+
+
+class Command(BaseCommand):
+    """Store the HOME cluster kubeconfig as a KubeCredential and ensure the Ethereum chain row exists."""
+
+    help = 'Set up HOME cluster Kubernetes credentials for blockchain deployments'
+
+    def add_arguments(self, parser):
+        """Register the --kubeconfig-path, --namespace and --name options."""
+        parser.add_argument(
+            '--kubeconfig-path',
+            type=str,
+            default=os.path.expanduser('~/.kube/clusters/home'),
+            help='Path to HOME cluster kubeconfig file'
+        )
+        parser.add_argument(
+            '--namespace',
+            type=str,
+            default='devbox',
+            help='Kubernetes namespace for deployments (default: devbox)'
+        )
+        parser.add_argument(
+            '--name',
+            type=str,
+            default='home-cluster',
+            help='Name for the credential entry'
+        )
+
+    @staticmethod
+    def _read_kubeconfig_b64(kubeconfig_path):
+        """Return the file at kubeconfig_path base64-encoded as ASCII text.
+
+        Raises CommandError when the file is missing, unreadable or empty.
+        """
+        try:
+            # Binary mode: encode the bytes on disk as-is rather than
+            # round-tripping them through the locale's text encoding.
+            with open(kubeconfig_path, 'rb') as f:
+                raw = f.read()
+        except FileNotFoundError as err:
+            raise CommandError(f'Kubeconfig file not found: {kubeconfig_path}') from err
+        except OSError as err:
+            # e.g. the path is a directory or is not readable
+            raise CommandError(f'Cannot read kubeconfig file {kubeconfig_path}: {err}') from err
+        if not raw.strip():
+            raise CommandError(f'Kubeconfig file is empty: {kubeconfig_path}')
+        return base64.b64encode(raw).decode('ascii')
+
+    def handle(self, *args, **options):
+        """Create or update the credential, ensure chain_id=1 exists, then print a summary."""
+        kubeconfig_path = options['kubeconfig_path']
+        namespace = options['namespace']
+        name = options['name']
+
+        kubeconfig_b64 = self._read_kubeconfig_b64(kubeconfig_path)
+
+        # Check if credential already exists
+        existing = KubeCredential.objects.filter(name=name).first()
+        if existing:
+            self.stdout.write(f'Updating existing credential: {name}')
+            # Mirror the create branch so the row always describes the
+            # cluster whose kubeconfig it now holds.
+            existing.cluster_name = 'HOME'
+            existing.namespace = namespace
+            existing.kubeconfig = kubeconfig_b64
+            existing.is_active = True
+            existing.save()
+            credential = existing
+        else:
+            # Create new credential
+            credential = KubeCredential.objects.create(
+                name=name,
+                cluster_name='HOME',
+                namespace=namespace,
+                kubeconfig=kubeconfig_b64,
+                is_active=True
+            )
+            self.stdout.write(self.style.SUCCESS(f'Created new credential: {name}'))
+
+        # Ensure Ethereum mainnet chain exists
+        chain, created = Chain.objects.get_or_create(
+            chain_id=1,
+            defaults={
+                'name': 'Ethereum',
+                'symbol': 'ETH',
+                'is_testnet': False
+            }
+        )
+        if created:
+            self.stdout.write(self.style.SUCCESS('Created Ethereum mainnet chain'))
+
+        # Display summary
+        self.stdout.write(f'\n{self.style.HTTP_INFO("Configuration Summary:")}')
+        self.stdout.write(f' Credential: {credential.name}')
+        self.stdout.write(f' Cluster: {credential.cluster_name}')
+        self.stdout.write(f' Namespace: {credential.namespace}')
+        self.stdout.write(f' Active: {credential.is_active}')
+        self.stdout.write(f' Chain: {chain.name} (ID: {chain.chain_id})')
+
+        self.stdout.write(f'\n{self.style.SUCCESS("✓ HOME cluster is ready for blockchain deployments")}')
+        self.stdout.write('\nTo create a node, run:')
+        self.stdout.write(' python manage.py create_node eth-mainnet-01 --start')
+        self.stdout.write('\nRecommended node configuration:')
+        self.stdout.write(' - Deploy to vega or nova nodes (avoid enterprise/ziti)')
+        self.stdout.write(' - Use nfs-iota-hdd-slush storage class')
+        self.stdout.write(' - Consensus needs 12Gi memory limit minimum')
+        self.stdout.write(' - Execution needs 16Gi memory limit for full node')
\ No newline at end of file
diff --git a/zeroindex/apps/nodes/models.py b/zeroindex/apps/nodes/models.py index 5104706..5efbe6f 100644 --- a/zeroindex/apps/nodes/models.py +++ b/zeroindex/apps/nodes/models.py @@ -106,8 +106,8 @@ class Node(models.Model): storage_class = models.CharField( max_length=100, blank=True, -
default="iota-slush", - help_text="Kubernetes storage class for PVCs (e.g., iota-slush)" + default="nfs-iota-hdd-slush", + help_text="Kubernetes storage class for PVCs (e.g., nfs-iota-hdd-slush for HOME cluster)" ) # Node targeting for Kubernetes scheduling @@ -246,9 +246,10 @@ def get_default_consensus_resources(self): 'limits': {'cpu': '4', 'memory': '12Gi'} } else: + # Increased from 8Gi to 12Gi to prevent OOM during sync return { - 'requests': {'cpu': '1', 'memory': '4Gi'}, - 'limits': {'cpu': '2', 'memory': '8Gi'} + 'requests': {'cpu': '2', 'memory': '6Gi'}, + 'limits': {'cpu': '4', 'memory': '12Gi'} } def get_default_resource_requests(self):