diff --git a/go.work b/go.work index e8f20deb2..55af4ad97 100644 --- a/go.work +++ b/go.work @@ -2,5 +2,6 @@ go 1.26.2 use ( . + ./scripts/setup ./tools/generateTimeline ) diff --git a/go.work.sum b/go.work.sum index bf3d7145b..8956b7a2d 100644 --- a/go.work.sum +++ b/go.work.sum @@ -6,22 +6,16 @@ github.com/IBM/sarama v1.43.1/go.mod h1:GG5q1RURtDNPz8xxJs3mgX6Ytak8Z9eLhAkJPObe github.com/boombuler/barcode v1.0.1/go.mod h1:paBWMcWSl3LHKBqUq+rly7CNSldXjb2rDl3JlRe0mD8= github.com/campoy/embedmd v1.0.0/go.mod h1:oxyr9RCiSXg0M3VJ3ks0UGfp98BpSSGr0kpiX3MzVl8= github.com/cncf/xds/go v0.0.0-20251210132809-ee656c7534f5/go.mod h1:KdCmV+x/BuvyMxRnYBlmVaq4OLiKW6iRQfvC62cvdkI= -github.com/containerd/containerd v1.5.2/go.mod h1:0DOxVqwDy2iZvrZp2JUx/E+hS0UNTVn7dJnIOwtYR4g= -github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/eapache/go-resiliency v1.6.0/go.mod h1:5yPzW0MIvSe0JDsv0v+DvcjEv2FyD6iZYSs1ZI+iQho= github.com/eapache/go-xerial-snappy v0.0.0-20230731223053-c322873962e3/go.mod h1:YvSRo5mw33fLEx1+DlK6L2VV43tJt5Eyel9n9XBcR+0= github.com/eapache/queue v1.1.0/go.mod h1:6eCeP0CKFpHLu8blIFXhExK/dRa7WDZfr6jVFPTqq+I= -github.com/ease-lab/vhive/utils/tracing/go v0.0.0-20211206145412-3a4607297428/go.mod h1:7a477TAJimC1AkzEynida5yXncx5l2FwL1YqEaxLNI8= github.com/envoyproxy/go-control-plane v0.14.0/go.mod h1:NcS5X47pLl/hfqxU70yPwL9ZMkUlwlKxtAohpi2wBEU= github.com/envoyproxy/go-control-plane/envoy v1.36.0/go.mod h1:ty89S1YCCVruQAm9OtKeEkQLTb+Lkz0k8v9W0Oxsv98= github.com/envoyproxy/go-control-plane/ratelimit v0.1.0/go.mod h1:Wk+tMFAFbCXaJPzVVHnPgRKdUdwW/KdbRt94AzgRee4= github.com/envoyproxy/protoc-gen-validate v1.3.0/go.mod h1:HvYl7zwPa5mffgyeTUHA9zHIH36nmrm7oCbo4YKoSWA= -github.com/go-fonts/liberation v0.3.1/go.mod h1:jdJ+cqF+F4SUL2V+qxBth8fvBpBDS7yloUL5Fi8GTGY= github.com/go-fonts/liberation v0.3.3/go.mod h1:eUAzNRuJnpSnd1sm2EyloQfSOT79pdw7X7++Ri+3MCU= github.com/go-jose/go-jose/v4 v4.1.3/go.mod h1:x4oUasVrzR7071A4TnHLGSPpNOm2a21K9Kf04k1rs08= 
-github.com/go-latex/latex v0.0.0-20230307184459-12ec69307ad9/go.mod h1:gWuR/CrFDDeVRFQwHPvsv9soJVB/iqymhuZQuJ3a9OM= github.com/go-latex/latex v0.0.0-20240709081214-31cef3c7570e/go.mod h1:J4SAGzkcl+28QWi7yz72tyC/4aGnppOvya+AEv4TaAQ= -github.com/go-pdf/fpdf v0.8.0/go.mod h1:gfqhcNwXrsd3XYKte9a7vM3smvU/jB4ZRDrmWSxpfdc= github.com/go-pdf/fpdf v0.9.0/go.mod h1:oO8N111TkmKb9D7VvWGLvLJlaZUQVPM+6V42pp3iV4Y= github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls= github.com/goccmack/gocc v1.0.2/go.mod h1:LXX2tFVUggS/Zgx/ICPOr3MLyusuM7EcbfkPvNsjdO8= @@ -42,34 +36,24 @@ github.com/onsi/ginkgo/v2 v2.11.0/go.mod h1:ZhrRA5XmEE3x3rhlzamx/JJvujdZoJ2uvgI7 github.com/onsi/gomega v1.27.10/go.mod h1:RsS8tutOdbdgzbPtzzATp12yT7kM5I5aElG3evPbQ0M= github.com/phpdave11/gofpdi v1.0.13/go.mod h1:vBmVV0Do6hSBHC8uKUQ71JGW+ZGQq74llk/7bXwjDoI= github.com/pierrec/lz4/v4 v4.1.21/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= -github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10/go.mod h1:t/avpk3KcrXxUnYOhZhMXJlSEyie6gQbtLq5NM3loB8= github.com/rabbitmq/amqp091-go v1.9.0/go.mod h1:+jPrT9iY2eLjRaMSRHUhc3z14E/l85kv/f+6luSD3pc= github.com/rcrowley/go-metrics v0.0.0-20201227073835-cf1acfcdf475/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= -github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs= github.com/ruudk/golang-pdf417 v0.0.0-20201230142125-a7e3863a1245/go.mod h1:pQAZKsJ8yyVxGRWYNEm9oFB8ieLgKFnamEyDmSA0BRk= github.com/spiffe/go-spiffe/v2 v2.6.0/go.mod h1:gm2SeUoMZEtpnzPNs2Csc0D/gX33k1xIx7lEzqblHEs= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= -github.com/stretchr/testify v1.10.0/go.mod 
h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= -github.com/vhive-serverless/loader v0.0.0-20260415061204-0512e9eaa2a3/go.mod h1:yET1r9lcWPzTJl8W4RJ7xK+5G4iQXx+aSUWjBArdT5E= github.com/xdg-go/pbkdf2 v1.0.0/go.mod h1:jrpuAogTd400dnrH08LKmI/xc1MbPOebTwRqcT5RDeI= github.com/xdg-go/scram v1.1.2/go.mod h1:RT/sEzTbU5y00aCK8UOx6R7YryM0iF1N2MOmC3kKLN4= github.com/xdg-go/stringprep v1.0.4/go.mod h1:mPGuuIYwz7CmR2bT9j4GbQqutWS1zV24gijq1dTyGkM= github.com/youmark/pkcs8 v0.0.0-20240726163527-a2c0da244d78/go.mod h1:aL8wCCfTfSfmXjznFBSZNN13rSJjlIOI1fUNAtF7rmI= -go.opentelemetry.io/contrib v0.20.0 h1:ubFQUn0VCZ0gPwIoJfBJVpeBlyRMxu8Mm/huKWYd9p0= -go.opentelemetry.io/contrib v0.20.0/go.mod h1:G/EtFaa6qaN7+LxqfIAT3GiZa7Wv5DTBUzl5H4LY0Kc= go.opentelemetry.io/contrib/detectors/gcp v1.39.0/go.mod h1:t/OGqzHBa5v6RHZwrDBJ2OirWc+4q/w2fTbLZwAKjTk= -go.opentelemetry.io/otel/exporters/trace/zipkin v0.20.0/go.mod h1:QnYEWBA4wTy/15vvmj7Poeklp6xndAMcdejvzZNUtvM= golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU= golang.org/x/oauth2 v0.34.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.30.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/tools v0.44.0/go.mod h1:KA0AfVErSdxRZIsOVipbv3rQhVXTnlU6UhKxHd1seDI= golang.org/x/tools/go/expect v0.1.1-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated/go.mod h1:RVAQXBGNv1ib0J382/DPCRS/BPnsGebyM1Gj5VSDpG8= gonum.org/v1/tools v0.0.0-20200318103217-c168b003ce8c/go.mod h1:fy6Otjqbk477ELp8IXTpw1cObQtLbRCBVonY+bTTfcM= -google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 h1:KpwkzHKEF7B9Zxg18WzOa7djJ+Ha5DzthMyZYQfEn2A= -google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1/go.mod h1:nKE/iIaLqn2bQwXBg8f1g2Ylh6r5MN5CmZvuzZCgsCU= 
google.golang.org/genproto/googleapis/api v0.0.0-20260120221211-b8f7ae30c516/go.mod h1:p3MLuOwURrGBRoEyFHBT3GjUwaCQVKeNqqWxlcISGdw= diff --git a/scripts/konk-ci/03-kind-metrics.sh b/scripts/konk-ci/03-kind-metrics.sh index 17ac1238b..e365f0376 100644 --- a/scripts/konk-ci/03-kind-metrics.sh +++ b/scripts/konk-ci/03-kind-metrics.sh @@ -33,7 +33,7 @@ yq eval ' .prometheus.prometheusSpec.resources.limits.memory = "512Mi" | .prometheus.prometheusSpec.resources.requests.cpu = "100m" | .prometheus.prometheusSpec.resources.requests.memory = "256Mi" -' -i "./config/prometh_values_kn.yaml" +' -i "./scripts/setup/configs/prometheus/prom_values.yaml" # Install prometheus stack release_label="prometheus" @@ -42,7 +42,7 @@ prometheus_chart_version="60.1.0" helm install \ -n monitoring $release_label \ --version $prometheus_chart_version prometheus-community/kube-prometheus-stack \ - -f ./config/prometh_values_kn.yaml + -f ./scripts/setup/configs/prometheus/prom_values.yaml # Configure kubectl config for non-root user docker exec knative-control-plane sh -c "mkdir -p /home/$(whoami)/.kube" diff --git a/scripts/setup/cluster/create_mulitnode.go b/scripts/setup/cluster/create_mulitnode.go new file mode 100644 index 000000000..027802039 --- /dev/null +++ b/scripts/setup/cluster/create_mulitnode.go @@ -0,0 +1,115 @@ +package cluster + +import ( + "fmt" + "time" + + "github.com/vhive-serverless/loader/scripts/setup/configs" + loaderUtils "github.com/vhive-serverless/loader/scripts/setup/utils" + "github.com/vhive-serverless/vHive/scripts/utils" +) + +func CreateMultiNodeSetup(configDir string, configName string) error { + // Load Configurations + cfg, err := configs.CommonConfigSetup(configDir, configName) + if err != nil { + utils.FatalPrintf("Failed to load configurations: %v\n", err) + return err + } + + // Distribute Loader SSH Key + utils.InfoPrintf("Distributing loader SSH key...\n") + if err := distributeLoaderSSHKey(cfg.LoaderNode, cfg.AllNodes); err != nil { + 
utils.FatalPrintf("Failed to distribute loader SSH key: %v\n", err) + return err + } + utils.InfoPrintf("Loader SSH key distributed.\n") + + // Determine Operation Mode + var operationMode string + switch cfg.SetupCfg.ClusterMode { + case "container": + operationMode = "stock-only" + case "firecracker", "firecracker_snapshots": + operationMode = "firecracker" + default: + utils.FatalPrintf("Unsupported cluster mode: %s\n", cfg.SetupCfg.ClusterMode) + return fmt.Errorf("unsupported cluster mode: %s", cfg.SetupCfg.ClusterMode) + } + + // Common Initialization on all nodes + utils.InfoPrintf("Starting common initialization on all nodes...\n") + if err := commonInit(cfg.AllNodes, cfg.SetupCfg, operationMode); err != nil { + utils.FatalPrintf("Failed during common initialization: %v\n", err) + return err + } + utils.InfoPrintf("Common initialization completed.\n") + + // Setup Master Node + utils.InfoPrintf("Setting up master node: %s\n", cfg.MasterNode) + joinToken, err := setupMaster(cfg.MasterNode, operationMode) + if err != nil { + utils.FatalPrintf("Failed to setup master node: %v\n", err) + return err + } + utils.InfoPrintf("Master node setup completed.\n") + + // Setup Worker Nodes + utils.InfoPrintf("Setting up worker nodes...\n") + if err := setupWorkers(cfg.WorkerNodes, joinToken, cfg.SetupCfg, operationMode); err != nil { + utils.FatalPrintf("Failed to setup worker nodes: %v\n", err) + return err + } + utils.InfoPrintf("Worker nodes setup completed.\n") + + time.Sleep(5 * time.Second) // Wait for nodes to stabilize + + // Extend CIDR if necessary + if cfg.SetupCfg.PodsPerNode > 240 { + if cfg.SetupCfg.PodsPerNode > 1022 { + utils.FatalPrintf("PODS_PER_NODE value %d is too high to extend CIDR range. Maximum supported is 1022.\n", cfg.SetupCfg.PodsPerNode) + return fmt.Errorf("PODS_PER_NODE value %d is too high to extend CIDR range. 
Maximum supported is 1022.", cfg.SetupCfg.PodsPerNode) + } + if len(cfg.AllNodes) > 63 { + utils.FatalPrintf("Total number of nodes %d is too high to extend CIDR range. Maximum supported is 63.\n", len(cfg.AllNodes)) + return fmt.Errorf("Total number of nodes %d is too high to extend CIDR range. Maximum supported is 63.", len(cfg.AllNodes)) + } + utils.InfoPrintf("Extending CIDR range...\n") + if err := extendCIDR(cfg.MasterNode, cfg.WorkerNodes, joinToken); err != nil { + utils.FatalPrintf("Failed to extend CIDR range: %v\n", err) + return err + } + utils.InfoPrintf("CIDR range extended.\n") + } + + time.Sleep(5 * time.Second) // Wait for nodes to stabilize + + // Finalize Cluster Setup + utils.InfoPrintf("Finalizing cluster setup...\n") + if err := finalizeClusterSetup(cfg.MasterNode, cfg.AllNodes); err != nil { + utils.FatalPrintf("Failed to finalize cluster setup: %v\n", err) + return err + } + utils.InfoPrintf("Cluster setup finalized.\n") + + // Label Nodes + utils.InfoPrintf("Labeling nodes...\n") + if err := loaderUtils.LabelNodes(cfg.MasterNode, configDir, configName); err != nil { + utils.FatalPrintf("Failed to label nodes: %v\n", err) + return err + } + utils.InfoPrintf("Node labeling completed.\n") + + // Deploy Prometheus if enabled + if cfg.SetupCfg.DeployPrometheus { + utils.InfoPrintf("Setting up Prometheus components...\n") + if err := setupPrometheus(cfg.MasterNode, cfg.AllNodes, cfg.PromConfig); err != nil { + utils.FatalPrintf("Failed to setup Prometheus components: %v\n", err) + return err + } + utils.InfoPrintf("Prometheus components setup completed.\n") + } + + utils.InfoPrintf("Multi-node cluster setup finished successfully!\n") + return nil +} diff --git a/scripts/setup/cluster/setup_knative.go b/scripts/setup/cluster/setup_knative.go new file mode 100644 index 000000000..d0d3be733 --- /dev/null +++ b/scripts/setup/cluster/setup_knative.go @@ -0,0 +1,452 @@ +package cluster + +import ( + "fmt" + "strings" + "sync" + "time" + + 
"github.com/vhive-serverless/loader/scripts/setup/configs" + loaderUtils "github.com/vhive-serverless/loader/scripts/setup/utils" + "github.com/vhive-serverless/vHive/scripts/utils" +) + +func commonInit(nodes []string, cfg *configs.SetupConfig, operationMode string) error { + var wg sync.WaitGroup + errChan := make(chan error, len(nodes)) + + for _, node := range nodes { + wg.Add(1) + go func(node string) { + defer wg.Done() + utils.InfoPrintf("Initializing node: %s\n", node) + // Clone vHive repository + utils.WaitPrintf("Cloning vHive repository on node %s...\n", node) + _, err := loaderUtils.ServerExec(node, fmt.Sprintf("git clone --branch=%s %s", cfg.HiveBranch, cfg.HiveRepo)) + if !utils.CheckErrorWithMsg(err, "Failed to clone vHive repository on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Install Go and setup node + utils.WaitPrintf("Installing Go and setting up node %s...\n", node) + _, err = loaderUtils.ServerExec(node, fmt.Sprintf("pushd ~/vhive/scripts > /dev/null && ./install_go.sh && source /etc/profile && go build -o setup_tool && ./setup_tool setup_node %s && popd > /dev/null", operationMode)) + if !utils.CheckErrorWithMsg(err, "Failed to install Go and setup node %s: %v \n", node, err) { + errChan <- err + return + } + + // Start containerd in tmux + utils.WaitPrintf("Starting containerd in tmux on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, "tmux new -s containerd -d") + if !utils.CheckErrorWithMsg(err, "Failed to create tmux session for containerd on node %s: %v \n", node, err) { + errChan <- err + return + } + _, err = loaderUtils.ServerExec(node, `tmux send -t containerd "sudo containerd 2>&1 | tee ~/containerd_log.txt" ENTER`) + if !utils.CheckErrorWithMsg(err, "Failed to start containerd in tmux on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Install chrony, htop, sysstat + utils.WaitPrintf("Installing chrony, htop, sysstat on node %s...\n", node) + _, err = 
loaderUtils.ServerExec(node, "sudo apt-get update && sudo apt-get install -y chrony htop sysstat etcd-client") + if !utils.CheckErrorWithMsg(err, "Failed to install chrony, htop, sysstat on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Configure chrony + utils.WaitPrintf("Configuring chrony on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, `echo "server ops.emulab.net iburst prefer" | sudo tee -a /etc/chrony/chrony.conf >/dev/null`) + if !utils.CheckErrorWithMsg(err, "Failed to configure chrony on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Restart chronyd + utils.WaitPrintf("Restarting chronyd on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, "sudo systemctl restart chronyd") + if !utils.CheckErrorWithMsg(err, "Failed to restart chronyd on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Check chrony tracking + utils.WaitPrintf("Checking chrony tracking on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, "sudo chronyc tracking") + if !utils.CheckErrorWithMsg(err, "Failed to check chrony tracking on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Clone loader repository + utils.WaitPrintf("Cloning loader repository on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, fmt.Sprintf("git clone --depth=1 --branch=%s %s loader", cfg.LoaderBranch, cfg.LoaderRepo)) + if !utils.CheckErrorWithMsg(err, "Failed to clone loader repository on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Install python3-pip + utils.WaitPrintf("Installing python3-pip on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, `echo -en "\n\n" | sudo apt-get install -y python3-pip`) + if !utils.CheckErrorWithMsg(err, "Failed to install python3-pip on node %s: %v \n", node, err) { + errChan <- err + return + } + + // Run stabilize script + utils.WaitPrintf("Running stabilize script on node %s...\n", node) + _, err = loaderUtils.ServerExec(node, 
"~/loader/scripts/setup/stabilize.sh") + if !utils.CheckErrorWithMsg(err, "Failed to run stabilize script on node %s: %v \n", node, err) { + errChan <- err + return + } + }(node) + } + + wg.Wait() + close(errChan) + + for err := range errChan { + return err + } + return nil +} + +func setupMaster(masterNode string, operationMode string) (string, error) { + setupCmd := fmt.Sprintf("pushd ~/vhive/scripts > /dev/null && ./setup_tool create_multinode_cluster %s && popd > /dev/null", operationMode) + // Create tmux sessions + _, err := loaderUtils.ServerExec(masterNode, "tmux new -s runner -d") + if !utils.CheckErrorWithMsg(err, "Failed to create tmux session 'runner' on master node %s: %v \n", masterNode, err) { + return "", err + } + _, err = loaderUtils.ServerExec(masterNode, "tmux new -s master -d") + if !utils.CheckErrorWithMsg(err, "Failed to create tmux session 'master' on master node %s: %v\n", masterNode, err) { + return "", err + } + + // Rewrite YAML files + _, err = loaderUtils.ServerExec(masterNode, "~/loader/scripts/setup/rewrite_yaml_files.sh") + if !utils.CheckErrorWithMsg(err, "Failed to rewrite YAML files on master node %s: %v\n", masterNode, err) { + return "", err + } + + // Execute setup command in master tmux session + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf(`tmux send -t master "%s" ENTER`, setupCmd)) + if !utils.CheckErrorWithMsg(err, "Failed to send setup command to master tmux session on master node %s: %v\n", masterNode, err) { + return "", err + } + + // Wait for masterKey.yaml to be created + utils.InfoPrintf("Waiting for master key to be generated...\n") + for { + _, err := loaderUtils.ServerExec(masterNode, "stat ~/vhive/scripts/masterKey.yaml") + if err == nil { + break + } + time.Sleep(2 * time.Second) + } + utils.InfoPrintf("Master key found.\n") + + // Get the join token + getTokenCmd := `awk '/^ApiserverAdvertiseAddress:/ {ip=$2} /^ApiserverPort:/ {port=$2} /^ApiserverToken:/ {token=$2} /^ApiserverTokenHash:/ 
{token_hash=$2} END {print "sudo kubeadm join " ip ":" port " --token " token " --discovery-token-ca-cert-hash " token_hash}' ~/vhive/scripts/masterKey.yaml` + joinToken, err := loaderUtils.ServerExec(masterNode, getTokenCmd) + if !utils.CheckErrorWithMsg(err, "Failed to get join token from master: %v\n", err) { + return "", err + } + + return strings.TrimSpace(joinToken), nil +} + +func setupWorkers(workerNodes []string, joinToken string, cfg *configs.SetupConfig, operationMode string) error { + var wg sync.WaitGroup + errChan := make(chan error, len(workerNodes)) + + for _, node := range workerNodes { + wg.Add(1) + go func(node string) { + defer wg.Done() + utils.InfoPrintf("Setting up worker: %s\n", node) + + // Setup worker kubelet + utils.WaitPrintf("Setting up worker kubelet on node %s...\n", node) + setupKubeletCmd := fmt.Sprintf("pushd ~/vhive/scripts > /dev/null && ./setup_tool setup_worker_kubelet %s && popd > /dev/null", operationMode) + _, err := loaderUtils.ServerExec(node, setupKubeletCmd) + if !utils.CheckErrorWithMsg(err, "Failed to setup worker kubelet on worker %s: %v\n", node, err) { + errChan <- err + return + } + + // Setup vHive Firecracker daemon if operation mode is firecracker + if operationMode == "firecracker" { + err = setupVhiveFirecrackerDaemon(node, cfg.ClusterMode) + if !utils.CheckErrorWithMsg(err, "Failed to setup vHive Firecracker daemon on worker %s: %v\n", node, err) { + errChan <- err + return + } + } + + // Join token + _, err = loaderUtils.ServerExec(node, joinToken) + if !utils.CheckErrorWithMsg(err, "Failed to join worker %s with token: %v\n", node, err) { + errChan <- err + return + } + + // Configure maxPods + _, err = loaderUtils.ServerExec(node, fmt.Sprintf("echo \"maxPods: %d\" | sudo tee -a /var/lib/kubelet/config.yaml >/dev/null", cfg.PodsPerNode)) + if !utils.CheckErrorWithMsg(err, "Failed to set maxPods on worker %s: %v\n", node, err) { + errChan <- err + return + } + + // Configure containerLogMaxSize + _, err = 
loaderUtils.ServerExec(node, `echo "containerLogMaxSize: 512Mi" | sudo tee -a /var/lib/kubelet/config.yaml >/dev/null`) + if !utils.CheckErrorWithMsg(err, "Failed to set containerLogMaxSize on worker %s: %v\n", node, err) { + errChan <- err + return + } + + // Restart kubelet + utils.WaitPrintf("Restarting kubelet on worker %s...\n", node) + _, err = loaderUtils.ServerExec(node, "sudo systemctl restart kubelet") + if !utils.CheckErrorWithMsg(err, "Failed to restart kubelet on worker %s: %v\n", node, err) { + errChan <- err + return + } + + time.Sleep(10 * time.Second) + + // Rejoin after restart + utils.WaitPrintf("Rejoining worker %s after kubelet restart...\n", node) + _, _ = loaderUtils.ServerExec(node, joinToken) + utils.InfoPrintf("Worker node %s has joined the cluster.\n", node) + }(node) + } + + wg.Wait() + close(errChan) + + for err := range errChan { + return err + } + return nil +} + +func extendCIDR(masterNode string, workerNodes []string, joinToken string) error { + utils.InfoPrintf("Extending CIDR for all nodes...\n") + + // Get node names first + nodeList, err := loaderUtils.ServerExec(masterNode, "kubectl get no | tail -n +2 | awk '{print $1}'") + if !utils.CheckErrorWithMsg(err, "Failed to get node names from master node %s: %v\n", masterNode, err) { + return err + } + nodeNames := strings.Fields(nodeList) + if len(nodeNames) == 0 { + return fmt.Errorf("could not retrieve any node names") + } + + for i, nodeName := range nodeNames { + subnet := i*4 + 4 + // Get node JSON, modify podCIDR, and save to node.yaml + getNodeCmd := fmt.Sprintf("kubectl get node %s -o json | jq '.spec.podCIDR |= \"10.168.%d.0/22\"' > node.yaml", nodeName, subnet) + _, err := loaderUtils.ServerExec(masterNode, getNodeCmd) + if !utils.CheckErrorWithMsg(err, "Failed to get and modify node %s JSON on master node %s: %v\n", nodeName, masterNode, err) { + return err + } + + // Delete the node + deleteNodeCmd := fmt.Sprintf("kubectl delete node %s", nodeName) + _, err = 
loaderUtils.ServerExec(masterNode, deleteNodeCmd) + if !utils.CheckErrorWithMsg(err, "Failed to delete node %s on master node %s: %v\n", nodeName, masterNode, err) { + return err + } + + // Create the node with updated CIDR + createNodeCmd := "kubectl create -f node.yaml" + _, err = loaderUtils.ServerExec(masterNode, createNodeCmd) + if !utils.CheckErrorWithMsg(err, "Failed to create node %s with updated CIDR on master node %s: %v\n", nodeName, masterNode, err) { + return err + } + utils.InfoPrintf("Changed pod CIDR for node %s to 10.168.%d.0/22\n", nodeName, subnet) + } + + // Rejoin the cluster for the 3rd time + for _, node := range workerNodes { + _, _ = loaderUtils.ServerExec(node, joinToken) + } + + return nil +} + +func finalizeClusterSetup(masterNode string, allNodes []string) error { + // Untaint master node + untaintCmd := "kubectl taint nodes $(hostname) node-role.kubernetes.io/control-plane-" + var err error // Declare err once + _, err = loaderUtils.ServerExec(masterNode, untaintCmd) + if !utils.CheckErrorWithMsg(err, "Failed to untaint master node %s: %v\n", masterNode, err) { + return err + } + + // Notify the master that all nodes have joined + _, err = loaderUtils.ServerExec(masterNode, `tmux send -t master "y" ENTER`) + if !utils.CheckErrorWithMsg(err, "Failed to notify master node %s: %v\n", masterNode, err) { + return err + } + + // Wait for knative-serving namespace + utils.InfoPrintf("Waiting for knative-serving namespace to be created...\n") + for { + out, innerErr := loaderUtils.ServerExec(masterNode, "kubectl get namespaces") // Use innerErr for loop scope + if innerErr == nil && strings.Contains(out, "knative-serving") { + break + } + time.Sleep(15 * time.Second) + } + utils.InfoPrintf("Knative-serving namespace is ready.\n") + + // Copy kubeconfig from master to all other nodes + err = copyK8sCertificates(masterNode, allNodes) + if !utils.CheckErrorWithMsg(err, "Failed to copy kubeconfig: %v\n", err) { + return err + } + + // Patch init 
scale + _, err = loaderUtils.ServerExec(masterNode, "cd ~/loader; bash scripts/setup/patch_init_scale.sh") + if !utils.CheckErrorWithMsg(err, "Failed to run patch_init_scale.sh on master node %s: %v\n", masterNode, err) { + return err + } + + // Enable affinity + _, err = loaderUtils.ServerExec(masterNode, `kubectl patch configmap -n knative-serving config-features -p '{"data": {"kubernetes.podspec-affinity": "enabled"}}'`) + if !utils.CheckErrorWithMsg(err, "Failed to enable affinity on master node %s: %v\n", masterNode, err) { + return err + } + + return nil +} + +func copyK8sCertificates(masterNode string, allNodes []string) error { + utils.InfoPrintf("Copying K8s certificates from master to all nodes...\n") + kubeconfig, err := loaderUtils.ServerExec(masterNode, "cat ~/.kube/config") + if !utils.CheckErrorWithMsg(err, "Failed to get kubeconfig from master node %s: %v\n", masterNode, err) { + return err + } + + var wg sync.WaitGroup + errChan := make(chan error, len(allNodes)-1) + for _, node := range allNodes { + if node == masterNode { + continue + } + wg.Add(1) + go func(node string) { + defer wg.Done() + // Create .kube directory + mkdirCmd := "mkdir -p ~/.kube" + _, err := loaderUtils.ServerExec(node, mkdirCmd) + if !utils.CheckErrorWithMsg(err, "Failed to create .kube directory on node %s: %v\n", node, err) { + errChan <- err + return + } + + // Write kubeconfig to file + echoKubeconfigCmd := fmt.Sprintf("echo '%s' > ~/.kube/config", kubeconfig) + _, err = loaderUtils.ServerExec(node, echoKubeconfigCmd) + if !utils.CheckErrorWithMsg(err, "Failed to write kubeconfig to node %s: %v\n", node, err) { + errChan <- err + } + }(node) + } + wg.Wait() + close(errChan) + for err := range errChan { + return err + } + utils.InfoPrintf("Successfully copied K8s certificates.\n") + return nil +} + +func distributeLoaderSSHKey(loaderNode string, allNodes []string) error { + _, err := loaderUtils.ServerExec(loaderNode, `echo -e "\n\n\n" | ssh-keygen -t rsa -N ""`) + if 
!utils.CheckErrorWithMsg(err, "Failed to generate SSH key on loader node %s: %v\n", loaderNode, err) { + return err + } + + pubKey, err := loaderUtils.ServerExec(loaderNode, "cat ~/.ssh/id_rsa.pub") + if !utils.CheckErrorWithMsg(err, "Failed to get public key from loader node %s: %v\n", loaderNode, err) { + return err + } + + var wg sync.WaitGroup + errChan := make(chan error, len(allNodes)) + for _, node := range allNodes { + wg.Add(1) + go func(node string) { + defer wg.Done() + cmd := fmt.Sprintf("echo '%s' >> ~/.ssh/authorized_keys", strings.TrimSpace(pubKey)) + _, err := loaderUtils.ServerExec(node, cmd) + if !utils.CheckErrorWithMsg(err, "Failed to distribute SSH key to node %s: %v\n", node, err) { + errChan <- err + } + }(node) + } + wg.Wait() + close(errChan) + for err := range errChan { + return err + } + utils.InfoPrintf("Successfully distributed loader SSH key.\n") + return nil +} + +// setupVhiveFirecrackerDaemon sets up the vHive Firecracker daemon on a given node. +func setupVhiveFirecrackerDaemon(node string, clusterMode string) error { + // Build vHive + _, err := loaderUtils.ServerExec(node, "cd vhive; source /etc/profile && go build") + if !utils.CheckErrorWithMsg(err, "Failed to build vHive on node %s: %v\n", node, err) { + return err + } + + // Create tmux session for firecracker + _, err = loaderUtils.ServerExec(node, "tmux new -s firecracker -d") + if !utils.CheckErrorWithMsg(err, "Failed to create tmux session 'firecracker' on node %s: %v\n", node, err) { + return err + } + + // Start firecracker-containerd + _, err = loaderUtils.ServerExec(node, `tmux send -t firecracker "sudo PATH=$PATH /usr/local/bin/firecracker-containerd --config /etc/firecracker-containerd/config.toml 2>&1 | tee ~/firecracker_log.txt" ENTER`) + if !utils.CheckErrorWithMsg(err, "Failed to start firecracker-containerd on node %s: %v\n", node, err) { + return err + } + + // Create tmux session for vhive + _, err = loaderUtils.ServerExec(node, "tmux new -s vhive -d") + if 
!utils.CheckErrorWithMsg(err, "Failed to create tmux session 'vhive' on node %s: %v\n", node, err) { + return err + } + + // Change directory to vhive in tmux + _, err = loaderUtils.ServerExec(node, `tmux send -t vhive "cd vhive" ENTER`) + if !utils.CheckErrorWithMsg(err, "Failed to change directory to vhive in tmux on node %s: %v\n", node, err) { + return err + } + + // Determine firecracker snapshots argument based on cluster mode + firecrackerSnapshots := "" + if clusterMode == "firecracker_snapshots" { + firecrackerSnapshots = "-snapshots" + } + + // Run vhive + runVhiveCmd := fmt.Sprintf("sudo ./vhive %s 2>&1 | tee ~/vhive_log.txt", firecrackerSnapshots) + _, err = loaderUtils.ServerExec(node, fmt.Sprintf(`tmux send -t vhive "%s" ENTER`, runVhiveCmd)) + if !utils.CheckErrorWithMsg(err, "Failed to run vhive on node %s: %v\n", node, err) { + return err + } + + return nil +} diff --git a/scripts/setup/cluster/setup_prometheus.go b/scripts/setup/cluster/setup_prometheus.go new file mode 100644 index 000000000..fd8e92015 --- /dev/null +++ b/scripts/setup/cluster/setup_prometheus.go @@ -0,0 +1,157 @@ +package cluster + +import ( + "fmt" + "time" + + "github.com/vhive-serverless/loader/scripts/setup/configs" + loaderUtils "github.com/vhive-serverless/loader/scripts/setup/utils" + "github.com/vhive-serverless/vHive/scripts/utils" +) + +// setupPrometheus sets up Prometheus components on the master node. 
+func setupPrometheus(masterNode string, allNode []string, promConfig *configs.PrometheusConfig) error { + // Install htop + utils.WaitPrintf("Installing htop on master node %s...\n", masterNode) + _, err := loaderUtils.ServerExec(masterNode, "sudo apt install htop") + if !utils.CheckErrorWithMsg(err, "Failed to install htop on master node %s: %v\n", masterNode, err) { + return err + } + + for _, node := range allNode { + utils.WaitPrintf("Set Paranoid level on node %s...\n", node) + _, err := loaderUtils.ServerExec(node, "sudo sysctl -w kernel.perf_event_paranoid=-1") + if !utils.CheckErrorWithMsg(err, "Failed to set Paranoid level on node %s: %v\n", node, err) { + return err + } + } + + // Deploy Metrics Server + utils.WaitPrintf("Deploying Metrics Server version %s on master node %s...\n", promConfig.MetricsServerVersion, masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/download/%s/components.yaml", promConfig.MetricsServerVersion)) + if !utils.CheckErrorWithMsg(err, "Failed to deploy Metrics Server on master node %s: %v\n", masterNode, err) { + return err + } + + // Patch Metrics Server deployment to allow insecure TLS + utils.WaitPrintf("Patching Metrics Server deployment for insecure TLS on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, `kubectl patch deployment metrics-server -n kube-system --type='json' -p='[{"op": "add", "path": "/spec/template/spec/containers/0/args/-", "value": "--kubelet-insecure-tls=true"}]'`) + if !utils.CheckErrorWithMsg(err, "Failed to patch Metrics Server deployment on master node %s: %v\n", masterNode, err) { + return err + } + + // Install Helm + utils.WaitPrintf("Installing Helm on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "curl https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash") + if !utils.CheckErrorWithMsg(err, "Failed to install 
Helm on master node %s: %v\n", masterNode, err) { + return err + } + + time.Sleep(1 * time.Second) + + // Add Prometheus Helm repository + utils.WaitPrintf("Adding Prometheus Helm repository on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "helm repo add prometheus-community https://prometheus-community.github.io/helm-charts") + if !utils.CheckErrorWithMsg(err, "Failed to add Prometheus Helm repository on master node %s: %v\n", masterNode, err) { + return err + } + + // Update Helm repositories + utils.WaitPrintf("Updating Helm repositories on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "helm repo update") + if !utils.CheckErrorWithMsg(err, "Failed to update Helm repositories on master node %s: %v\n", masterNode, err) { + return err + } + + // Create monitoring namespace + utils.WaitPrintf("Creating 'monitoring' namespace on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "kubectl create namespace monitoring") + if !utils.CheckErrorWithMsg(err, "Failed to create 'monitoring' namespace on master node %s: %v\n", masterNode, err) { + return err + } + + // Install and start Prometheus stack using helm + utils.WaitPrintf("Installing Prometheus stack (version %s) on master node %s...\n", promConfig.PromChartVersion, masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("helm install -n monitoring prometheus --version %s prometheus-community/kube-prometheus-stack -f %s/prom_values.yaml", promConfig.PromChartVersion, promConfig.PromValuePath)) + if !utils.CheckErrorWithMsg(err, "Failed to install Prometheus stack on master node %s: %v\n", masterNode, err) { + return err + } + + // Install Prometheus pushgateway using helm + utils.WaitPrintf("Installing Prometheus pushgateway (version %s) on master node %s...\n", promConfig.PushgatewayChartVersion, masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("helm install -n monitoring 
prometheus-pushgateway --version %s prometheus-community/prometheus-pushgateway -f %s/prom_pushgateway.yaml", promConfig.PushgatewayChartVersion, promConfig.PromValuePath)) + if !utils.CheckErrorWithMsg(err, "Failed to install Prometheus pushgateway on master node %s: %v\n", masterNode, err) { + return err + } + + // Apply ServiceMonitors/PodMonitors for Knative metrics + utils.WaitPrintf("Applying Knative ServiceMonitors/PodMonitors on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("curl -sL %s/config/serving-monitors.yaml | sed 's/interval: 30s/interval: 2s/g' | kubectl apply -f -", promConfig.KnativePromURL)) + if !utils.CheckErrorWithMsg(err, "Failed to apply Knative ServiceMonitors/PodMonitors on master node %s: %v\n", masterNode, err) { + return err + } + + // Apply Grafana dashboards for Knative + utils.WaitPrintf("Applying Knative Grafana dashboards on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf(`curl -sL %s/config/configmap-serving-dashboard.json | sed 's/"namespace": "knative-serving"/"namespace": "monitoring"/g' | kubectl apply -f -`, promConfig.KnativePromURL)) + if !utils.CheckErrorWithMsg(err, "Failed to apply Knative Grafana dashboards on master node %s: %v\n", masterNode, err) { + return err + } + + // Re-render the controller manager manifest so Prometheus can scrape it on + // all interfaces using the existing kubeadm init configuration file. + utils.WaitPrintf("Binding controller manager address on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("sudo kubeadm init phase control-plane controller-manager --config %s/kubeadm_init.yaml --v=7", promConfig.PromValuePath)) + if !utils.CheckErrorWithMsg(err, "Failed to bind controller manager address on master node %s: %v\n", masterNode, err) { + return err + } + + // Re-render the scheduler manifest using the same single kubeadm config file. 
+ utils.WaitPrintf("Binding scheduler address on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("sudo kubeadm init phase control-plane scheduler --config %s/kubeadm_init.yaml --v=7", promConfig.PromValuePath)) + if !utils.CheckErrorWithMsg(err, "Failed to bind scheduler address on master node %s: %v\n", masterNode, err) { + return err + } + + // kube-proxy is configured via the kube-system/kube-proxy ConfigMap, so + // re-apply it from the init config before restarting the DaemonSet pods. + utils.WaitPrintf("Updating kube-proxy metrics bind address on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, fmt.Sprintf("sudo kubeadm init phase addon kube-proxy --config %s/kubeadm_init.yaml --v=7", promConfig.PromValuePath)) + if !utils.CheckErrorWithMsg(err, "Failed to update kube-proxy configuration on master node %s: %v\n", masterNode, err) { + return err + } + + // Restart the kube-proxy + utils.WaitPrintf("Restarting kube-proxy on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "kubectl delete pod -l k8s-app=kube-proxy -n kube-system") + if !utils.CheckErrorWithMsg(err, "Failed to restart kube-proxy on master node %s: %v\n", masterNode, err) { + return err + } + + time.Sleep(5 * time.Second) + + // Set up Prometheus port-forwarding in tmux + utils.WaitPrintf("Setting up Prometheus port-forwarding on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "tmux new -s prometheusd -d") + if !utils.CheckErrorWithMsg(err, "Failed to create tmux session for Prometheus on master node %s: %v\n", masterNode, err) { + return err + } + _, err = loaderUtils.ServerExec(masterNode, `tmux send -t prometheusd "while true; do kubectl port-forward -n monitoring service/prometheus-kube-prometheus-prometheus 9090; done" ENTER`) + if !utils.CheckErrorWithMsg(err, "Failed to set up Prometheus port-forwarding in tmux on master node %s: %v\n", masterNode, err) { 
+ return err + } + + // Set up Grafana dashboard port-forwarding in tmux (id: admin, pwd: prom-operator) + utils.WaitPrintf("Setting up Grafana dashboard port-forwarding on master node %s...\n", masterNode) + _, err = loaderUtils.ServerExec(masterNode, "tmux new -s grafanad -d") + if !utils.CheckErrorWithMsg(err, "Failed to create tmux session for Grafana on master node %s: %v\n", masterNode, err) { + return err + } + _, err = loaderUtils.ServerExec(masterNode, `tmux send -t grafanad "while true; do kubectl -n monitoring port-forward deployment/prometheus-grafana 3000; done" ENTER`) + if !utils.CheckErrorWithMsg(err, "Failed to set up Grafana dashboard port-forwarding in tmux on master node %s: %v\n", masterNode, err) { + return err + } + + utils.InfoPrintf("Done setting up Prometheus components.\n") + return nil +} diff --git a/scripts/setup/configs/config.go b/scripts/setup/configs/config.go new file mode 100644 index 000000000..612b0c42b --- /dev/null +++ b/scripts/setup/configs/config.go @@ -0,0 +1,165 @@ +package configs + +import ( + "encoding/json" + "os" + "path/filepath" + "strconv" + + "github.com/vhive-serverless/vHive/scripts/utils" +) + +type InvitroSetupConfig struct { + MasterNode string + LoaderNode string + WorkerNodes []string + AllNodes []string + + SetupCfg *SetupConfig + PromConfig *PrometheusConfig +} + +type NodeSetup struct { + NodeSetup struct { + MasterNode []string `json:"MASTER_NODE"` + LoaderNode []string `json:"LOADER_NODE"` + WorkerNode []string `json:"WORKER_NODE"` + } `json:"NODE_SETUP"` + NodeLabel map[string][]string `json:"NODE_LABEL"` + NodeURL []string `json:"NODE_URL"` +} + +type SetupConfig struct { + HiveRepo string `json:"VHIVE_REPO"` + HiveBranch string `json:"VHIVE_BRANCH"` + LoaderRepo string `json:"LOADER_REPO"` + LoaderBranch string `json:"LOADER_BRANCH"` + ClusterMode string `json:"CLUSTER_MODE"` + PodsPerNode int `json:"PODS_PER_NODE"` + DeployPrometheus bool `json:"DEPLOY_PROMETHEUS"` +} + +type PrometheusConfig 
struct { + MetricsServerVersion string `json:"MetricsServerVersion"` + PromChartVersion string `json:"PromChartVersion"` + PushgatewayChartVersion string `json:"PushgatewayChartVersion"` + PromValuePath string `json:"PromValuePath"` + KnativePromURL string `json:"KnativePromURL"` +} + +func CommonConfigSetup(configDir string, configName string) (*InvitroSetupConfig, error) { + // Load Configurations + _, extNodeSetup, err := GetNodeSetup(configDir, configName) + if err != nil { + utils.FatalPrintf("Failed to get node setup config: %v\n", err) + return nil, err + } + + setupCfg, err := GetSetupJSON(configDir) + if err != nil { + utils.FatalPrintf("Failed to get setup config: %v\n", err) + return nil, err + } + + promConfig, err := GetPromConfig(configDir) + if err != nil { + utils.FatalPrintf("Failed to get Prometheus config: %v\n", err) + return nil, err + } + + masterNode := extNodeSetup.NodeSetup.MasterNode[0] + loaderNode := extNodeSetup.NodeSetup.LoaderNode[0] + workerNodes := extNodeSetup.NodeSetup.WorkerNode + allNodes := append([]string{masterNode}, workerNodes...) 
+ + return &InvitroSetupConfig{ + MasterNode: masterNode, + LoaderNode: loaderNode, + WorkerNodes: workerNodes, + AllNodes: allNodes, + SetupCfg: setupCfg, + PromConfig: promConfig, + }, nil +} + +func GetNodeSetup(path string, configName string) (*NodeSetup, *NodeSetup, error) { + configPath := filepath.Join(path, configName) + configFile, err := os.ReadFile(configPath) + if err != nil { + return nil, nil, err + } + + var intNodeSetup NodeSetup + var extlNodeSetup NodeSetup + err = json.Unmarshal(configFile, &intNodeSetup) + if err != nil { + return nil, nil, err + } + + // Map internal IPs to real world URLs + ipToURL := mapNodeURLs(&intNodeSetup) + + extlNodeSetup.NodeSetup.MasterNode = swapIPs(intNodeSetup.NodeSetup.MasterNode, ipToURL) + extlNodeSetup.NodeSetup.LoaderNode = swapIPs(intNodeSetup.NodeSetup.LoaderNode, ipToURL) + extlNodeSetup.NodeSetup.WorkerNode = swapIPs(intNodeSetup.NodeSetup.WorkerNode, ipToURL) + + extlNodeSetup.NodeLabel = make(map[string][]string) + for k, v := range intNodeSetup.NodeLabel { + extlNodeSetup.NodeLabel[k] = swapIPs(v, ipToURL) + } + + return &intNodeSetup, &extlNodeSetup, nil +} + +func GetSetupJSON(path string) (*SetupConfig, error) { + configPath := filepath.Join(path, "setup.json") + configFile, err := os.ReadFile(configPath) + if err != nil { + return nil, err + } + + var setupConfig SetupConfig + err = json.Unmarshal(configFile, &setupConfig) + if err != nil { + return nil, err + } + + return &setupConfig, nil +} + +func GetPromConfig(path string) (*PrometheusConfig, error) { + configPath := filepath.Join(path, "prometheus/prom_config.json") + configFile, err := os.ReadFile(configPath) + if err != nil { + return nil, err + } + + var promConfig PrometheusConfig + err = json.Unmarshal(configFile, &promConfig) + if err != nil { + return nil, err + } + + return &promConfig, nil +} + +func mapNodeURLs(nodeSetup *NodeSetup) map[string]string { + mapping := make(map[string]string) + for i, url := range nodeSetup.NodeURL { + 
ip := "10.0.1." + strconv.Itoa(i+1) + mapping[ip] = url + } + return mapping +} + +func swapIPs(nodes []string, ipToURL map[string]string) []string { + swapped := make([]string, len(nodes)) + for i, ip := range nodes { + if url, ok := ipToURL[ip]; ok { + swapped[i] = url + } else { + swapped[i] = ip + } + } + return swapped +} diff --git a/scripts/setup/configs/node_setup.json b/scripts/setup/configs/node_setup.json new file mode 100644 index 000000000..ab777aa14 --- /dev/null +++ b/scripts/setup/configs/node_setup.json @@ -0,0 +1,30 @@ +{ + "NODE_SETUP": { + "MASTER_NODE": [ + "10.0.1.1" + ], + "LOADER_NODE": [ + "10.0.1.2" + ], + "WORKER_NODE": [ + "10.0.1.2", + "10.0.1.3", + "10.0.1.4" + ] + }, + + "NODE_LABEL": { + "loader-nodetype=master": [ + "10.0.1.1" + ], + "loader-nodetype=monitoring": [ + "10.0.1.2" + ], + "loader-nodetype=worker": [ + "10.0.1.3", + "10.0.1.4" + ] + }, + + "NODE_URL": ["username@example.com", "username@example.com", "username@example.com", "username@example.com"] +} \ No newline at end of file diff --git a/config/kubeadm_init.yaml b/scripts/setup/configs/prometheus/kubeadm_init.yaml similarity index 52% rename from config/kubeadm_init.yaml rename to scripts/setup/configs/prometheus/kubeadm_init.yaml index 31334b009..d7e7aff6e 100644 --- a/config/kubeadm_init.yaml +++ b/scripts/setup/configs/prometheus/kubeadm_init.yaml @@ -1,15 +1,17 @@ -apiVersion: kubeadm.k8s.io/v1beta3 +apiVersion: kubeadm.k8s.io/v1beta4 kind: InitConfiguration localAPIEndpoint: advertiseAddress: 10.0.1.1 +timeouts: + controlPlaneComponentHealthCheck: 4m0s --- -apiVersion: kubeadm.k8s.io/v1beta3 +apiVersion: kubeadm.k8s.io/v1beta4 kind: ClusterConfiguration apiServer: extraArgs: - authorization-mode: Node,RBAC - timeoutForControlPlane: 4m0s # The timeout that we wait for the API server to appear + - name: authorization-mode + value: Node,RBAC certificatesDir: /etc/kubernetes/pki clusterName: kubernetes dns: {} @@ -17,7 +19,8 @@ etcd: local: dataDir: /var/lib/etcd 
extraArgs: - listen-metrics-urls: http://0.0.0.0:2381 + - name: listen-metrics-urls + value: http://0.0.0.0:2381 imageRepository: registry.k8s.io kubernetesVersion: v1.34.3 networking: @@ -26,21 +29,21 @@ networking: serviceSubnet: 10.96.0.0/12 controllerManager: extraArgs: - bind-address: 0.0.0.0 # This allows scheduler and controllers to be scraped on other ports and nodes. + - name: bind-address + value: 0.0.0.0 scheduler: extraArgs: - bind-address: 0.0.0.0 + - name: bind-address + value: 0.0.0.0 --- -# Configure kubelet for large pod limits. apiVersion: kubelet.config.k8s.io/v1beta1 kind: KubeletConfiguration -kubeAPIQPS: 50 # Default: 5 -kubeAPIBurst: 100 # Default: 10 -configMapAndSecretChangeDetectionStrategy: "Watch" - +kubeAPIQPS: 50 +kubeAPIBurst: 100 +configMapAndSecretChangeDetectionStrategy: Watch --- apiVersion: kubeproxy.config.k8s.io/v1alpha1 kind: KubeProxyConfiguration -metricsBindAddress: 0.0.0.0:10249 \ No newline at end of file +metricsBindAddress: 0.0.0.0:10249 diff --git a/scripts/setup/configs/prometheus/prom_config.json b/scripts/setup/configs/prometheus/prom_config.json new file mode 100644 index 000000000..b8c8a7c9d --- /dev/null +++ b/scripts/setup/configs/prometheus/prom_config.json @@ -0,0 +1,7 @@ +{ + "MetricsServerVersion": "v0.7.2", + "PromChartVersion": "72.6.2", + "PushgatewayChartVersion": "3.4.1", + "PromValuePath": "~/loader/scripts/setup/configs/prometheus", + "KnativePromURL": "https://raw.githubusercontent.com/knative-extensions/monitoring/2427116e8e3cf0c65118c42a7f586490a1bb3ec2" +} \ No newline at end of file diff --git a/scripts/setup/configs/prometheus/prom_pushgateway.yaml b/scripts/setup/configs/prometheus/prom_pushgateway.yaml new file mode 100644 index 000000000..abc142db9 --- /dev/null +++ b/scripts/setup/configs/prometheus/prom_pushgateway.yaml @@ -0,0 +1,26 @@ +enabled: true +serviceMonitor: + enabled: true + additionalLabels: + release: prometheus + interval: "1s" +affinity: + nodeAffinity: + 
requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: loader-nodetype + operator: In + values: + - monitoring + - singlenode +ingress: + enabled: true + ingressClassName: "" + annotations: + kubernetes.io/ingress.class: "istio" + hosts: + - prom-pushgateway.monitoring.10.200.3.4.sslip.io + paths: + - / + pathType: Prefix \ No newline at end of file diff --git a/config/prometh_values_kn.yaml b/scripts/setup/configs/prometheus/prom_values.yaml similarity index 86% rename from config/prometh_values_kn.yaml rename to scripts/setup/configs/prometheus/prom_values.yaml index a50618b12..02ae4c78d 100644 --- a/config/prometh_values_kn.yaml +++ b/scripts/setup/configs/prometheus/prom_values.yaml @@ -93,6 +93,10 @@ prometheus-node-exporter: prometheus: monitor: interval: "15s" + # extraArgs: + # # enable perf collector + # - --collector.perf + # - --collector.perf.cpus=0-27 # adjust according to number of cores (xl170 has 10 cores, c6620 has 28 cores) prometheusOperator: admissionWebhooks: @@ -180,3 +184,15 @@ prometheus: requests: memory: 150Mi enableAdminAPI: true + +# additionalPrometheusRulesMap: +# perf-custom: +# groups: +# - name: "custom:perf_rules" +# rules: +# - record: "custom_perf_instructions_per_cycle:rate1m" +# expr: | +# sum by (cpu,instance) (rate(node_perf_instructions_total[1m])) / +# sum by (cpu,instance) (rate(node_perf_cpucycles_total[1m])) +# labels: +# unit: "instructions/cycle" \ No newline at end of file diff --git a/scripts/setup/configs/setup.json b/scripts/setup/configs/setup.json new file mode 100644 index 000000000..ab6e5b7b9 --- /dev/null +++ b/scripts/setup/configs/setup.json @@ -0,0 +1,9 @@ +{ + "VHIVE_REPO": "https://github.com/vhive-serverless/vhive", + "VHIVE_BRANCH": "v1.8.2", + "LOADER_REPO": "https://github.com/vhive-serverless/invitro", + "LOADER_BRANCH": "main", + "CLUSTER_MODE": "container", + "PODS_PER_NODE": 240, + "DEPLOY_PROMETHEUS": false +} \ No newline at end of file diff --git 
a/scripts/setup/expose_infra_metrics.sh b/scripts/setup/expose_infra_metrics.sh index d6dec0034..c5222f6ea 100755 --- a/scripts/setup/expose_infra_metrics.sh +++ b/scripts/setup/expose_infra_metrics.sh @@ -47,7 +47,7 @@ server_exec() { server_exec 'kubectl create namespace monitoring' release_label="prometheus" prometheus_chart_version="72.6.2" - server_exec "cd loader; helm install -n monitoring $release_label --version $prometheus_chart_version prometheus-community/kube-prometheus-stack -f config/prometh_values_kn.yaml" + server_exec "cd loader; helm install -n monitoring $release_label --version $prometheus_chart_version prometheus-community/kube-prometheus-stack -f ~/loader/scripts/setup/configs/prometheus/prom_values.yaml" #* Apply the ServiceMonitors/PodMonitors to collect metrics from Knative. #* The ports of the control manager and scheduler are mapped in a way that prometheus default installation can find them. @@ -57,7 +57,10 @@ server_exec() { server_exec "curl -sL $commit_version/config/configmap-serving-dashboard.json | sed 's/"namespace": "knative-serving"/"namespace": "monitoring"/g' | kubectl apply -f -" #* Bind addresses of the control manager and scheduler to "0.0.0.0" so that prometheus can scrape them from any domains. - server_exec 'cd loader; sudo kubeadm upgrade apply --config config/kubeadm_init.yaml --ignore-preflight-errors all --yes --v=7' + server_exec 'sudo kubeadm init phase control-plane controller-manager --config ~/loader/scripts/setup/configs/prometheus/kubeadm_init.yaml --v=7' + server_exec 'sudo kubeadm init phase control-plane scheduler --config ~/loader/scripts/setup/configs/prometheus/kubeadm_init.yaml --v=7' + #* Re-apply kube-proxy configuration so Prometheus can scrape kube-proxy metrics. + server_exec 'sudo kubeadm init phase addon kube-proxy --config ~/loader/scripts/setup/configs/prometheus/kubeadm_init.yaml --v=7' #* Restart the kube-proxy to apply the changes. 
server_exec 'kubectl delete pod -l k8s-app=kube-proxy -n kube-system' diff --git a/scripts/setup/go.mod b/scripts/setup/go.mod new file mode 100644 index 000000000..9e3e26382 --- /dev/null +++ b/scripts/setup/go.mod @@ -0,0 +1,7 @@ +module github.com/vhive-serverless/loader/scripts/setup + +go 1.26.2 + +require github.com/vhive-serverless/vHive/scripts/configs v0.0.0-20231114022852-400a49d284cb // indirect + +require github.com/vhive-serverless/vHive/scripts/utils v0.0.0-20231114022852-400a49d284cb diff --git a/scripts/setup/go.sum b/scripts/setup/go.sum new file mode 100644 index 000000000..8a34ec0f9 --- /dev/null +++ b/scripts/setup/go.sum @@ -0,0 +1,4 @@ +github.com/vhive-serverless/vHive/scripts/configs v0.0.0-20231114022852-400a49d284cb h1:JoudbjIOvkLiDY7WjaQosP0IKipHnlxFOCIeICQrtRQ= +github.com/vhive-serverless/vHive/scripts/configs v0.0.0-20231114022852-400a49d284cb/go.mod h1:nJSon4Eng7PdZ4HJX9dnZ7H4qxVm/r5zseFPfom7Jto= +github.com/vhive-serverless/vHive/scripts/utils v0.0.0-20231114022852-400a49d284cb h1:SmFLKJc4wAhr61ige8hoLiLWDKAI2uewBQUNejI63Hs= +github.com/vhive-serverless/vHive/scripts/utils v0.0.0-20231114022852-400a49d284cb/go.mod h1:xyjKlPn6JqSQtzKOCu8L4DW4rQcNmhxK9f97cOPo0Sg= diff --git a/scripts/setup/setup.go b/scripts/setup/setup.go new file mode 100644 index 000000000..e47326b60 --- /dev/null +++ b/scripts/setup/setup.go @@ -0,0 +1,59 @@ +package main + +import ( + "flag" + "fmt" + "os" + "path/filepath" + + "github.com/vhive-serverless/loader/scripts/setup/cluster" + loaderUtils "github.com/vhive-serverless/loader/scripts/setup/utils" + "github.com/vhive-serverless/vHive/scripts/utils" +) + +var ( + Setup = flag.String("setup-type", "create_multinode_cluster", "Type of setup to perform") + configName = flag.String("config", "node_setup.json", "Configuration file name") + configDir = flag.String("config-dir", "", "Path to setup config directory") +) + +func main() { + flag.Parse() + + availableCmds := []string{ + "create_multinode_cluster", + } 
+ + dir, err := loaderUtils.ResolveConfigDir(*configDir, *configName) + if err != nil { + utils.FatalPrintf("%v\n", err) + os.Exit(1) + } + + switch *Setup { + case "create_multinode_cluster": + selectedConfigName := *configName + if flag.NArg() > 0 { + selectedConfigName, err = loaderUtils.CreateTempNodeSetup(dir, flag.Args()) + if err != nil { + utils.FatalPrintf("Failed to create temporary node setup config: %v\n", err) + os.Exit(1) + } + defer os.Remove(filepath.Join(dir, selectedConfigName)) + } + + err := cluster.CreateMultiNodeSetup(dir, selectedConfigName) + if err != nil { + utils.FatalPrintf("Failed to create multinode cluster: %v\n", err) + os.Exit(1) + } + // Call the function to create a multinode cluster + + default: + utils.FatalPrintf("Invalid subcommand --> %s! Available subcommands list: \n", *Setup) + for _, subCmd := range availableCmds { + fmt.Printf("%s\n", subCmd) + } + os.Exit(1) + } +} diff --git a/scripts/setup/utils/label.go b/scripts/setup/utils/label.go new file mode 100644 index 000000000..1f57a8eb2 --- /dev/null +++ b/scripts/setup/utils/label.go @@ -0,0 +1,44 @@ +package utils + +import ( + "fmt" + "strings" + + "github.com/vhive-serverless/loader/scripts/setup/configs" + "github.com/vhive-serverless/vHive/scripts/utils" +) + +// LabelNodes applies labels to Kubernetes nodes based on the configuration file using kubectl commands. +// masterNode is the IP address of the master node where kubectl commands will be executed. +// configDir specifies the directory where `node_setup.json` is located. 
+func LabelNodes(masterNode, configDir, configName string) error { + _, extNodeSetup, err := configs.GetNodeSetup(configDir, configName) + if !utils.CheckErrorWithMsg(err, "Failed to get node setup config from %s: %v", configDir, err) { + return err + } + + for label, nodeIPs := range extNodeSetup.NodeLabel { + for _, nodeIP := range nodeIPs { + // Get the hostname of the node from its IP + nodeName, err := ServerExec(nodeIP, "hostname") + if !utils.CheckErrorWithMsg(err, "Failed to get hostname for node IP %s: %v", nodeIP, err) { + // Continue labeling other nodes even if one fails + utils.FatalPrintf("Error getting hostname for node IP %s: %v\n", nodeIP, err) + continue + } + nodeName = strings.TrimSpace(nodeName) + + // Apply the label using kubectl on the master node + labelCmd := fmt.Sprintf("kubectl label nodes %s %s --overwrite", nodeName, label) + _, err = ServerExec(masterNode, labelCmd) + if !utils.CheckErrorWithMsg(err, "Failed to label node %s with %s on master node %s: %v", nodeName, label, masterNode, err) { + // Continue labeling other nodes even if one fails + utils.FatalPrintf("Error labeling node %s with %s: %v\n", nodeName, label, err) + } else { + utils.InfoPrintf("Successfully labeled node %s with %s\n", nodeName, label) + } + } + } + + return nil +} diff --git a/scripts/setup/utils/utils.go b/scripts/setup/utils/utils.go new file mode 100644 index 000000000..b60f503f0 --- /dev/null +++ b/scripts/setup/utils/utils.go @@ -0,0 +1,132 @@ +package utils + +import ( + "bufio" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" +) + +func ReadSetupCfg(path string) (map[string]string, error) { + file, err := os.Open(path) + if err != nil { + return nil, err + } + defer file.Close() + + config := make(map[string]string) + scanner := bufio.NewScanner(file) + for scanner.Scan() { + line := scanner.Text() + if strings.Contains(line, "=") { + parts := strings.SplitN(line, "=", 2) + key := strings.TrimSpace(parts[0]) + value := 
strings.Trim(strings.TrimSpace(parts[1]), "'")
+			config[key] = value
+		}
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+
+	return config, nil
+}
+
+func ServerExec(node, command string) (string, error) {
+	cmd := exec.Command("ssh", "-oStrictHostKeyChecking=no", "-p", "22", node, command)
+	output, err := cmd.CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("failed to execute command on node %s: %v, output: %s", node, err, string(output))
+	}
+	return string(output), nil
+}
+
+func ResolveConfigDir(input string, configName string) (string, error) {
+	if input != "" {
+		if isValidConfigDir(input) {
+			return input, nil
+		}
+		return "", fmt.Errorf("config directory %q does not contain setup.json and prometheus/prom_config.json", input)
+	}
+
+	candidates := []string{
+		"configs",
+		"scripts/setup/configs",
+	}
+
+	for _, dir := range candidates {
+		if isValidConfigDir(dir) {
+			return dir, nil
+		}
+	}
+
+	return "", fmt.Errorf("could not locate config directory; pass -config-dir")
+}
+
+type tempNodeSetup struct {
+	NodeSetup struct {
+		MasterNode []string `json:"MASTER_NODE"`
+		LoaderNode []string `json:"LOADER_NODE"`
+		WorkerNode []string `json:"WORKER_NODE"`
+	} `json:"NODE_SETUP"`
+	NodeLabel map[string][]string `json:"NODE_LABEL"`
+	NodeURL   []string            `json:"NODE_URL"`
+}
+
+func CreateTempNodeSetup(configDir string, nodes []string) (string, error) {
+	if len(nodes) < 2 {
+		return "", fmt.Errorf("expected at least 2 nodes (master and loader), got %d", len(nodes))
+	}
+
+	const tempConfigName = "node_setup_temp.json"
+
+	setup := tempNodeSetup{
+		NodeLabel: make(map[string][]string),
+		NodeURL:   append([]string(nil), nodes...),
+	}
+
+	setup.NodeSetup.MasterNode = []string{"10.0.1.1"}
+	setup.NodeSetup.LoaderNode = []string{"10.0.1.2"}
+
+	workerNodes := make([]string, 0, len(nodes)-1)
+	workerLabelNodes := make([]string, 0, len(nodes)-2)
+
+	for i := 1; i < len(nodes); i++ {
+		internalIP := fmt.Sprintf("10.0.1.%d", i+1)
+		workerNodes = append(workerNodes, internalIP)
+		if i > 1 {
+			
workerLabelNodes = append(workerLabelNodes, internalIP) + } + } + + setup.NodeSetup.WorkerNode = workerNodes + setup.NodeLabel["loader-nodetype=master"] = []string{"10.0.1.1"} + setup.NodeLabel["loader-nodetype=monitoring"] = []string{"10.0.1.2"} + setup.NodeLabel["loader-nodetype=worker"] = workerLabelNodes + + configData, err := json.MarshalIndent(setup, "", " ") + if err != nil { + return "", err + } + + configPath := filepath.Join(configDir, tempConfigName) + if err := os.WriteFile(configPath, append(configData, '\n'), 0o644); err != nil { + return "", err + } + + return tempConfigName, nil +} + +func isValidConfigDir(dir string) bool { + if _, err := os.Stat(filepath.Join(dir, "setup.json")); err != nil { + return false + } + if _, err := os.Stat(filepath.Join(dir, "prometheus", "prom_config.json")); err != nil { + return false + } + return true +} diff --git a/scripts/setup/utils/utils_test.go b/scripts/setup/utils/utils_test.go new file mode 100644 index 000000000..0a4b74b77 --- /dev/null +++ b/scripts/setup/utils/utils_test.go @@ -0,0 +1,79 @@ +package utils + +import ( + "os" + "path/filepath" + "reflect" + "testing" + + "github.com/vhive-serverless/loader/scripts/setup/configs" +) + +func TestCreateTempNodeSetupPreservesNodeSetupJSONContract(t *testing.T) { + t.Parallel() + + configDir := t.TempDir() + nodes := []string{ + "master@example.com", + "loader@example.com", + "worker1@example.com", + "worker2@example.com", + } + + configName, err := CreateTempNodeSetup(configDir, nodes) + if err != nil { + t.Fatalf("CreateTempNodeSetup returned error: %v", err) + } + + if configName != "node_setup_temp.json" { + t.Fatalf("CreateTempNodeSetup returned unexpected file name: %s", configName) + } + + intNodeSetup, extNodeSetup, err := configs.GetNodeSetup(configDir, configName) + if err != nil { + t.Fatalf("GetNodeSetup returned error for generated temp config: %v", err) + } + + if _, err := os.Stat(filepath.Join(configDir, configName)); err != nil { + 
t.Fatalf("generated config file is missing: %v", err) + } + + expectedInternalWorkers := []string{"10.0.1.2", "10.0.1.3", "10.0.1.4"} + if !reflect.DeepEqual(intNodeSetup.NodeSetup.MasterNode, []string{"10.0.1.1"}) { + t.Fatalf("unexpected internal master node: %#v", intNodeSetup.NodeSetup.MasterNode) + } + if !reflect.DeepEqual(intNodeSetup.NodeSetup.LoaderNode, []string{"10.0.1.2"}) { + t.Fatalf("unexpected internal loader node: %#v", intNodeSetup.NodeSetup.LoaderNode) + } + if !reflect.DeepEqual(intNodeSetup.NodeSetup.WorkerNode, expectedInternalWorkers) { + t.Fatalf("unexpected internal worker nodes: %#v", intNodeSetup.NodeSetup.WorkerNode) + } + + if !reflect.DeepEqual(extNodeSetup.NodeSetup.MasterNode, []string{"master@example.com"}) { + t.Fatalf("unexpected external master node: %#v", extNodeSetup.NodeSetup.MasterNode) + } + if !reflect.DeepEqual(extNodeSetup.NodeSetup.LoaderNode, []string{"loader@example.com"}) { + t.Fatalf("unexpected external loader node: %#v", extNodeSetup.NodeSetup.LoaderNode) + } + if !reflect.DeepEqual(extNodeSetup.NodeSetup.WorkerNode, []string{"loader@example.com", "worker1@example.com", "worker2@example.com"}) { + t.Fatalf("unexpected external worker nodes: %#v", extNodeSetup.NodeSetup.WorkerNode) + } + + expectedLabels := map[string][]string{ + "loader-nodetype=master": {"master@example.com"}, + "loader-nodetype=monitoring": {"loader@example.com"}, + "loader-nodetype=worker": {"worker1@example.com", "worker2@example.com"}, + } + if !reflect.DeepEqual(extNodeSetup.NodeLabel, expectedLabels) { + t.Fatalf("unexpected external node labels: %#v", extNodeSetup.NodeLabel) + } +} + +func TestCreateTempNodeSetupRequiresMasterAndLoader(t *testing.T) { + t.Parallel() + + _, err := CreateTempNodeSetup(t.TempDir(), []string{"master@example.com"}) + if err == nil { + t.Fatal("CreateTempNodeSetup returned nil error for incomplete node list") + } +}