diff --git a/.github/workflows/build-api.yml b/.github/workflows/build-api.yml new file mode 100644 index 000000000..7bd5c50a3 --- /dev/null +++ b/.github/workflows/build-api.yml @@ -0,0 +1,53 @@ +name: build-api +on: + push: + paths: + - "src/Components/API/**" + - ".github/workflows/build-api.yml" + workflow_dispatch: +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + env: + REGION: us-central1 + IMAGE_NAME: api + REPO_PATH: echonet + steps: + - uses: actions/checkout@v4 + - name: Set up QEMU (multi-arch) + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Auth to GCP + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} + service_account: ${{ secrets.GCP_SA_EMAIL }} + - name: Configure Docker auth + run: gcloud auth configure-docker $REGION-docker.pkg.dev -q + - name: Configure Docker auth (staging AU registry) + run: gcloud auth configure-docker australia-southeast2-docker.pkg.dev -q + - name: Compute short SHA + id: vars + run: echo "sha_short=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + - name: Build & push (multi-arch) + uses: docker/build-push-action@v5 + with: + context: src/Components/API + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + - name: Trivy scan + uses: aquasecurity/trivy-action@0.20.0 + with: + image-ref: ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT 
}}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest # with: inputs are not shell-expanded, so env values must use expression syntax + exit-code: '0' + severity: 'CRITICAL,HIGH' diff --git a/.github/workflows/build-engine-model.yaml b/.github/workflows/build-engine-model.yaml new file mode 100644 index 000000000..b25a5bcba --- /dev/null +++ b/.github/workflows/build-engine-model.yaml @@ -0,0 +1,49 @@ +name: build-engine-model +on: + push: + paths: + - "src/Components/Engine/**" + - "src/Components/Engine/Dockerfile" + - ".github/workflows/build-engine-model.yaml" + workflow_dispatch: +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + env: + REGION: us-central1 + REPO_PATH: echonet + steps: + - uses: actions/checkout@v4 + - name: Auth to GCP + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} + service_account: ${{ secrets.GCP_SA_EMAIL }} + - name: Configure Docker auth + run: gcloud auth configure-docker $REGION-docker.pkg.dev -q + - name: Build & push engine + run: | + IMAGE_ENGINE="$REGION-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/$REPO_PATH/engine" + GIT_SHA=${GITHUB_SHA::7} + docker build -t $IMAGE_ENGINE:$GIT_SHA -t $IMAGE_ENGINE:latest src/Components/Engine + docker push $IMAGE_ENGINE:$GIT_SHA + docker push $IMAGE_ENGINE:latest + - name: Build & push model-server (if Dockerfile exists) + run: | + if [ -f src/Components/Engine/model-server.Dockerfile ]; then \ + IMAGE_MODEL="$REGION-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/$REPO_PATH/model-server"; \ + GIT_SHA=${GITHUB_SHA::7}; \ + docker build -f src/Components/Engine/model-server.Dockerfile -t $IMAGE_MODEL:$GIT_SHA -t $IMAGE_MODEL:latest src/Components/Engine; \ + docker push $IMAGE_MODEL:$GIT_SHA; \ + docker push $IMAGE_MODEL:latest; \ + else echo "No model-server.Dockerfile present"; fi + - name: Trivy scan engine + uses: aquasecurity/trivy-action@0.20.0 + with: + image-ref: ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/engine:latest # with: inputs are not shell-expanded, so env values must use expression syntax + 
exit-code: '0' + severity: 'CRITICAL,HIGH' \ No newline at end of file diff --git a/.github/workflows/build-hmi.yml b/.github/workflows/build-hmi.yml new file mode 100644 index 000000000..bdbf7ef8b --- /dev/null +++ b/.github/workflows/build-hmi.yml @@ -0,0 +1,49 @@ +name: build-hmi +on: + push: + paths: + - "src/Components/HMI/**" + - ".github/workflows/build-hmi.yml" + workflow_dispatch: +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + env: + REGION: us-central1 + REPO_PATH: echonet + IMAGE_NAME: hmi + steps: + - uses: actions/checkout@v4 + - name: Set up QEMU (multi-arch) + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Auth to GCP + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} + service_account: ${{ secrets.GCP_SA_EMAIL }} + - name: Configure Docker auth (US registry) + run: gcloud auth configure-docker $REGION-docker.pkg.dev -q + - name: Configure Docker auth (AU registry) + run: gcloud auth configure-docker australia-southeast2-docker.pkg.dev -q + - name: Compute short SHA + id: vars + run: echo "sha_short=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + - name: Build & push HMI (multi-arch) + uses: docker/build-push-action@v5 + with: + context: src/Components/HMI + file: src/Components/HMI/HMI.Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + diff --git
a/.github/workflows/build-mqtt.yml b/.github/workflows/build-mqtt.yml new file mode 100644 index 000000000..2728bd0fe --- /dev/null +++ b/.github/workflows/build-mqtt.yml @@ -0,0 +1,49 @@ +name: build-mqtt +on: + push: + paths: + - "src/Components/MQTT-Server/**" + - ".github/workflows/build-mqtt.yml" + workflow_dispatch: +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + env: + REGION: us-central1 + REPO_PATH: echonet + IMAGE_NAME: mqtt + steps: + - uses: actions/checkout@v4 + - name: Set up QEMU (multi-arch) + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Auth to GCP + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} + service_account: ${{ secrets.GCP_SA_EMAIL }} + - name: Configure Docker auth (US registry) + run: gcloud auth configure-docker $REGION-docker.pkg.dev -q + - name: Configure Docker auth (AU registry) + run: gcloud auth configure-docker australia-southeast2-docker.pkg.dev -q + - name: Compute short SHA + id: vars + run: echo "sha_short=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + - name: Build & push MQTT (multi-arch) + uses: docker/build-push-action@v5 + with: + context: src/Components/MQTT-Server + file: src/Components/MQTT-Server/MQTT.Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + diff --git a/.github/workflows/build-simulator.yml 
b/.github/workflows/build-simulator.yml new file mode 100644 index 000000000..55cde076f --- /dev/null +++ b/.github/workflows/build-simulator.yml @@ -0,0 +1,49 @@ +name: build-simulator +on: + push: + paths: + - "src/Components/Simulator/**" + - ".github/workflows/build-simulator.yml" + workflow_dispatch: +jobs: + build: + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + id-token: write + env: + REGION: us-central1 + REPO_PATH: echonet + IMAGE_NAME: simulator + steps: + - uses: actions/checkout@v4 + - name: Set up QEMU (multi-arch) + uses: docker/setup-qemu-action@v3 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Auth to GCP + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} + service_account: ${{ secrets.GCP_SA_EMAIL }} + - name: Configure Docker auth (US registry) + run: gcloud auth configure-docker $REGION-docker.pkg.dev -q + - name: Configure Docker auth (AU registry) + run: gcloud auth configure-docker australia-southeast2-docker.pkg.dev -q + - name: Compute short SHA + id: vars + run: echo "sha_short=${GITHUB_SHA::7}" >> $GITHUB_OUTPUT + - name: Build & push Simulator (multi-arch) + uses: docker/build-push-action@v5 + with: + context: src/Components/Simulator + file: src/Components/Simulator/Simulator.Dockerfile + platforms: linux/amd64,linux/arm64 + push: true + tags: | + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + ${{ env.REGION }}-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:${{ steps.vars.outputs.sha_short }} + australia-southeast2-docker.pkg.dev/${{ secrets.GCP_PROJECT }}/${{ env.REPO_PATH }}/${{ env.IMAGE_NAME }}:latest + diff --git a/.github/workflows/deploy-echonet.yaml 
b/.github/workflows/deploy-echonet.yaml new file mode 100644 index 000000000..b620952d0 --- /dev/null +++ b/.github/workflows/deploy-echonet.yaml @@ -0,0 +1,104 @@ +name: deploy-echonet + +on: + workflow_dispatch: + inputs: + environment: + description: "Target environment (staging|prod)" + required: true + default: "staging" + namespace: + description: "Kubernetes namespace" + required: false + default: "staging" + workflow_run: + workflows: ["build-api", "build-engine-model"] + types: ["completed"] + +jobs: + deploy: + if: | + github.event_name == 'workflow_dispatch' || + (github.event.workflow_run.conclusion == 'success') + runs-on: ubuntu-latest + permissions: + contents: read + id-token: write + env: + REGION: us-central1 + CLUSTER_NAME: echonet-gke + RELEASE: echonet + PROJECT_ID: ${{ secrets.GCP_PROJECT }} + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Auth to GCP + uses: google-github-actions/auth@v2 + with: + workload_identity_provider: ${{ secrets.GCP_WIF_PROVIDER }} + service_account: ${{ secrets.GCP_SA_EMAIL }} + + - name: Get GKE credentials + uses: google-github-actions/get-gke-credentials@v2 + with: + cluster_name: ${{ env.CLUSTER_NAME }} + location: ${{ env.REGION }} + + - name: Set ENV vars + id: setenv + env: + INPUT_ENVIRONMENT: ${{ github.event.inputs.environment }} + INPUT_NAMESPACE: ${{ github.event.inputs.namespace }} + run: | + # User inputs are passed through env (never interpolated into the script text) and validated here, so later steps can safely expand the env_sel/ns outputs. + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + ENV_INPUT="$INPUT_ENVIRONMENT" + NAMESPACE="$INPUT_NAMESPACE" + else + # default to staging on build success auto-run + ENV_INPUT='staging' + NAMESPACE='staging' + fi + case "$ENV_INPUT" in staging|prod) ;; *) echo "Invalid environment: $ENV_INPUT" >&2; exit 1 ;; esac + if [[ ! "$NAMESPACE" =~ ^[a-z0-9]([-a-z0-9]{0,61}[a-z0-9])?$ ]]; then echo "Invalid namespace: $NAMESPACE" >&2; exit 1; fi + echo "env_sel=$ENV_INPUT" >> $GITHUB_OUTPUT + echo "ns=$NAMESPACE" >> $GITHUB_OUTPUT + + - name: Select values file + id: values + run: | + if [ "${{ steps.setenv.outputs.env_sel }}" = "prod" ]; then + echo "file=deploy/helm/echonet/values-prod.yaml" >> $GITHUB_OUTPUT + else + echo "file=deploy/helm/echonet/values-staging.yaml" >> $GITHUB_OUTPUT + fi + + - name: Ensure namespace exists + run: | + kubectl get ns ${{ steps.setenv.outputs.ns }} 
2>/dev/null || kubectl create ns ${{ steps.setenv.outputs.ns }} + + - name: Install Helm + uses: azure/setup-helm@v4 + + - name: Helm upgrade + env: + BUILD_SHA: ${{ github.event.workflow_run.head_sha || github.sha }} + run: | + # On workflow_run events GITHUB_SHA is the default-branch tip, not the commit the build workflow tagged; use the triggering run's head_sha (falls back to github.sha for manual dispatch). + IMAGE_TAG=${BUILD_SHA::7} + helm upgrade --install $RELEASE deploy/helm/echonet \ + -n ${{ steps.setenv.outputs.ns }} \ + -f ${{ steps.values.outputs.file }} \ + --set image.tag=$IMAGE_TAG \ + --set engine.tag=$IMAGE_TAG \ + --set modelServer.tag=$IMAGE_TAG \ + --wait --timeout 10m + + - name: Show deployed images + run: | + kubectl -n ${{ steps.setenv.outputs.ns }} get deploy -o jsonpath='{range .items[*]}{.metadata.name}{" => "}{.spec.template.spec.containers[0].image}{"\n"}{end}' + + - name: Post summary + run: | + echo "Deployment complete to namespace ${{ steps.setenv.outputs.ns }} using values: ${{ steps.values.outputs.file }}" >> $GITHUB_STEP_SUMMARY \ No newline at end of file diff --git a/.gitignore b/.gitignore index 5b4e521d8..763c4f6ec 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,15 @@ src/Components/HMI/ui/node_modules/* # Temporary files from Cloud Storage (added by ptanmay143) .tmp.driveupload *.mp3 + +# Secret & credential files to ignore going forward +*.secret.json +**/echo_config.json +**/db.config.js +*.env.local +secrets/ +**/init-mongo.js.backup +**/mserver-env.yaml +**/engine-env.yaml +**/api-env.yaml +**/hmi-env.yaml diff --git a/SECRET_MANAGEMENT.md b/SECRET_MANAGEMENT.md new file mode 100644 index 000000000..ec95e45fb --- /dev/null +++ b/SECRET_MANAGEMENT.md @@ -0,0 +1,49 @@ +## Secret Management (EchoNet) + +Do not commit real secrets. Use this guide for local dev and deployment. + +### Local Development +1. Create `secrets/mongo.env` (git-ignored): + ```bash + MONGODB_USER=your_user + MONGODB_PASS=your_pass + MONGODB_HOST=localhost + MONGODB_DB=EchoNet + ``` +2. Load it: `export $(grep -v '^#' secrets/mongo.env | xargs)` +3. Run the API normally. + +### Kubernetes +1. 
Create secret from sanitized manifest: + ```bash + kubectl apply -f deploy/helm/echonet/templates/mongo-credentials-secret.yaml -n <namespace> + ``` +2. Or create directly: + ```bash + kubectl create secret generic mongo-credentials \ + --from-literal=MONGODB_USER=xxx \ + --from-literal=MONGODB_PASS=yyy \ + --from-literal=MONGODB_HOST=mongodb \ + --from-literal=MONGODB_DB=EchoNet -n <namespace> + ``` +3. Reference in Deployment env: + ```yaml + envFrom: + - secretRef: + name: mongo-credentials + ``` + +### Rotation +1. Add new Mongo user & password. +2. Update K8s Secret (kubectl apply -f <file> or kubectl create secret ... --dry-run=client -o yaml | kubectl apply -f -). +3. Restart deployments (or rely on rolling update). +4. Remove old user after validation. + +### Auditing +List all secrets in namespace: +```bash +kubectl -n <namespace> get secret +``` + +### DO NOT COMMIT +Real `.env` files, raw passwords, API keys. diff --git a/cloud-deploy-status-checklist.txt b/cloud-deploy-status-checklist.txt new file mode 100644 index 000000000..9ca765693 --- /dev/null +++ b/cloud-deploy-status-checklist.txt @@ -0,0 +1,84 @@ +Cloud Deployment Status Checklist — Project Echo / EchoNet + +Status summary +- Environment: staging +- Overall status: red +- Last deploy time: "N/A" +- Build/commit: +- Demo URL: +- API health URL: + +Health checks +- curl -sfS https://<api-host>/healthz || echo "FAIL" +- curl -sfS https://<api-host>/readyz || echo "FAIL" +- curl -I https://<demo-host> | head -n1 + +CI/CD +- GitHub Actions: + - gh run list -w deploy --branch main -L 1 + - gh run view <run-id> +- GitLab CI: + - Open latest pipeline for default branch; confirm deploy stage green. 
+ +Cloud platform +Kubernetes (EKS/AKS/GKE) +- kubectl config current-context +- kubectl -n <namespace> get ingress,svc,deploy,pods +- kubectl -n <namespace> describe ingress | grep -i host +- kubectl -n <namespace> rollout status deploy/<deployment> +- kubectl -n <namespace> get pods -l app=<app-label> -o wide +- kubectl -n <namespace> logs deploy/<deployment> --tail=200 +- kubectl -n <namespace> get events --sort-by=.lastTimestamp | tail + +AWS ECS +- aws ecs describe-services --cluster <cluster> --services <service> \ + --query 'services[0].{status:status,desired:desiredCount,running:runningCount,deployment:deployments[0].updatedAt}' +- aws logs tail /ecs/<service> --since 1h + +GCP Cloud Run +- gcloud run services describe <service> --region <region> --format='value(status.url)' +- gcloud run services describe <service> --region <region> --format='value(status.latestReadyRevisionName,status.traffic)' +- gcloud run revisions list --service <service> --region <region> + +Azure App Service +- az webapp show -g <resource-group> -n <app-name> --query defaultHostName -o tsv +- az webapp log tail -g <resource-group> -n <app-name> + +Database/migrations +- Prisma: npx prisma migrate status +- Alembic: alembic current; alembic history | tail +- Rails: bundle exec rails db:migrate:status +- Sequelize: npx sequelize db:migrate:status +- Run a test read/write in staging. + +Background jobs/queues +- Kubernetes workers/cronjobs: kubectl -n <namespace> get deploy,cronjob | grep -E "worker|job" +- SQS: aws sqs get-queue-attributes --queue-url <queue-url> --attribute-names ApproximateNumberOfMessages +- Redis/Sidekiq: check queue sizes and retries. + +Observability +- Link primary dashboard (Grafana/Datadog/New Relic): +- Verify last 24h SLOs: error rate, p95 latency, 5xx volume, uptime. +- Check active alerts/monitors status. + +Config & secrets +- kubectl -n <namespace> get deploy <deployment> -o yaml | grep -A2 -E '^\s*env:' +- kubectl -n <namespace> get secret | grep <app-name> +- Confirm required env vars and third-party credentials present. 
+ +Last deploy metadata +- kubectl -n get deploy \ + -o jsonpath='{.spec.template.spec.containers[0].image} {" "}{.metadata.annotations.kubectl\.kubernetes\.io/restartedAt}{"\n"}' +- CI deploy job URL and timestamp: [paste] + +Access for demo +- Viewer account created: [yes/no] +- IP allowlist updated: [yes/no] +- Shared links: [UI], [API], [Dashboard], [Logs] + +Known blockers +- [ ] Pipeline failing +- [ ] Migrations pending/failing +- [ ] Secrets missing/invalid +- [ ] Elevated error rate +- [ ] Other: diff --git a/cloud_deploy_checklist.md b/cloud_deploy_checklist.md new file mode 100644 index 000000000..e69de29bb diff --git a/deploy/helm/echonet/Chart.yaml b/deploy/helm/echonet/Chart.yaml new file mode 100644 index 000000000..d066fae55 --- /dev/null +++ b/deploy/helm/echonet/Chart.yaml @@ -0,0 +1,3 @@ +apiVersion: v2 +name: echonet +version: 0.1.0 diff --git a/deploy/helm/echonet/templates/alerts.yaml b/deploy/helm/echonet/templates/alerts.yaml new file mode 100644 index 000000000..dd9915733 --- /dev/null +++ b/deploy/helm/echonet/templates/alerts.yaml @@ -0,0 +1,29 @@ +{{- if and .Values.monitoring.enabled .Values.monitoring.alerting.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: echonet-basic-alerts + labels: + role: alert-rules +{{- toYaml .Values.monitoring.labels | nindent 4 }} +spec: + groups: + - name: echonet.availability + rules: + - alert: ApiHighErrorRate + expr: sum(rate(http_requests_total{app="api",status=~"5.."}[5m])) / sum(rate(http_requests_total{app="api"}[5m])) > 0.05 + for: 10m + labels: + severity: warning + annotations: + summary: API high 5xx error rate + description: More than 5% of API requests failing over 10m. + - alert: ModelServerDown + expr: up{app="model-server"} == 0 + for: 5m + labels: + severity: critical + annotations: + summary: Model server down + description: Model server target not scraping for 5m. 
+{{- end }} \ No newline at end of file diff --git a/deploy/helm/echonet/templates/api-deployment.yaml b/deploy/helm/echonet/templates/api-deployment.yaml new file mode 100644 index 000000000..d0b941112 --- /dev/null +++ b/deploy/helm/echonet/templates/api-deployment.yaml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: api + labels: + app: api +spec: + replicas: {{ .Values.api.replicas }} + selector: + matchLabels: + app: api + template: + metadata: + labels: + app: api + spec: + containers: + - name: api + image: "{{ .Values.api.image }}" + envFrom: + - secretRef: + name: mongo-credentials + ports: + - containerPort: 9000 + livenessProbe: + httpGet: + path: /health + port: 9000 + initialDelaySeconds: 15 + periodSeconds: 20 + readinessProbe: + httpGet: + path: /health + port: 9000 + initialDelaySeconds: 5 + periodSeconds: 10 + resources: +{{- toYaml .Values.api.resources | nindent 12 }} diff --git a/deploy/helm/echonet/templates/api-service.yaml b/deploy/helm/echonet/templates/api-service.yaml new file mode 100644 index 000000000..cc6884a58 --- /dev/null +++ b/deploy/helm/echonet/templates/api-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: api + labels: + app: api +spec: + selector: + app: api + ports: + - name: http + port: 80 + targetPort: 9000 + type: ClusterIP diff --git a/deploy/helm/echonet/templates/engine-deployment.yaml b/deploy/helm/echonet/templates/engine-deployment.yaml new file mode 100644 index 000000000..e3e26cb58 --- /dev/null +++ b/deploy/helm/echonet/templates/engine-deployment.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: engine + labels: + app: engine +spec: + replicas: {{ .Values.engine.replicas }} + selector: + matchLabels: + app: engine + template: + metadata: + labels: + app: engine + spec: + tolerations: + - key: "gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + containers: + - name: engine + image: "{{ .Values.engine.image }}" + 
env: + - name: MODEL_SERVER_HOST + value: model-server + # Engine currently has no HTTP server; using TCP socket probe placeholder (update when health endpoint added) + readinessProbe: + tcpSocket: + port: 8000 + initialDelaySeconds: 20 + periodSeconds: 20 + livenessProbe: + tcpSocket: + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 30 + resources: +{{- toYaml .Values.engine.resources | nindent 12 }} diff --git a/deploy/helm/echonet/templates/engine-service.yaml b/deploy/helm/echonet/templates/engine-service.yaml new file mode 100644 index 000000000..8ae92d303 --- /dev/null +++ b/deploy/helm/echonet/templates/engine-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: engine + labels: + app: engine +spec: + selector: + app: engine + ports: + - name: http + port: 8000 + targetPort: 8000 + type: ClusterIP diff --git a/deploy/helm/echonet/templates/hmi-deployment.yaml b/deploy/helm/echonet/templates/hmi-deployment.yaml new file mode 100644 index 000000000..6f09352b3 --- /dev/null +++ b/deploy/helm/echonet/templates/hmi-deployment.yaml @@ -0,0 +1,35 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: hmi + labels: + app: hmi +spec: + replicas: {{ .Values.hmi.replicas }} + selector: + matchLabels: + app: hmi + template: + metadata: + labels: + app: hmi + spec: + containers: + - name: hmi + image: "{{ .Values.hmi.image }}" + ports: + - containerPort: 8080 + readinessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 10 + periodSeconds: 15 + livenessProbe: + httpGet: + path: /health + port: 8080 + initialDelaySeconds: 30 + periodSeconds: 30 + resources: +{{- toYaml .Values.hmi.resources | nindent 12 }} \ No newline at end of file diff --git a/deploy/helm/echonet/templates/hmi-service.yaml b/deploy/helm/echonet/templates/hmi-service.yaml new file mode 100644 index 000000000..4e56f4974 --- /dev/null +++ b/deploy/helm/echonet/templates/hmi-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + 
name: hmi + labels: + app: hmi +spec: + selector: + app: hmi + ports: + - port: 8080 + targetPort: 8080 + name: http + type: ClusterIP \ No newline at end of file diff --git a/deploy/helm/echonet/templates/ingress.yaml b/deploy/helm/echonet/templates/ingress.yaml new file mode 100644 index 000000000..30f52f40f --- /dev/null +++ b/deploy/helm/echonet/templates/ingress.yaml @@ -0,0 +1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: echonet-ingress + annotations: + kubernetes.io/ingress.class: "gce" # adjust for your ingress controller +spec: + rules: + - host: api.echonet.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: api + port: + number: 80 + - host: {{ .Values.hmi.host }} + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: hmi + port: + number: 8080 diff --git a/deploy/helm/echonet/templates/model-config-configmap.yaml b/deploy/helm/echonet/templates/model-config-configmap.yaml new file mode 100644 index 000000000..9df61b6fe --- /dev/null +++ b/deploy/helm/echonet/templates/model-config-configmap.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: model-config + labels: + app: model-server +data: + models.config: | + model_config_list: { + config: [ +{{- range $m := .Values.model.models }} + { name: "{{$m.name}}", base_path: "/models/{{$m.name}}", model_platform: "tensorflow", model_version_policy: { all { } } }, +{{- end }} + ] + } diff --git a/deploy/helm/echonet/templates/model-server-deployment.yaml b/deploy/helm/echonet/templates/model-server-deployment.yaml new file mode 100644 index 000000000..968df68ce --- /dev/null +++ b/deploy/helm/echonet/templates/model-server-deployment.yaml @@ -0,0 +1,68 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: model-server + labels: + app: model-server +spec: + replicas: {{ .Values.modelServer.replicas }} + selector: + matchLabels: + app: model-server + template: + metadata: + labels: + app: 
model-server + spec: + serviceAccountName: model-server-sa + tolerations: + - key: "gpu" + operator: "Equal" + value: "true" + effect: "NoSchedule" + initContainers: + - name: fetch-models + image: gcr.io/google.com/cloudsdktool/cloud-sdk:slim + command: ["/bin/sh","-c"] + args: + - >- + for m in {{ range $m := .Values.model.models }} {{ $m.name }} {{ end }}; do + mkdir -p /models/$m && gsutil -m rsync -r gs://{{ .Values.model.bucket }}/$m /models/$m || exit 1; + done; + volumeMounts: + - name: models + mountPath: /models + containers: + - name: tf-serving + image: "{{ .Values.modelServer.image }}" + args: + - "--model_config_file=/models/models.config" + - "--allow_version_labels_for_unavailable_models=true" + ports: + - containerPort: 8501 + readinessProbe: + httpGet: + path: /v1/models/{{ .Values.modelServer.readiness.modelName }} + port: 8501 + initialDelaySeconds: 10 + periodSeconds: 15 + livenessProbe: + httpGet: + path: /v1/models/{{ .Values.modelServer.readiness.modelName }} + port: 8501 + initialDelaySeconds: 30 + periodSeconds: 30 + resources: +{{- toYaml .Values.modelServer.resources | nindent 12 }} + volumeMounts: + - name: model-config + mountPath: /models/models.config + subPath: models.config + - name: models + mountPath: /models + volumes: + - name: model-config + configMap: + name: model-config + - name: models + emptyDir: {} diff --git a/deploy/helm/echonet/templates/model-server-service.yaml b/deploy/helm/echonet/templates/model-server-service.yaml new file mode 100644 index 000000000..a2f0e4c54 --- /dev/null +++ b/deploy/helm/echonet/templates/model-server-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: model-server + labels: + app: model-server +spec: + selector: + app: model-server + ports: + - name: http + port: 8501 + targetPort: 8501 + type: ClusterIP diff --git a/deploy/helm/echonet/templates/mongo-credentials-secret.yaml b/deploy/helm/echonet/templates/mongo-credentials-secret.yaml new file mode 100644 
index 000000000..e90883241 --- /dev/null +++ b/deploy/helm/echonet/templates/mongo-credentials-secret.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Secret +metadata: + name: mongo-credentials + labels: + app: echonet +stringData: + MONGODB_USER: "" + MONGODB_PASS: "" + MONGODB_HOST: "" + MONGODB_URI: "" \ No newline at end of file diff --git a/deploy/helm/echonet/templates/mqtt-deployment.yaml b/deploy/helm/echonet/templates/mqtt-deployment.yaml new file mode 100644 index 000000000..45e3be50f --- /dev/null +++ b/deploy/helm/echonet/templates/mqtt-deployment.yaml @@ -0,0 +1,35 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mqtt + labels: + app: mqtt +spec: + replicas: {{ .Values.mqtt.replicas }} + selector: + matchLabels: + app: mqtt + template: + metadata: + labels: + app: mqtt + spec: + containers: + - name: mqtt + image: "{{ .Values.mqtt.image }}" + ports: + - containerPort: 1883 + - containerPort: 7001 + - containerPort: 7070 + livenessProbe: + tcpSocket: + port: 1883 + initialDelaySeconds: 20 + periodSeconds: 30 + readinessProbe: + tcpSocket: + port: 1883 + initialDelaySeconds: 10 + periodSeconds: 20 + resources: +{{- toYaml .Values.mqtt.resources | nindent 12 }} \ No newline at end of file diff --git a/deploy/helm/echonet/templates/mqtt-service.yaml b/deploy/helm/echonet/templates/mqtt-service.yaml new file mode 100644 index 000000000..5286a2e04 --- /dev/null +++ b/deploy/helm/echonet/templates/mqtt-service.yaml @@ -0,0 +1,20 @@ +apiVersion: v1 +kind: Service +metadata: + name: mqtt + labels: + app: mqtt +spec: + type: {{ .Values.mqtt.service.type }} + selector: + app: mqtt + ports: + - name: mqtt + port: 1883 + targetPort: 1883 + - name: ws + port: 7001 + targetPort: 7001 + - name: admin + port: 7070 + targetPort: 7070 \ No newline at end of file diff --git a/deploy/helm/echonet/templates/namespace-staging.yaml b/deploy/helm/echonet/templates/namespace-staging.yaml new file mode 100644 index 000000000..02fc0d480 --- /dev/null +++ 
b/deploy/helm/echonet/templates/namespace-staging.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: echonet-staging + labels: + env: staging + app: echonet \ No newline at end of file diff --git a/deploy/helm/echonet/templates/networkpolicies.yaml b/deploy/helm/echonet/templates/networkpolicies.yaml new file mode 100644 index 000000000..ea161aecf --- /dev/null +++ b/deploy/helm/echonet/templates/networkpolicies.yaml @@ -0,0 +1,79 @@ +{{- if .Values.networkPolicies.enabled }} +# Default deny all ingress within namespace +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: default-deny-ingress +spec: + podSelector: {} + policyTypes: ["Ingress"] +--- +# Allow ingress controller to reach API & HMI +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-ingress-controller-web +spec: + podSelector: + matchExpressions: + - key: app + operator: In + values: ["api", "hmi"] + ingress: + - from: + - namespaceSelector: + matchLabels: + app.kubernetes.io/instance: {{ .Values.networkPolicies.ingressNamespaceLabelForIngress }} + ports: + - port: 9000 + - port: 8080 + policyTypes: ["Ingress"] +--- +# Allow API to talk to model-server & engine +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-api-to-model-engine +spec: + podSelector: + matchLabels: + app: model-server + ingress: + - from: + - podSelector: + matchLabels: + app: api + policyTypes: ["Ingress"] +--- +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-engine-from-api +spec: + podSelector: + matchLabels: + app: engine + ingress: + - from: + - podSelector: + matchLabels: + app: api + policyTypes: ["Ingress"] +--- +# Allow API to reach Mongo/Redis if they were internal (placeholder selectors) +# Adjust selectors if deploying those inside cluster +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-db-from-api +spec: + podSelector: + matchLabels: + app: mongo + ingress: + - 
from: + - podSelector: + matchLabels: + app: api + policyTypes: ["Ingress"] +{{- end }} \ No newline at end of file diff --git a/deploy/helm/echonet/templates/serviceaccount-model.yaml b/deploy/helm/echonet/templates/serviceaccount-model.yaml new file mode 100644 index 000000000..fc20a68fc --- /dev/null +++ b/deploy/helm/echonet/templates/serviceaccount-model.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: model-server-sa +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: api-sa +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: engine-sa \ No newline at end of file diff --git a/deploy/helm/echonet/templates/servicemonitors.yaml b/deploy/helm/echonet/templates/servicemonitors.yaml new file mode 100644 index 000000000..0ecb42663 --- /dev/null +++ b/deploy/helm/echonet/templates/servicemonitors.yaml @@ -0,0 +1,29 @@ +{{- if .Values.monitoring.enabled }} +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: api-monitor + labels: +{{- toYaml .Values.monitoring.labels | nindent 4 }} +spec: + selector: + matchLabels: + app: api + endpoints: + - port: http + interval: {{ .Values.monitoring.scrapeInterval }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: model-server-monitor + labels: +{{- toYaml .Values.monitoring.labels | nindent 4 }} +spec: + selector: + matchLabels: + app: model-server + endpoints: + - port: http + interval: {{ .Values.monitoring.scrapeInterval }} +{{- end }} \ No newline at end of file diff --git a/deploy/helm/echonet/templates/simulator-deployment.yaml b/deploy/helm/echonet/templates/simulator-deployment.yaml new file mode 100644 index 000000000..7484fef60 --- /dev/null +++ b/deploy/helm/echonet/templates/simulator-deployment.yaml @@ -0,0 +1,24 @@ +{{- if .Values.simulator.enabled }} +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simulator + labels: + app: simulator +spec: + replicas: {{ .Values.simulator.replicas }} + 
selector:
+    matchLabels:
+      app: simulator
+  template:
+    metadata:
+      labels:
+        app: simulator
+    spec:
+      containers:
+        - name: simulator
+          image: {{ .Values.simulator.image }}
+          args: ["python","system_manager.py"]
+          resources:
+{{- toYaml .Values.simulator.resources | nindent 12 }}
+{{- end }}
\ No newline at end of file
diff --git a/deploy/helm/echonet/templates/simulator-service.yaml b/deploy/helm/echonet/templates/simulator-service.yaml
new file mode 100644
index 000000000..299b222ce
--- /dev/null
+++ b/deploy/helm/echonet/templates/simulator-service.yaml
@@ -0,0 +1,15 @@
+{{- if .Values.simulator.enabled }}
+apiVersion: v1
+kind: Service
+metadata:
+  name: simulator
+  labels:
+    app: simulator
+spec:
+  selector:
+    app: simulator
+  ports:
+    - port: 80
+      targetPort: 8080
+  type: ClusterIP
+{{- end }}
\ No newline at end of file
diff --git a/deploy/helm/echonet/values-prod.yaml b/deploy/helm/echonet/values-prod.yaml
new file mode 100644
index 000000000..dc7d3345c
--- /dev/null
+++ b/deploy/helm/echonet/values-prod.yaml
@@ -0,0 +1,77 @@
+# Production values for the echonet chart.
+image:
+  registry: us-central1-docker.pkg.dev/echonet-prod/echonet
+  tag: latest
+api:
+  replicas: 3
+  resources:
+    requests:
+      cpu: 300m
+      memory: 384Mi
+    limits:
+      cpu: 800m
+      memory: 768Mi
+  mongoUriSecret: api-mongo-uri
+engine:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/engine:latest
+  replicas: 2
+  resources:
+    requests:
+      cpu: 750m
+      memory: 1Gi
+    limits:
+      cpu: 1500m
+      memory: 2Gi
+modelServer:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/model-server:latest
+  replicas: 2
+  resources:
+    requests:
+      cpu: 750m
+      memory: 2Gi
+    limits:
+      cpu: 1500m
+      memory: 4Gi
+  readiness:
+    modelName: echo_model
+model:
+  bucket: echonet-models-prod
+  models:
+    - name: echo_model
+      path: echo_model/1
+    - name: weather_model
+      path: weather_model/1
+hmi:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/hmi:latest
+  replicas: 3
+  host: app.echonet.example.com
+  resources:
+    requests:
+      cpu: 150m
+      memory: 192Mi
+    limits:
+      cpu: 400m
+      memory: 384Mi
+mqtt:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/mqtt:latest
+  replicas: 2
+  service:
+    type: ClusterIP
+  resources:
+    requests:
+      cpu: 80m
+      memory: 96Mi
+    limits:
+      cpu: 250m
+      memory: 160Mi
+simulator:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/simulator:latest
+  replicas: 2
+  resources:
+    requests:
+      cpu: 80m
+      memory: 128Mi
+    limits:
+      cpu: 250m
+      memory: 256Mi
+  enabled: true
\ No newline at end of file
diff --git a/deploy/helm/echonet/values-staging.yaml b/deploy/helm/echonet/values-staging.yaml
new file mode 100644
index 000000000..66606c338
--- /dev/null
+++ b/deploy/helm/echonet/values-staging.yaml
@@ -0,0 +1,79 @@
+# Staging values for the echonet chart (images come from the australia-southeast2 registry).
+image:
+  registry: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet
+  tag: latest
+api:
+  image: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/api:latest
+
+  replicas: 1
+  resources:
+    requests:
+      cpu: 100m
+      memory: 128Mi
+    limits:
+      cpu: 300m
+      memory: 256Mi
+  mongoUriSecret: api-mongo-uri
+engine:
+  image: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/engine:latest
+  replicas: 1
+  resources:
+    requests:
+      cpu: 250m
+      memory: 256Mi
+    limits:
+      cpu: 500m
+      memory: 512Mi
+modelServer:
+  image: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/model-server:latest
+  replicas: 1
+  resources:
+    requests:
+      cpu: 250m
+      memory: 512Mi
+    limits:
+      cpu: 500m
+      memory: 1Gi
+  readiness:
+    modelName: echo_model
+model:
+  bucket: echonet-models-stg
+  models:
+    - name: echo_model
+      path: echo_model/1
+    - name: weather_model
+      path: weather_model/1
+hmi:
+  image: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/hmi:latest
+  replicas: 1
+  host: app-stg.echonet.example.com
+  resources:
+    requests:
+      cpu: 50m
+      memory: 96Mi
+    limits:
+      cpu: 200m
+      memory: 192Mi
+mqtt:
+  image: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/mqtt:latest
+  replicas: 1
+  service:
+    type: ClusterIP
+  resources:
+    requests:
+      cpu: 25m
+      memory: 48Mi
+    limits:
+      cpu: 100m
+      memory: 96Mi
+simulator:
+  image: australia-southeast2-docker.pkg.dev/sit-23t1-project-echo-25288b9/echonet/simulator:latest
+  replicas: 1
+  resources:
+    requests:
+      cpu: 25m
+      memory: 96Mi
+    limits:
+      cpu: 100m
+      memory: 192Mi
+  enabled: true
\ No newline at end of file
diff --git a/deploy/helm/echonet/values.yaml b/deploy/helm/echonet/values.yaml
new file mode 100644
index 000000000..44525fea8
--- /dev/null
+++ b/deploy/helm/echonet/values.yaml
@@ -0,0 +1,82 @@
+# Default values for the echonet chart.
+image:
+  registry: us-central1-docker.pkg.dev/echonet-prod/echonet
+  tag: latest
+api:
+  replicas: 2
+  resources:
+    requests:
+      cpu: 200m
+      memory: 256Mi
+    limits:
+      cpu: 500m
+      memory: 512Mi
+  mongoUriSecret: api-mongo-uri
+engine:
+  image: engine
+  tag: latest
+  replicas: 1
+  resources:
+    requests:
+      cpu: 500m
+      memory: 512Mi
+    limits:
+      cpu: 1
+      memory: 1Gi
+modelServer:
+  image: model-server
+  tag: latest
+  replicas: 1
+  resources:
+    requests:
+      cpu: 500m
+      memory: 1Gi
+    limits:
+      cpu: 1
+      memory: 2Gi
+  readiness:
+    modelName: echo_model
+hmi:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/hmi:latest
+  replicas: 2
+  resources:
+    requests:
+      cpu: 100m
+      memory: 128Mi
+    limits:
+      cpu: 300m
+      memory: 256Mi
+  host: app.echonet.example.com
+mqtt:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/mqtt:latest
+  replicas: 1
+  service:
+    type: ClusterIP # change to LoadBalancer if external clients connect
+  resources:
+    requests:
+      cpu: 50m
+      memory: 64Mi
+    limits:
+      cpu: 200m
+      memory: 128Mi
+simulator:
+  image: us-central1-docker.pkg.dev/echonet-prod/echonet/simulator:latest
+  replicas: 1
+  resources:
+    requests:
+      cpu: 50m
+      memory: 128Mi
+    limits:
+      cpu: 200m
+      memory: 256Mi
+  enabled: true
+monitoring:
+  enabled: true
+  labels:
+    release: prometheus-stack
+  scrapeInterval: 30s
+  alerting:
+    enabled: true
+networkPolicies:
+  enabled: true
+  ingressNamespaceLabelForIngress:
ingress-nginx # adjust if using different ingress controller namespace
diff --git a/infra/params/echonet-params.sample.txt b/infra/params/echonet-params.sample.txt
new file mode 100644
index 000000000..499cdfa66
--- /dev/null
+++ b/infra/params/echonet-params.sample.txt
@@ -0,0 +1,25 @@
+project_id_prod=echonet-prod
+project_id_stg=echonet-stg
+region=us-central1
+zones=[us-central1-a,us-central1-b,us-central1-c]
+domain_root=echonet.example.com
+api_domain=api.echonet.example.com
+app_domain=app.echonet.example.com
+gpu_type=nvidia-l4
+gpu_pool_node_type=g2-standard-4
+general_pool_node_type=e2-standard-4
+mongo_atlas_tier=M20
+mongo_db_name=EchoNet
+redis_required=yes
+model_bucket_stg=gs://echonet-models-stg
+model_bucket_prod=gs://echonet-models-prod
+models=[
+ {name=echo_model,est_size_mb=500,versioning_scheme=integer}
+]
+secrets=[MONGODB_URI,TWILIO_ACCOUNT_SID,TWILIO_AUTH_TOKEN,REDIS_URI]
+image_registry_location=us-central1-docker.pkg.dev/echonet-prod/echonet
+base_python_version=3.11
+base_node_version=20
+latency_target_p95_ms=400
+initial_api_qps=50
+budget_monthly_usd=1500
diff --git a/infra/terraform/main.tf b/infra/terraform/main.tf
new file mode 100644
index 000000000..7f18bb962
--- /dev/null
+++ b/infra/terraform/main.tf
@@ -0,0 +1,156 @@
+terraform {
+  required_version = ">= 1.6.0"
+  required_providers {
+    google = {
+      source  = "hashicorp/google"
+      version = "~> 5.0"
+    }
+  }
+}
+
+provider "google" {
+  project = var.project_id
+  region  = var.region
+}
+
+# Regional GKE cluster. The default node pool is removed; nodes come from
+# the two node pools declared below.
+resource "google_container_cluster" "echonet" {
+  name                     = var.cluster_name
+  location                 = var.region
+  remove_default_node_pool = true
+  initial_node_count       = 1
+  networking_mode          = "VPC_NATIVE"
+
+  workload_identity_config {
+    workload_pool = "${var.project_id}.svc.id.goog"
+  }
+
+  release_channel {
+    channel = "REGULAR"
+  }
+}
+
+# General-purpose node pool (nodes labelled role=general).
+resource "google_container_node_pool" "general" {
+  name               = "general-pool"
+  cluster            = google_container_cluster.echonet.name
+  location           = var.region
+  initial_node_count = 2
+
+  node_config {
+    machine_type = "e2-standard-4"
+    oauth_scopes = ["https://www.googleapis.com/auth/cloud-platform"]
+    labels       = { role = "general" }
+  }
+
+  autoscaling {
+    min_node_count = 1
+    max_node_count = 4
+  }
+}
+
+# GPU node pool; tainted gpu=true:NO_SCHEDULE so only tolerating pods land here.
+resource "google_container_node_pool" "gpu" {
+  name               = "gpu-pool"
+  cluster            = google_container_cluster.echonet.name
+  location           = var.region
+  initial_node_count = 1
+
+  node_config {
+    machine_type = "g2-standard-4"
+    labels       = { role = "gpu" }
+
+    guest_accelerator {
+      type  = "nvidia-l4"
+      count = 1
+    }
+
+    # The google provider models node taints as repeatable `taint` blocks.
+    taint {
+      key    = "gpu"
+      value  = "true"
+      effect = "NO_SCHEDULE"
+    }
+  }
+
+  autoscaling {
+    min_node_count = 0
+    max_node_count = 3
+  }
+}
+
+resource "google_artifact_registry_repository" "echonet" {
+  location      = var.region
+  repository_id = var.artifact_repo_name
+  description   = "EchoNet container images"
+  format        = "DOCKER"
+}
+
+resource "google_storage_bucket" "model_stg" {
+  name                        = replace(var.model_bucket_stg, "gs://", "")
+  location                    = var.region
+  force_destroy               = false
+  uniform_bucket_level_access = true
+  labels                      = { env = "staging", app = "echonet" }
+
+  versioning {
+    enabled = true
+  }
+
+  lifecycle_rule {
+    action {
+      type = "Delete"
+    }
+    condition {
+      age = 120
+    }
+  }
+}
+
+resource "google_storage_bucket" "model_prod" {
+  name                        = replace(var.model_bucket_prod, "gs://", "")
+  location                    = var.region
+  force_destroy               = false
+  uniform_bucket_level_access = true
+  labels                      = { env = "prod", app = "echonet" }
+
+  versioning {
+    enabled = true
+  }
+
+  lifecycle_rule {
+    action {
+      type = "Delete"
+    }
+    condition {
+      age = 365
+    }
+  }
+}
+
+# Service accounts for workload identity
+resource "google_service_account" "api" {
+  account_id   = var.api_sa_id
+  display_name = "API Workload"
+}
+
+resource "google_service_account" "engine" {
+  account_id   = var.engine_sa_id
+  display_name = "Engine Workload"
+}
+
+resource "google_service_account" "model" {
+  account_id   = var.model_sa_id
+  display_name = "Model Server Workload"
+}
+
+# Secret Manager secrets (placeholders)
+resource "google_secret_manager_secret" "secrets" {
+  for_each  = toset(var.secrets)
+  secret_id = each.key
+
+  # google provider 5.x replaced `automatic = true` with an `auto {}` block.
+  replication {
+    auto {}
+  }
+}
diff --git a/infra/terraform/variables.tf b/infra/terraform/variables.tf
new file mode 100644
index 000000000..e9bbf497f
--- /dev/null
+++ b/infra/terraform/variables.tf
@@ -0,0 +1,19 @@
+variable "project_id" {}
+variable "region" {}
+variable "zones" { type = list(string) }
+variable "cluster_name" { default = "echonet-gke" }
+# Not referenced by main.tf; defaulted so plan/apply does not prompt for it.
+variable "model_buckets" {
+  type    = list(string)
+  default = []
+}
+variable "artifact_repo_name" { default = "echonet" }
+variable "model_bucket_stg" {}
+variable "model_bucket_prod" {}
+variable "api_sa_id" { default = "api-sa" }
+variable "engine_sa_id" { default = "engine-sa" }
+variable "model_sa_id" { default = "model-sa" }
+variable "secrets" {
+  type    = list(string)
+  default = ["mongo-uri", "twilio-auth-token"]
+}
diff --git a/ns.json b/ns.json
new file mode 100644
index 000000000..e69de29bb
diff --git a/scripts/gcp-echo-discover.sh b/scripts/gcp-echo-discover.sh
new file mode 100755
index 000000000..61ac809cf
--- /dev/null
+++ b/scripts/gcp-echo-discover.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# gcp-echo-discover.sh — discover Echo/EchoNet endpoints and status on GCP
+# Usage: bash scripts/gcp-echo-discover.sh [namespace-regex] [name-regex]
+# Defaults search for "echo|echonet" across GKE and Cloud Run.
+
+NS_RE="${1:-.*}"
+NAME_RE="${2:-echo|echonet}"
+PROJECT="$(gcloud config get-value project 2>/dev/null)"
+TS() { date +"%Y-%m-%dT%H:%M:%S%z"; }
+
+printf "[%s] Project: %s\n" "$(TS)" "${PROJECT}"
+
+found_any=false
+
+# ---- Cloud Run ---- (loops read from <(...) so found_any is set in this shell, not a pipeline subshell)
+if gcloud services list --enabled --format='value(config.name)' | grep -q '^run.googleapis.com$'; then
+  printf "\n[%s] Checking Cloud Run services...\n" "$(TS)"
+  while read -r REGION; do
+    [ -z "$REGION" ] && continue
+    SRV=$(gcloud run services list --region "$REGION" --format='value(name, status.url)' || true)
+    if [[ -n "$SRV" ]]; then
+      while IFS=$'\t' read -r NAME URL; do
+        found_any=true
+        printf "Cloud Run | region=%s | service=%s | url=%s\n" "$REGION" "$NAME" "$URL"
+        # Health guesses: probe common paths, stop at the first HTTP 200
+        for path in /healthz /health /readyz /; do
+          code=$(curl -sk -o /dev/null -w '%{http_code}' "${URL%/}${path}") || code=000
+          printf " -> %s : HTTP %s\n" "$path" "$code"
+          [ "$code" = "200" ] && break
+        done
+      done < <(echo "$SRV" | awk -v re="$NAME_RE" '$1 ~ re {print $1 "\t" $2}')
+    fi
+  done < <(gcloud run regions list --format='value(name)')
+else
+  printf "\n[%s] Cloud Run API not enabled or no access.\n" "$(TS)"
+fi
+
+# ---- GKE ---- (kubectl listings feed loops via <(...); a failing listing yields no rows instead of aborting under pipefail)
+if gcloud services list --enabled --format='value(config.name)' | grep -q '^container.googleapis.com$'; then
+  printf "\n[%s] Checking GKE clusters...\n" "$(TS)"
+  while read -r CLUSTER LOCATION; do
+    [ -z "$CLUSTER" ] && continue
+    printf "Cluster: %s (%s)\n" "$CLUSTER" "$LOCATION"
+    gcloud container clusters get-credentials "$CLUSTER" --region "$LOCATION" >/dev/null 2>&1 || \
+      gcloud container clusters get-credentials "$CLUSTER" --zone "$LOCATION" >/dev/null 2>&1 || true
+
+    # Namespaces matching pattern
+    while read -r ns; do
+      # Deployments with name filter
+      while read -r app; do
+        found_any=true
+        img=$(kubectl -n "$ns" get deploy "$app" -o jsonpath='{.spec.template.spec.containers[0].image}' 2>/dev/null || echo -n)
+        avail=$(kubectl -n "$ns" get deploy "$app" -o jsonpath='{.status.conditions[?(@.type=="Available")].lastUpdateTime}' 2>/dev/null || echo -n)
+        printf "GKE | ns=%s | app=%s | image=%s | availableAt=%s\n" "$ns" "$app" "$img" "$avail"
+      done < <(kubectl -n "$ns" get deploy --no-headers 2>/dev/null | awk -v re="$NAME_RE" '$1 ~ re {print $1}')
+
+      # Ingress hosts
+      while read -r ing; do
+        host=$(kubectl -n "$ns" get ingress "$ing" -o jsonpath='{.spec.rules[*].host}' 2>/dev/null || echo -n)
+        [ -z "$host" ] && continue
+        printf " Ingress: %s | host=%s\n" "$ing" "$host"
+        for path in /healthz /health /readyz /; do
+          code=$(curl -sk -o /dev/null -w '%{http_code}' "https://${host%/}${path}") || code=000
+          printf " -> %s : HTTP %s\n" "$path" "$code"
+          [ "$code" = "200" ] && break
+        done
+      done < <(kubectl -n "$ns" get ingress --no-headers 2>/dev/null | awk '{print $1}')
+
+      # LoadBalancer services (TYPE is column 2 of namespaced `kubectl get svc --no-headers`)
+      while read -r svc; do
+        ip=$(kubectl -n "$ns" get svc "$svc" -o jsonpath='{.status.loadBalancer.ingress[0].ip}' 2>/dev/null || echo -n)
+        host=$(kubectl -n "$ns" get svc "$svc" -o jsonpath='{.status.loadBalancer.ingress[0].hostname}' 2>/dev/null || echo -n)
+        target="${host:-$ip}"
+        [ -z "$target" ] && continue
+        printf " SVC: %s | external=%s\n" "$svc" "$target"
+        for path in /healthz /health /readyz /; do
+          code=$(curl -sk -o /dev/null -w '%{http_code}' "http://${target%/}${path}") || code=000
+          printf " -> %s : HTTP %s\n" "$path" "$code"
+          [ "$code" = "200" ] && break
+        done
+      done < <(kubectl -n "$ns" get svc --no-headers 2>/dev/null | awk '$2=="LoadBalancer" {print $1}')
+    done < <(kubectl get ns --no-headers 2>/dev/null | awk -v re="$NS_RE" '$1 ~ re {print $1}')
+  done < <(gcloud container clusters list --format='value(name,location)')
+else
+  printf "\n[%s] GKE API not enabled or no access.\n" "$(TS)"
+fi
+
+# ---- Fallback: load balancers / forwarding rules ----
+printf "\n[%s] Checking external forwarding rules (HTTP(S))...\n" "$(TS)"
+gcloud compute forwarding-rules list --filter='target~https|http' --format='table(name, IPAddress, target)' || true
+
+if ! $found_any; then
+  printf "\nNo Echo/EchoNet endpoints found yet. Try adjusting search: bash scripts/gcp-echo-discover.sh '.*' 'your-app-name'\n"
+fi
diff --git a/src/Components/API/.dockerignore b/src/Components/API/.dockerignore
new file mode 100644
index 000000000..dca256702
--- /dev/null
+++ b/src/Components/API/.dockerignore
@@ -0,0 +1,12 @@
+.git
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.env
+.env.*
+venv/
+coverage/
+.echo_config.json
+echo_config.json
+tests/
diff --git a/src/Components/API/Dockerfile b/src/Components/API/Dockerfile
index 6b5bbfd42..f6471b5b4 100644
--- a/src/Components/API/Dockerfile
+++ b/src/Components/API/Dockerfile
@@ -1,21 +1,14 @@
-# Use an official Python runtime as a parent image
 FROM python:3.11-slim
-
-# Set the working directory
+ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1
 WORKDIR /app
-
-# Install system dependencies
-RUN apt-get update && apt-get install -y build-essential gcc
-
-# Copy requirements and install
+RUN apt-get update && apt-get install -y --no-install-recommends build-essential gcc \
+    && rm -rf /var/lib/apt/lists/*
 COPY requirements.txt .
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Copy the full app into the container
+RUN pip install --no-cache-dir -r requirements.txt \
+    && apt-get purge -y build-essential gcc && apt-get autoremove -y && rm -rf /var/lib/apt/lists/*
 COPY . .
-
-# Expose port for the app
-EXPOSE 8080
-
-# Command to run the app using uvicorn
-CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8080"]
+RUN rm -f app/echo_config.json .env || true
+RUN useradd -m appuser && chown -R appuser:appuser /app
+USER appuser
+EXPOSE 9000
+CMD ["uvicorn","app.main:app","--host","0.0.0.0","--port","9000"]
diff --git a/src/Components/API/app/echo_config.json b/src/Components/API/app/echo_config.json
index 571b5bcda..9a3a08ff9 100644
--- a/src/Components/API/app/echo_config.json
+++ b/src/Components/API/app/echo_config.json
@@ -1,5 +1,5 @@
 {
-  "DB_USERNAME": "modelUser",
-  "DB_PASSWORD": "EchoNetAccess2023",
-  "DB_HOSTNAME": "ts-mongodb-cont"
+  "DB_USERNAME": "",
+  "DB_PASSWORD": "",
+  "DB_HOSTNAME": ""
 }
diff --git a/src/Components/API/app/main.py b/src/Components/API/app/main.py
index c6bdfbaf4..6d1cea4e1 100644
--- a/src/Components/API/app/main.py
+++ b/src/Components/API/app/main.py
@@ -34,7 +34,7 @@
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["http://localhost:8080"], # 可根据实际需求配置
+    allow_origins=["http://localhost:8080"],)
 
 from app.routers import hmi, engine, sim, iot
@@ -94,4 +94,129 @@
 app.include_router(species_predictor.router, tags=["predict"])
-import os # ✅ Ensure this is already imported at the top
+
+# ✅ Root endpoint
+@app.get("/", response_description="API Root")
+def show_home():
+    """Return the plain-text welcome message served at the API root."""
+    return 'Welcome to echo api, move to /docs for more'
+
+app.include_router(auth_router.router, tags=["auth"], prefix="/api")
+
+# ✅ /openapi-export - fetch live OpenAPI spec
+@app.get("/openapi-export", include_in_schema=False)
+async def get_openapi_spec():
+    """
+    Returns the current OpenAPI spec generated by FastAPI.
+    Used for downloading and converting to YAML.
+    """
+    return app.openapi()
+
+# ✅ /spec/summary - for OpenAPI spec verification/debug
+@app.get("/spec/summary", tags=["debug"], include_in_schema=False)
+async def get_spec_summary():
+    """
+    Returns a summary of the OpenAPI spec for deployment verification.
+    """
+    spec = app.openapi()
+    return {
+        "title": spec.get("info", {}).get("title"),
+        "version": spec.get("info", {}).get("version"),
+        "number_of_paths": len(spec.get("paths", {})),
+        "tags": [tag.get("name") for tag in spec.get("tags", []) if "name" in tag]
+    }
+
+# ✅ Save OpenAPI spec to file when app starts
+def export_openapi_to_file():
+    """
+    Saves the OpenAPI spec to a file on startup.
+    Creates the 'backend' folder if it doesn't exist.
+    """
+    output_dir = "backend"
+    os.makedirs(output_dir, exist_ok=True) # creates the folder if it doesn't exist
+    output_path = os.path.join(output_dir, "project-echo-openapi.json")
+
+    with open(output_path, "w") as f:
+        json.dump(app.openapi(), f, indent=2)
+
+    print(f"✅ OpenAPI spec exported to {output_path}")
+
+export_openapi_to_file()
+
+# Global error handlers
+@app.exception_handler(HTTPException)
+async def http_exception_handler(request: Request, exc: HTTPException):
+    return JSONResponse(status_code=exc.status_code, content={
+        "error": {
+            "type": "http_error",
+            "message": exc.detail if isinstance(exc.detail, str) else str(exc.detail),
+            "status_code": exc.status_code,
+            "path": str(request.url),
+        }
+    })
+
+@app.exception_handler(Exception)
+async def unhandled_exception_handler(request: Request, exc: Exception):
+    return JSONResponse(status_code=500, content={
+        "error": {
+            "type": "server_error",
+            "message": "Internal server error",
+            "status_code": 500,
+            "path": str(request.url),
+        }
+    })
+
+# API versioning alias (v1) — keeps legacy routes while exposing versioned ones
+try:
+    from app.routers import species_predictor, audio_upload_router
+    v1_api = APIRouter(prefix="/api/v1")
+    # Including existing routers under v1 prefix. Note: their internal paths may already include /api/*
+    v1_api.include_router(species_predictor.router, tags=["predict"]) # exposes /api/v1/predict
+    v1_api.include_router(audio_upload_router.router, tags=["audio"]) # exposes /api/v1/api/audio/upload
+    app.include_router(v1_api)
+except Exception:
+    # If routers are not available at import time, skip v1 mounting
+    pass
+
+# --- Health & Readiness Endpoints ---
+from functools import lru_cache
+from time import time
+
+START_TIME = time()
+
+@app.get("/health", include_in_schema=False)
+@app.get("/healthz", include_in_schema=False)
+def health():
+    """Basic liveness endpoint."""
+    return {"status": "ok", "uptime_seconds": int(time() - START_TIME)}
+
+@lru_cache(maxsize=1)
+def _mongo_client():
+    uri = os.getenv("MONGODB_URI") or os.getenv("USER_MONGODB_URI")
+    if not uri:
+        return None
+    try:
+        client = pymongo.MongoClient(uri, serverSelectionTimeoutMS=800)
+        return client
+    except Exception:
+        return None
+
+@app.get("/readyz", include_in_schema=False)
+def ready():
+    """Readiness: validates dependencies (Mongo if configured)."""
+    checks = {}
+    mongo_ok = True
+    client = _mongo_client()
+    if client is not None:
+        try:
+            client.admin.command("ping")
+        except Exception:
+            mongo_ok = False
+    else:
+        # If no URI configured we don't fail readiness—treated as optional.
+        mongo_ok = True
+    checks["mongo"] = mongo_ok
+    overall = all(checks.values())
+    status_code = 200 if overall else 503
+    return JSONResponse(status_code=status_code, content={"ready": overall, "checks": checks})
+
diff --git a/src/Components/API/app/routers/audio_upload_router.py b/src/Components/API/app/routers/audio_upload_router.py
index 33a830052..922142f59 100644
--- a/src/Components/API/app/routers/audio_upload_router.py
+++ b/src/Components/API/app/routers/audio_upload_router.py
@@ -10,17 +10,25 @@
 UPLOAD_DIR = os.path.join(os.path.dirname(__file__), "uploads")
 os.makedirs(UPLOAD_DIR, exist_ok=True)
 
 ALLOWED_EXTENSIONS = {".wav", ".mp3", ".flac"}
 ALLOWED_CONTENT_TYPES = {"audio/wav", "audio/x-wav", "audio/mpeg", "audio/flac", "audio/x-flac"}
 MAX_UPLOAD_BYTES = 30 * 1024 * 1024 # 30 MB
+
+def _validate_upload(filename: str, content_type: Optional[str], size: int):
+    """Raise HTTPException (400/413/422) unless extension, size and Content-Type are acceptable."""
+    ext = os.path.splitext(filename)[1].lower()
+    if ext not in ALLOWED_EXTENSIONS:
+        raise HTTPException(status_code=422, detail="Unsupported file type; allowed: .wav, .mp3, .flac")
+    if size <= 0:
+        raise HTTPException(status_code=400, detail="Empty audio file")
+    if size > MAX_UPLOAD_BYTES:
+        raise HTTPException(status_code=413, detail="File too large (max 30MB)")
+    if content_type and content_type not in ALLOWED_CONTENT_TYPES:
+        raise HTTPException(status_code=422, detail=f"Unsupported Content-Type: {content_type}")
 
 
 @router.post("/audio/upload")
-async def upload_audio(
-    file: UploadFile = File(...),
-    user_id: Optional[str] = Form(None),
-):
-    # Basic validations
+async def upload_audio(file: UploadFile = File(...), user_id: Optional[str] = Form(None)):
     if not file or not file.filename:
         raise HTTPException(status_code=400, detail="No file provided")
 
diff --git a/src/Components/HMI/.dockerignore b/src/Components/HMI/.dockerignore
index bf881194f..a678788cb 100644
--- a/src/Components/HMI/.dockerignore
+++ b/src/Components/HMI/.dockerignore
@@ -1 +1,11 @@
-ui/node_modules/
\ No newline at end of
file
+ui/node_modules/
+node_modules
+dist
+build
+*.log
+.env
+.env.*
+prepros.config
+coverage/
+*.tsbuildinfo
+*.swp
\ No newline at end of file
diff --git a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/api-env.yaml b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/api-env.yaml
index 5a8d7666b..07c3befa8 100644
--- a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/api-env.yaml
+++ b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/api-env.yaml
@@ -3,8 +3,9 @@ kind: ConfigMap
 metadata:
   name: api-env
 data:
-  DB_HOST: value
-  DB_USER: modelUser
-  DB_USER_PASS: EchoNetAccess2023
-  DB_ROOT_USER: root
-  DB_ROOT_USER_PASS: root_password
+  DB_HOST: ""
+  # Credentials live in a Secret; keys stay as explicit empty strings (ConfigMap data values must be strings, not null)
+  DB_USER: ""
+  DB_USER_PASS: ""
+  DB_ROOT_USER: ""
+  DB_ROOT_USER_PASS: ""
diff --git a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/engine-env.yaml b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/engine-env.yaml
index 695065e4c..7dcfea385 100644
--- a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/engine-env.yaml
+++ b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/engine-env.yaml
@@ -3,10 +3,11 @@ kind: ConfigMap
 metadata:
   name: engine-env
 data:
-  DB_USER: modelUser
-  DB_USER_PASS: EchoNetAccess2023
-  DB_ROOT_USER: root
-  DB_ROOT_USER_PASS: root_password
-  REDIS_HOST: value
-  DB_HOST: value
-  API_HOST: value
+  # Credentials now provided via Secret; only non-sensitive defaults here
+  DB_USER: ""
+  DB_USER_PASS: ""
+  DB_ROOT_USER: ""
+  DB_ROOT_USER_PASS: ""
+  REDIS_HOST: redis
+  DB_HOST: ""
+  API_HOST: api
diff --git a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/hmi-env.yaml b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/hmi-env.yaml
index 10de3a09f..81fe4e1be 100644
--- a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/hmi-env.yaml
+++ b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/hmi-env.yaml
@@ -3,10 +3,10 @@ kind: ConfigMap
 metadata:
   name: hmi-env
 data:
-  DB_USER: modelUser
-  DB_USER_PASS: EchoNetAccess2023
-  DB_ROOT_USER: root
-  DB_ROOT_USER_PASS: root_password
-  REDIS_HOST: value
-  DB_HOST: value
-  API_HOST: value
+  DB_USER: ""
+  DB_USER_PASS: ""
+  DB_ROOT_USER: ""
+  DB_ROOT_USER_PASS: ""
+  REDIS_HOST: redis
+  DB_HOST: ""
+  API_HOST: api
diff --git a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/mserver-env.yaml b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/mserver-env.yaml
index 70a9c69f3..c2e3b3936 100644
--- a/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/mserver-env.yaml
+++ b/src/Echo_Components_on_K8s/K8s_configs/configMaps_and_secrets/mserver-env.yaml
@@ -3,8 +3,8 @@ kind: ConfigMap
 metadata:
   name: mserver-env
 data:
-  DB_HOST: value
-  DB_USER: modelUser
-  DB_USER_PASS: EchoNetAccess2023
-  DB_ROOT_USER: root
-  DB_ROOT_USER_PASS: root_password
+  DB_HOST: ""
+  DB_USER: ""
+  DB_USER_PASS: ""
+  DB_ROOT_USER: ""
+  DB_ROOT_USER_PASS: ""
diff --git a/src/Echo_Components_on_K8s/MongoDb/init/init-mongo.js b/src/Echo_Components_on_K8s/MongoDb/init/init-mongo.js
index 2851dc6e2..6c3a38261 100644
--- a/src/Echo_Components_on_K8s/MongoDb/init/init-mongo.js
+++ b/src/Echo_Components_on_K8s/MongoDb/init/init-mongo.js
@@ -1,16 +1,19 @@
 db = db.getSiblingDB("EchoNet");
+// NOTE: Credentials removed from source control. Supply via environment / secret during container start.
+// Secret files read below (do not commit real values): /run/secrets/MONGO_APP_USER, /run/secrets/MONGO_APP_PASS
+// _getEnv uses cat() in the legacy mongo shell and Node's fs in mongosh, which has no cat().
 
 db.createUser({
-  user: "modelUser",
-  pwd: "EchoNetAccess2023",
-  roles: [
-    {
-      role: "readWrite",
-      db: "EchoNet",
-    },
-  ],
+  user: _getEnv("MONGO_APP_USER", "replace_me"),
+  pwd: _getEnv("MONGO_APP_PASS", "replace_me"),
+  roles: [ { role: "readWrite", db: "EchoNet" } ],
 });
 
+function _getEnv(name, fallback){
+  const secretPath = `/run/secrets/${name}`;
+  try { return (typeof cat === "function" ? cat(secretPath) : require("fs").readFileSync(secretPath, "utf8")).trim(); } catch(e) {}
+  return fallback;
+}
 db.createCollection("events");
 db.createCollection("microphones");
 db.createCollection("movements");
diff --git a/src/Prototypes/hmi/ui/config/db.config.js b/src/Prototypes/hmi/ui/config/db.config.js
index f757cd74c..6c8de01b4 100644
--- a/src/Prototypes/hmi/ui/config/db.config.js
+++ b/src/Prototypes/hmi/ui/config/db.config.js
@@ -1,8 +1,8 @@
 module.exports = {
-  USERNAME: "modelUser",
-  PASSWORD: "EchoNetAccess2023",
-  HOST: "ts-mongodb-cont",
-  PORT: 27017,
-  DB: "UserSample"
-  };
+  USERNAME: process.env.MONGODB_USER || "",
+  PASSWORD: process.env.MONGODB_PASS || "",
+  HOST: process.env.MONGODB_HOST || "localhost",
+  PORT: parseInt(process.env.MONGODB_PORT || "27017", 10),
+  DB: process.env.MONGODB_DB || "UserSample"
+};